# Patch summary (free text before the first "diff --git" header; patch tools skip it).
#
# Purpose: move five private URL helpers out of
# src/rights_filter/server/sqlite_store.py into the new module
# src/rights_filter/server/store_url_utils.py and import them back by name.
# The removed (-) and added (+) function bodies are token-for-token identical,
# so this is a pure move with no behavior change.
#
# NOTE(review): line breaks and indentation were reconstructed from a
# whitespace-collapsed copy of this patch. Hunk line counts were re-checked
# (6 context + 7 added; 6 context + 38 removed; 52 added), but the leading
# whitespace of context lines and the nesting of
# `normalized = normalized.split("+", 1)[0]` are assumed -- confirm against
# the original commit before applying.
# NOTE(review): the helpers keep their leading underscore even though they are
# now imported across modules; consider a follow-up that gives them public names.
diff --git a/src/rights_filter/server/sqlite_store.py b/src/rights_filter/server/sqlite_store.py
index fdf21de..b2d4090 100644
--- a/src/rights_filter/server/sqlite_store.py
+++ b/src/rights_filter/server/sqlite_store.py
@@ -45,6 +45,13 @@ from rights_filter.integrations.env_clients import ProviderRuntime, build_provid
 from rights_filter.integrations.external_policy import ExternalApiPolicy
 from rights_filter.jobs.batch_analyzer import BatchAnalyzer, SubmissionImage
 from rights_filter.server.image_store import LocalSubmissionImageStore, SUPPORTED_IMAGE_SUFFIXES
+from rights_filter.server.store_url_utils import (
+    _decoded_nested_url,
+    _is_http_url,
+    _url_has_image_format_hint,
+    _url_looks_like_image,
+    _url_path_has_image_suffix,
+)
 
 
 EVIDENCE_OPERATOR_STATUSES = {
@@ -4947,44 +4954,6 @@ def _decoded_url_reference(value: str) -> str:
     return raw
 
 
-def _decoded_nested_url(value: str) -> str:
-    candidate = str(value).strip()
-    for _ in range(3):
-        decoded = unquote(candidate).strip()
-        if decoded == candidate:
-            break
-        candidate = decoded
-    return candidate
-
-
-def _is_http_url(url: str) -> bool:
-    parsed = urlparse(url)
-    return parsed.scheme in {"http", "https"} and bool(parsed.netloc)
-
-
-def _url_path_has_image_suffix(url: str) -> bool:
-    return Path(urlparse(url).path).suffix.lower() in SUPPORTED_IMAGE_SUFFIXES
-
-
-def _url_has_image_format_hint(url: str) -> bool:
-    image_formats = {suffix.lstrip(".") for suffix in SUPPORTED_IMAGE_SUFFIXES}
-    image_format_keys = {"format", "fm", "ext", "extension", "mime", "output", "type"}
-    for key, hint in parse_qsl(urlparse(url).query, keep_blank_values=False):
-        if key.lower().replace("-", "_") not in image_format_keys:
-            continue
-        normalized = hint.lower().split(";", 1)[0].strip().lstrip(".")
-        if normalized.startswith("image/"):
-            normalized = normalized.split("/", 1)[1]
-        normalized = normalized.split("+", 1)[0]
-        if normalized in image_formats:
-            return True
-    return False
-
-
-def _url_looks_like_image(url: str) -> bool:
-    return _url_path_has_image_suffix(url) or _url_has_image_format_hint(url)
-
-
 def _submission_payload(
     record: dict[str, Any],
     score: int,
diff --git a/src/rights_filter/server/store_url_utils.py b/src/rights_filter/server/store_url_utils.py
new file mode 100644
index 0000000..c020876
--- /dev/null
+++ b/src/rights_filter/server/store_url_utils.py
@@ -0,0 +1,52 @@
+"""URL helpers used by the SQLite store's evidence/image handling.
+
+Extracted from sqlite_store.py to keep that module focused. Behavior is
+unchanged. Note these intentionally treat the local-submission suffix set
+(SUPPORTED_IMAGE_SUFFIXES, which includes .svg) as image-like; the integration
+adapters keep their own stricter suffix policy for external results.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+from urllib.parse import parse_qsl, unquote, urlparse
+
+from rights_filter.server.image_store import SUPPORTED_IMAGE_SUFFIXES
+
+
+def _decoded_nested_url(value: str) -> str:
+    candidate = str(value).strip()
+    for _ in range(3):
+        decoded = unquote(candidate).strip()
+        if decoded == candidate:
+            break
+        candidate = decoded
+    return candidate
+
+
+def _is_http_url(url: str) -> bool:
+    parsed = urlparse(url)
+    return parsed.scheme in {"http", "https"} and bool(parsed.netloc)
+
+
+def _url_path_has_image_suffix(url: str) -> bool:
+    return Path(urlparse(url).path).suffix.lower() in SUPPORTED_IMAGE_SUFFIXES
+
+
+def _url_has_image_format_hint(url: str) -> bool:
+    image_formats = {suffix.lstrip(".") for suffix in SUPPORTED_IMAGE_SUFFIXES}
+    image_format_keys = {"format", "fm", "ext", "extension", "mime", "output", "type"}
+    for key, hint in parse_qsl(urlparse(url).query, keep_blank_values=False):
+        if key.lower().replace("-", "_") not in image_format_keys:
+            continue
+        normalized = hint.lower().split(";", 1)[0].strip().lstrip(".")
+        if normalized.startswith("image/"):
+            normalized = normalized.split("/", 1)[1]
+        normalized = normalized.split("+", 1)[0]
+        if normalized in image_formats:
+            return True
+    return False
+
+
+def _url_looks_like_image(url: str) -> bool:
+    return _url_path_has_image_suffix(url) or _url_has_image_format_hint(url)