refactor: extract text helpers and HTML/CSS image-scraping from sqlite_store

Move the pure text helpers (_text_list, _unique_texts) into store_text and the ~950-line page/CSS/JSON/srcset image-URL extraction (the _PageImageParser and its helpers) into store_page_scrape. Both moves are behavior-preserving; store_page_scrape depends only on stdlib + url/text helpers + domain Evidence (no store coupling). sqlite_store.py shrinks from 4955 to 3992 lines.
2026-06-20 21:10:22 +09:00 · 2026-06-20 21:10:22 +09:00 · e3bc99e6b9
commit e3bc99e6b9
parent bd35cf6f3f
4 changed files with 1015 additions and 974 deletions
--- a/src/rights_filter/server/sqlite_store.py
+++ b/src/rights_filter/server/sqlite_store.py
--- a/src/rights_filter/server/store_page_scrape.py
+++ b/src/rights_filter/server/store_page_scrape.py
@ -0,0 +1,976 @@
 """HTML/CSS/JSON image-URL extraction for external search-result pages.
 Extracted from sqlite_store.py (the ~950-line "URL scraping" responsibility the
 architecture review flagged as not belonging in a persistence store). Pure
 parsing/normalization of fetched page content; behavior unchanged. Depends only
 on stdlib, the URL/text helpers, and domain Evidence — never on the store class.
 """
 from __future__ import annotations
 import html
 import json
 import re
 from html.parser import HTMLParser
 from typing import Any
 from urllib.parse import parse_qsl, urljoin, urlparse
 from rights_filter.analysis.fingerprints import FingerprintService
 from rights_filter.domain.records import Evidence
 from rights_filter.server.store_text import _unique_texts
 from rights_filter.server.store_url_utils import (
    _decoded_nested_url,
    _is_http_url,
    _url_looks_like_image,
 )
 class _PageImageParser(HTMLParser):
    """Collect candidate image and stylesheet references while parsing HTML.

    Feeding a document fills three lists:

    * ``priority_urls`` - references from ``<meta>`` image properties,
      ``<link>`` image hints, ``<source>`` srcsets, ``poster`` attributes,
      recognised ``<script>`` bodies and ``<img>`` tags judged to be primary.
    * ``image_urls`` - every other image-like reference (inline styles,
      ``data-*`` attributes, image-looking anchors, remaining ``<img>`` tags).
    * ``stylesheet_urls`` - ``href`` values of stylesheet ``<link>`` tags.

    No base-URL resolution is performed by this class.
    """
    def __init__(self, parse_noscript: bool = True) -> None:
        """Initialise the result lists and the script/style/noscript buffers.

        Args:
            parse_noscript: When true, text inside ``<noscript>`` is buffered
                and re-parsed by a nested parser once the element closes.
        """
        super().__init__()
        self.parse_noscript = parse_noscript
        self.priority_urls: list[str] = []
        self.image_urls: list[str] = []
        self.stylesheet_urls: list[str] = []
        # Text of the script currently being buffered.
        self._script_chunks: list[str] = []
        # "" when no recognised script is open, else "json_ld", "json" or "javascript".
        self._collecting_script_type = ""
        self._style_chunks: list[str] = []
        self._collecting_style = False
        self._noscript_chunks: list[str] = []
        self._collecting_noscript = False
    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Record image/stylesheet references carried by an opening tag.

        ``noscript``, ``style`` and ``script`` only switch on buffering and
        return; all other tags run through the generic attribute checks and
        then a tag-specific branch (``meta``, ``link``, ``source``, image tags).
        """
        # Attribute names are lower-cased; valueless attributes become "".
        attr = {name.lower(): (value or "") for name, value in attrs}
        tag_name = tag.lower()
        if tag_name == "noscript" and self.parse_noscript:
            # Start buffering; the text is re-parsed in handle_endtag.
            self._collecting_noscript = True
            self._noscript_chunks = []
            return
        if tag_name == "style":
            self._collecting_style = True
            self._style_chunks = []
            return
        if tag_name == "script":
            # Remember which extractor handle_endtag should apply to the body.
            # Any other type leaves the collecting flag untouched.
            script_type = attr.get("type", "").lower()
            if "ld+json" in script_type:
                self._collecting_script_type = "json_ld"
                self._script_chunks = []
            elif script_type in {"application/json", "application/problem+json", "application/activity+json"}:
                self._collecting_script_type = "json"
                self._script_chunks = []
            elif script_type in {"", "text/javascript", "application/javascript", "module"}:
                self._collecting_script_type = "javascript"
                self._script_chunks = []
            return
        # Generic checks that apply to every remaining tag.
        if attr.get("style"):
            self.image_urls.extend(_css_url_image_urls(attr["style"]))
        if attr.get("poster") and _looks_like_image_reference(attr["poster"]):
            self.priority_urls.append(attr["poster"])
        # Allow-list of data-* attributes that are treated as image-bearing.
        for name in (
            "data-bg",
            "data-bg-url",
            "data-background",
            "data-background-image",
            "data-background-url",
            "data-lazy-background",
            "data-image",
            "data-image-src",
            "data-image-url",
            "data-img",
            "data-img-src",
            "data-img-url",
            "data-src",
            "data-src-large",
            "data-fallback-src",
            "data-full",
            "data-full-src",
            "data-full-image",
            "data-full-url",
            "data-hires",
            "data-highres",
            "data-large",
            "data-large-src",
            "data-large-image",
            "data-large-url",
            "data-original",
            "data-original-image",
            "data-lazy-src",
            "data-original-src",
            "data-original-url",
            "data-photo",
            "data-photo-url",
            "data-thumb",
            "data-thumb-url",
            "data-thumbnail",
            "data-thumbnail-url",
            "data-zoom-image",
        ):
            if attr.get(name):
                self.image_urls.extend(
                    _data_attribute_image_urls(name, attr[name], known_image_attr=True)
                )
        # Second pass over every attribute with the stricter default rules;
        # names that are not data-* yield nothing. May repeat URLs found above.
        for name, value in attr.items():
            self.image_urls.extend(_data_attribute_image_urls(name, value))
        if tag_name in {"a", "area"} and attr.get("href") and _looks_like_image_reference(attr["href"]):
            self.image_urls.append(attr["href"])
        if tag_name == "meta":
            # The key may come from property=, name= or itemprop=.
            key = (attr.get("property") or attr.get("name") or attr.get("itemprop") or "").lower()
            if key in {
                "image",
                "contenturl",
                "thumbnail",
                "thumbnailurl",
                "og:image",
                "og:image:url",
                "og:image:secure_url",
                "twitter:image",
                "twitter:image:src",
                "twitter:image:url",
            }:
                # content is appended as-is, so it may be an empty string.
                self.priority_urls.append(attr.get("content", ""))
            return
        if tag_name == "link":
            rel = attr.get("rel", "").lower()
            rel_key = rel.replace("-", "_")
            as_value = attr.get("as", "").lower()
            if "stylesheet" in rel and attr.get("href"):
                self.stylesheet_urls.append(attr["href"])
            # rel="image_src" or an image preload/prefetch hint.
            if "image_src" in rel_key or (as_value == "image" and any(token in rel for token in ("preload", "prefetch"))):
                if attr.get("href"):
                    self.priority_urls.append(attr["href"])
                for name in ("imagesrcset", "image-srcset"):
                    if attr.get(name):
                        self.priority_urls.extend(_srcset_image_urls(attr[name]))
            return
        if tag_name == "source":
            for name in ("srcset", "data-original-srcset", "data-lazy-srcset", "data-srcset"):
                if attr.get(name):
                    self.priority_urls.extend(_srcset_image_urls(attr[name]))
            return
        if tag_name in {"img", "amp-img", "amp-anim"}:
            # Likely-primary images go to priority_urls, the rest to image_urls.
            target_urls = self.priority_urls if _is_likely_primary_image_attrs(attr) else self.image_urls
            for name in ("data-original-srcset", "data-lazy-srcset", "data-srcset", "srcset"):
                if attr.get(name):
                    target_urls.extend(_srcset_image_urls(attr[name]))
            # Only the first populated source attribute is recorded.
            for name in ("data-original", "data-original-src", "data-lazy-src", "data-src", "src"):
                if attr.get(name):
                    target_urls.append(attr[name])
                    return
    def handle_data(self, data: str) -> None:
        """Append text to whichever script/style/noscript buffers are active."""
        if self._collecting_script_type:
            self._script_chunks.append(data)
        if self._collecting_style:
            self._style_chunks.append(data)
        if self._collecting_noscript:
            self._noscript_chunks.append(data)
    def handle_endtag(self, tag: str) -> None:
        """Flush the buffer that belongs to a closing noscript/style/script tag."""
        tag_name = tag.lower()
        if tag_name == "noscript" and self._collecting_noscript:
            self._collecting_noscript = False
            # Re-parse the buffered text as HTML; the nested parser has
            # noscript handling disabled.
            parser = _PageImageParser(parse_noscript=False)
            parser.feed(html.unescape("".join(self._noscript_chunks)))
            self.priority_urls.extend(parser.priority_urls)
            self.image_urls.extend(parser.image_urls)
            self.stylesheet_urls.extend(parser.stylesheet_urls)
            self._noscript_chunks = []
            return
        if tag_name == "style" and self._collecting_style:
            self._collecting_style = False
            self.image_urls.extend(_css_url_image_urls("".join(self._style_chunks)))
            self._style_chunks = []
            return
        if tag_name != "script" or not self._collecting_script_type:
            return
        # Reset script state before dispatching to the matching extractor.
        script_type = self._collecting_script_type
        self._collecting_script_type = ""
        script_content = "".join(self._script_chunks)
        self._script_chunks = []
        if script_type == "json_ld":
            self.priority_urls.extend(_json_ld_image_urls(script_content))
        elif script_type == "json":
            self.priority_urls.extend(_json_script_image_urls(script_content))
        elif script_type == "javascript":
            self.priority_urls.extend(_javascript_image_urls(script_content))
 def _srcset_image_urls(value: str) -> list[str]:
    """Return the URLs of a srcset string, best descriptor first.

    Candidates are ranked by descending descriptor score; ties keep their
    original order.
    """
    ranked: list[tuple[float, int, str]] = []
    for position, chunk in enumerate(_split_srcset_candidates(str(value))):
        tokens = chunk.split()
        if not tokens:
            continue
        descriptor = tokens[1] if len(tokens) > 1 else ""
        # Negated score so a plain ascending sort yields highest-first.
        ranked.append((-_srcset_descriptor_score(descriptor), position, tokens[0]))
    ranked.sort(key=lambda entry: (entry[0], entry[1]))
    return [url for _, _, url in ranked]
 def _split_srcset_candidates(value: str) -> list[str]:
    """Split a srcset string on the commas that separate candidates.

    A comma only counts as a separator when the text after it starts a new
    candidate, so commas embedded in a URL stay inside that candidate.
    """
    pieces: list[str] = []
    cursor = 0
    comma_at = value.find(",")
    while comma_at != -1:
        if _starts_srcset_candidate(value[comma_at + 1 :].lstrip()):
            pieces.append(value[cursor:comma_at])
            cursor = comma_at + 1
        comma_at = value.find(",", comma_at + 1)
    pieces.append(value[cursor:])
    return pieces
 def _starts_srcset_candidate(value: str) -> bool:
    """Report whether the first whitespace-delimited token looks like a URL."""
    tokens = str(value).split(None, 1)
    if not tokens:
        return False
    head = tokens[0]
    return _is_urlish_reference(head) or _is_scheme_less_remote_image_url(head)
 def _srcset_descriptor_score(value: str) -> float:
    descriptor = value.strip().lower()
    if descriptor.endswith("w"):
        try:
            return float(descriptor[:-1])
        except ValueError:
            return 0.0
    if descriptor.endswith("x"):
        try:
            return float(descriptor[:-1]) * 1000
        except ValueError:
            return 0.0
    return 0.0
 def _is_generic_data_image_attr(name: str, value: str) -> bool:
    """Report whether a ``data-*`` attribute yields at least one image URL."""
    extracted = _data_attribute_image_urls(name, value)
    return bool(extracted)
 def _data_attribute_image_urls(
    name: str,
    value: str,
    *,
    known_image_attr: bool = False,
 ) -> list[str]:
    """Extract image URL candidates from one ``data-*`` attribute.

    Args:
        name: Attribute name; ``-`` and ``:`` are treated like ``_``.
        value: Raw attribute value (HTML entities are unescaped).
        known_image_attr: True when the caller already knows the attribute
            carries images, which permits plain URL keys in JSON payloads.

    Returns:
        A list of URL strings; empty for non ``data-*`` names, empty values
        and ``data:`` URIs.
    """
    normalized_name = str(name).lower().replace("-", "_").replace(":", "_")
    payload = html.unescape(str(value).strip())
    if not normalized_name.startswith("data_"):
        return []
    if not payload or payload.lower().startswith("data:"):
        return []
    if _is_srcset_attr_name(normalized_name):
        return _srcset_image_urls(payload)
    if _looks_like_image_reference(payload) or _is_urlish_reference(payload):
        return [payload]
    # Remaining values are only worth parsing as JSON when the attribute is
    # image-related or the text itself looks like JSON.
    trusted = known_image_attr or _is_image_data_attr_name(normalized_name)
    if not (trusted or _looks_like_json_attribute_text(payload)):
        return []
    return _json_attribute_image_urls(payload, allow_plain_url_keys=trusted)
 def _is_image_data_attr_name(attr_name: str) -> bool:
    image_tokens = (
        "avatar",
        "background",
        "bg",
        "image",
        "img",
        "photo",
        "picture",
        "poster",
        "thumb",
        "thumbnail",
    )
    return any(token in attr_name for token in image_tokens)
 def _json_attribute_image_urls(
    value: str,
    *,
    allow_plain_url_keys: bool = True,
 ) -> list[str]:
    """Extract de-duplicated image URL candidates from a JSON attribute value.

    Args:
        value: Attribute text that is, or decodes to, a JSON object/array.
        allow_plain_url_keys: When true, generic URL keys (``url``, ``src``,
            ...) and bare strings are accepted everywhere; when false, only
            values under image-like or srcset keys are harvested.

    Returns:
        Unique URL strings in discovery order; empty when the value is not
        JSON or cannot be parsed.
    """
    json_text = _json_attribute_text(value)
    if not json_text:
        return []
    try:
        document = json.loads(json_text)
    except Exception:
        return []
    found: list[str] = []
    def harvest(node: Any) -> None:
        # Permissive walk: strings are candidates, URL/image keys are followed.
        if isinstance(node, str):
            found.extend(_json_image_string_candidates(node))
        elif isinstance(node, list):
            for element in node:
                harvest(element)
        elif isinstance(node, dict):
            for raw_key, child in node.items():
                key = str(raw_key)
                if _is_srcset_key(key):
                    found.extend(_srcset_image_urls(str(child)))
                elif (
                    _is_json_url_value_key(key)
                    or _is_likely_json_image_key(key)
                    or isinstance(child, (dict, list))
                ):
                    harvest(child)
    def scan(node: Any) -> None:
        # Strict walk: only image-like keys switch to the permissive walk.
        if isinstance(node, list):
            for element in node:
                scan(element)
        elif isinstance(node, dict):
            for raw_key, child in node.items():
                key = str(raw_key)
                if _is_srcset_key(key):
                    found.extend(_srcset_image_urls(str(child)))
                elif _is_likely_json_image_key(key):
                    harvest(child)
                else:
                    scan(child)
    walker = harvest if allow_plain_url_keys else scan
    walker(document)
    return _unique_texts(found)
 def _looks_like_json_attribute_text(value: str) -> bool:
    """Report whether the value is, or decodes to, JSON object/array text."""
    json_text = _json_attribute_text(value)
    return bool(json_text)
 def _json_attribute_text(value: str) -> str:
    """Return the JSON object/array text carried by an attribute, or ``""``.

    The HTML-unescaped value is used when it starts with ``{`` or ``[``;
    otherwise its nested-URL-decoded form is tried.
    """
    openers = ("{", "[")
    unescaped = html.unescape(str(value).strip())
    if unescaped.startswith(openers):
        return unescaped
    nested = _decoded_nested_url(unescaped)
    return nested if nested and nested.startswith(openers) else ""
 def _is_srcset_attr_name(name: str) -> bool:
    normalized = str(name).lower().replace("-", "_").replace(":", "_")
    return "srcset" in normalized or "src_set" in normalized
 def _is_urlish_reference(value: str) -> bool:
    """Report whether the text is an HTTP URL, a path-style reference or image-like."""
    candidate = str(value).strip()
    path_prefixes = ("/", "//", "./", "../")
    return (
        _is_http_url(candidate)
        or candidate.startswith(path_prefixes)
        or _url_looks_like_image(candidate)
    )
 def _json_ld_image_urls(script_content: str) -> list[str]:
    try:
        document = json.loads(script_content)
    except Exception:
        return []
    urls: list[str] = []
    def collect_image_value(value: Any) -> None:
        if isinstance(value, str):
            urls.append(value)
            return
        if isinstance(value, list):
            for item in value:
                collect_image_value(item)
            return
        if isinstance(value, dict):
            for key in ("contentUrl", "url", "thumbnailUrl"):
                if key in value:
                    collect_image_value(value[key])
    def visit(value: Any) -> None:
        if isinstance(value, list):
            for item in value:
                visit(item)
            return
        if not isinstance(value, dict):
            return
        for key, child in value.items():
            if str(key).lower() in {
                "image",
                "thumbnail",
                "thumbnailurl",
                "contenturl",
                "primaryimageofpage",
                "associatedmedia",
            }:
                collect_image_value(child)
            else:
                visit(child)
    visit(document)
    return urls
 def _json_script_image_urls(script_content: str) -> list[str]:
    """Return image URL candidates found in a JSON script body.

    The document is searched for srcset keys and image-like keys; below an
    image-like key, generic URL keys and bare strings are accepted too.
    Unparseable input yields an empty list. The result is not de-duplicated.
    """
    try:
        document = json.loads(script_content)
    except Exception:
        return []
    found: list[str] = []
    def harvest(node: Any) -> None:
        # Permissive walk used underneath an image-like key.
        if isinstance(node, str):
            found.extend(_json_image_string_candidates(node))
        elif isinstance(node, list):
            for element in node:
                harvest(element)
        elif isinstance(node, dict):
            for raw_key, child in node.items():
                key = str(raw_key)
                if _is_srcset_key(key):
                    found.extend(_srcset_image_urls(str(child)))
                elif (
                    _is_json_url_value_key(key)
                    or _is_likely_json_image_key(key)
                    or isinstance(child, (dict, list))
                ):
                    harvest(child)
    def scan(node: Any) -> None:
        # Strict walk from the root: look for srcset and image-like keys.
        if isinstance(node, list):
            for element in node:
                scan(element)
        elif isinstance(node, dict):
            for raw_key, child in node.items():
                key = str(raw_key)
                if _is_srcset_key(key):
                    found.extend(_srcset_image_urls(str(child)))
                elif _is_likely_json_image_key(key):
                    harvest(child)
                else:
                    scan(child)
    scan(document)
    return found
 def _is_likely_json_image_key(key: str) -> bool:
    """Report whether a JSON key name suggests its value is an image reference.

    A key matches when it contains ``image`` or ``thumbnail``, is a srcset
    key, or equals one of a fixed set of media key names. Both the plain
    lower-cased key and its camelCase-to-snake_case folding are checked, so
    ``photoUrl`` is recognised just like ``photo_url``.
    """
    text = str(key)
    lowered = text.lower().replace("-", "_")
    # Same camelCase folding that _is_json_url_value_key applies; without it
    # photoUrl/posterUrl/avatarUrl/assetUrl were missed while their
    # snake_case spellings matched.
    snake = (
        re.sub(r"(?<!^)(?=[A-Z])", "_", text)
        .lower()
        .replace("-", "_")
        .replace("__", "_")
    )
    exact_names = {
        "asset_url",
        "content_url",
        "contenturl",
        "media_url",
        "mediaurl",
        "photo",
        "photo_url",
        "poster",
        "poster_url",
        "avatar",
        "avatar_url",
    }
    return any(
        "image" in form
        or "thumbnail" in form
        or _is_srcset_key(form)
        or form in exact_names
        for form in (lowered, snake)
    )
 def _is_json_url_value_key(key: str) -> bool:
    normalized = re.sub(r"(?<!^)(?=[A-Z])", "_", str(key)).lower().replace("-", "_")
    normalized = normalized.replace("__", "_")
    return normalized in {
        "content_url",
        "contenturl",
        "download_url",
        "file_url",
        "href",
        "original_url",
        "public_url",
        "secure_url",
        "src",
        "thumbnail_url",
        "thumbnailurl",
        "url",
    }
 def _json_image_string_candidates(value: str) -> list[str]:
    """Return the string (or its nested-URL-decoded form) if it looks like a URL.

    The decoded form is preferred when it differs from the raw text; the
    result holds at most one entry.
    """
    original = str(value).strip()
    unwrapped = _decoded_nested_url(original)
    options = [original] if unwrapped == original else [unwrapped, original]
    for option in options:
        if _looks_like_image_reference(option) or _is_urlish_reference(option):
            return [option]
    return []
 def _is_srcset_key(key: str) -> bool:
    normalized = str(key).lower().replace("-", "_")
    return normalized in {
        "image_src_set",
        "image_srcset",
        "imagesrcset",
        "photo_src_set",
        "photo_srcset",
        "picture_src_set",
        "picture_srcset",
        "src_set",
        "srcset",
        "thumbnail_src_set",
        "thumbnail_srcset",
    }
 def _javascript_image_urls(script_content: str) -> list[str]:
    """Return de-duplicated image URLs found by pattern-matching inline JavaScript.

    Three assignment shapes are recognised: ``<srcset-key>: "<srcset>"``,
    ``<image-key>: "<url>"`` and ``<image-key>: { ... "url": "<url>" }``.
    Srcset values are expanded; the other values are kept only when they
    look like an image reference.
    """
    image_key = r"[\w$:-]*(?:image|thumbnail|photo|avatar|poster|picture)[\w$:-]*"
    srcset_key = r"[\w$:-]*(?:srcset|src-set)[\w$:-]*"
    srcset_pattern = re.compile(
        rf"""(?is)["']?({srcset_key})["']?\s*[:=]\s*["']([^"']+)["']"""
    )
    key_value_pattern = re.compile(
        rf"""(?is)["']?({image_key})["']?\s*[:=]\s*["']([^"']+)["']"""
    )
    nested_value_pattern = re.compile(
        rf"""(?is)["']?({image_key})["']?\s*[:=]\s*\{{[^{{}}]{{0,500}}?["'](?:url|src|contentUrl|thumbnailUrl)["']\s*:\s*["']([^"']+)["']"""
    )
    found: list[str] = [
        url
        for hit in srcset_pattern.finditer(script_content)
        for url in _srcset_image_urls(_decode_javascript_string(hit.group(2)))
    ]
    for pattern in (key_value_pattern, nested_value_pattern):
        for hit in pattern.finditer(script_content):
            decoded = _decode_javascript_string(hit.group(2))
            if _looks_like_image_reference(decoded):
                found.append(decoded)
    return _unique_texts(found)
 def _decode_javascript_string(value: str) -> str:
    text = value.replace("\\/", "/")
    if "\\u" not in text and "\\x" not in text:
        return text
    def _replace_escape(match: re.Match[str]) -> str:
        try:
            return chr(int(match.group(1) or match.group(2), 16))
        except (TypeError, ValueError):
            return match.group(0)
    # Decode only explicit \uXXXX / \xXX escapes. The previous
    # bytes(text, "utf-8").decode("unicode_escape") reinterpreted real UTF-8
    # bytes as Latin-1, silently corrupting literal non-ASCII (e.g. Korean) URLs.
    return re.sub(r"\\u([0-9a-fA-F]{4})|\\x([0-9a-fA-F]{2})", _replace_escape, text)
 def _looks_like_image_reference(value: str) -> bool:
    text = value.strip()
    if not text or text.lower().startswith("data:"):
        return False
    if _unwrapped_image_url(text):
        return True
    if _relative_wrapped_image_url(text):
        return True
    return _url_looks_like_image(text)
 def _relative_wrapped_image_url(value: str) -> str:
    parsed = urlparse(value)
    if parsed.scheme or parsed.netloc:
        return ""
    for key, raw_value in parse_qsl(parsed.query, keep_blank_values=False):
        key_text = key.lower().replace("-", "_")
        if key_text not in {
            "imgurl",
            "imageurl",
            "image_url",
            "mediaurl",
            "media_url",
            "contenturl",
            "content_url",
            "photo",
            "photo_url",
            "src",
            "source",
            "image",
            "img",
            "url",
            "u",
        }:
            continue
        candidate = _decoded_nested_url(raw_value)
        if candidate.startswith("/") or _url_looks_like_image(candidate):
            return candidate
    return ""
 def _is_likely_primary_image_attrs(attr: dict[str, str]) -> bool:
    text = " ".join(
        str(attr.get(name, ""))
        for name in (
            "alt",
            "aria-label",
            "class",
            "data-image-type",
            "data-role",
            "id",
            "itemprop",
            "src",
            "data-src",
            "data-original",
            "data-lazy-src",
            "data-original-src",
        )
    ).casefold()
    negative_tokens = (
        "advert",
        "avatar",
        "badge",
        "banner",
        "button",
        "emoji",
        "favicon",
        "icon",
        "logo",
        "sprite",
        "tracking",
    )
    if any(token in text for token in negative_tokens):
        return False
    positive_tokens = (
        "article",
        "cover",
        "full",
        "hero",
        "main",
        "official",
        "photo",
        "picture",
        "portrait",
        "primary",
        "profile",
        "representative",
        "thumbnail",
    )
    if any(token in text for token in positive_tokens):
        return True
    width = _numeric_attr(attr.get("width", ""))
    height = _numeric_attr(attr.get("height", ""))
    loading = attr.get("loading", "").casefold()
    fetchpriority = attr.get("fetchpriority", "").casefold()
    return (
        width >= 300
        and height >= 300
        and (fetchpriority == "high" or loading != "lazy")
    )
 def _numeric_attr(value: str) -> int:
    match = re.search(r"\d+", str(value))
    if not match:
        return 0
    try:
        return int(match.group(0))
    except ValueError:
        return 0
 def _css_url_image_urls(style: str) -> list[str]:
    """Return unique URLs referenced by CSS text.

    ``image-set()`` candidates come first (best descriptor first), followed
    by every non-empty ``url(...)`` target in source order.
    """
    url_targets: list[str] = []
    for found in re.finditer(r"url\(\s*(['\"]?)(.*?)\1\s*\)", style, flags=re.IGNORECASE):
        target = found.group(2).strip()
        if target:
            url_targets.append(target)
    return _unique_texts([*_css_image_set_urls(style), *url_targets])
 def _css_image_set_urls(style: str) -> list[str]:
    """Return URLs from all ``image-set()`` calls, highest resolution first.

    Candidates from every call are pooled and ranked by descriptor score;
    equal scores keep source order.
    """
    ranked: list[tuple[float, int, str]] = []
    for body in _css_image_set_bodies(style):
        for piece in _split_top_level_commas(body):
            url, descriptor = _css_image_set_candidate(piece)
            if url:
                # len(ranked) doubles as the running insertion index.
                ranked.append(
                    (-_css_image_set_descriptor_score(descriptor), len(ranked), url)
                )
    ranked.sort(key=lambda entry: (entry[0], entry[1]))
    return [url for _, _, url in ranked]
 def _css_image_set_bodies(style: str) -> list[str]:
    bodies: list[str] = []
    for match in re.finditer(r"(?:-webkit-)?image-set\s*\(", style, flags=re.IGNORECASE):
        start = match.end()
        depth = 1
        quote = ""
        escaped = False
        for index in range(start, len(style)):
            character = style[index]
            if quote:
                if escaped:
                    escaped = False
                elif character == "\\":
                    escaped = True
                elif character == quote:
                    quote = ""
                continue
            if character in {"'", '"'}:
                quote = character
                continue
            if character == "(":
                depth += 1
                continue
            if character == ")":
                depth -= 1
                if depth == 0:
                    bodies.append(style[start:index])
                    break
    return bodies
 def _split_top_level_commas(value: str) -> list[str]:
    parts: list[str] = []
    start = 0
    depth = 0
    quote = ""
    escaped = False
    for index, character in enumerate(value):
        if quote:
            if escaped:
                escaped = False
            elif character == "\\":
                escaped = True
            elif character == quote:
                quote = ""
            continue
        if character in {"'", '"'}:
            quote = character
            continue
        if character == "(":
            depth += 1
            continue
        if character == ")":
            depth = max(0, depth - 1)
            continue
        if character == "," and depth == 0:
            parts.append(value[start:index].strip())
            start = index + 1
    parts.append(value[start:].strip())
    return [part for part in parts if part]
 def _css_image_set_candidate(value: str) -> tuple[str, str]:
    url_match = re.search(r"url\(\s*(['\"]?)(.*?)\1\s*\)", value, flags=re.IGNORECASE)
    if url_match:
        return url_match.group(2).strip(), value[url_match.end() :]
    quoted_match = re.search(r"""(['"])(.*?)\1""", value)
    if quoted_match:
        return quoted_match.group(2).strip(), value[quoted_match.end() :]
    parts = value.split(None, 1)
    if parts and _looks_like_image_reference(parts[0]):
        return parts[0].strip(), parts[1] if len(parts) > 1 else ""
    return "", ""
 def _css_image_set_descriptor_score(value: str) -> float:
    descriptor = value.strip().lower()
    match = re.search(r"([0-9]*\.?[0-9]+)\s*(dppx|dpi|x|w)\b", descriptor)
    if not match:
        return 0.0
    number = float(match.group(1))
    unit = match.group(2)
    if unit == "w":
        return number
    if unit == "dpi":
        return (number / 96) * 1000
    return number * 1000
 def _extract_page_image_urls(content: bytes, base_url: str, limit: int) -> list[str]:
    if limit <= 0:
        return []
    return [
        url
        for url in _page_image_references(content, base_url)[0]
        if _is_http_url(url)
    ][:limit]
 def _extract_page_stylesheet_urls(content: bytes, base_url: str, limit: int) -> list[str]:
    if limit <= 0:
        return []
    return [
        url
        for url in _page_image_references(content, base_url)[1]
        if _is_http_url(url)
    ][:limit]
 def _extract_css_image_urls(content: bytes, base_url: str, limit: int) -> list[str]:
    if limit <= 0:
        return []
    return [
        url
        for url in _unique_texts(
            _normalized_image_url(base_url, url)
            for url in _css_url_image_urls(content.decode("utf-8", errors="replace"))
        )
        if _is_http_url(url)
    ][:limit]
 def _page_image_references(content: bytes, base_url: str) -> tuple[list[str], list[str]]:
    """Parse an HTML page and return ``(image_urls, stylesheet_urls)``.

    The bytes are decoded as UTF-8 with replacement. Both lists are
    normalised against ``base_url``, de-duplicated and limited to HTTP URLs;
    priority image references precede the remaining ones.
    """
    def resolved(references: list[str]) -> list[str]:
        # Normalise, de-duplicate, then keep only HTTP URLs.
        absolute = _unique_texts(
            _normalized_image_url(base_url, reference) for reference in references
        )
        return [candidate for candidate in absolute if _is_http_url(candidate)]
    parser = _PageImageParser()
    parser.feed(content.decode("utf-8", errors="replace"))
    image_urls = resolved([*parser.priority_urls, *parser.image_urls])
    stylesheet_urls = resolved(parser.stylesheet_urls)
    return image_urls, stylesheet_urls
 def _content_has_comparable_image_fingerprint(content: bytes) -> bool:
    """Report whether a usable perceptual fingerprint can be computed.

    Returns False when fingerprinting raises or when the perceptual value
    carries the ``phash:unavailable:`` prefix.
    """
    try:
        perceptual = FingerprintService().fingerprints_for(content).perceptual
    except Exception:
        return False
    if perceptual.startswith("phash:unavailable:"):
        return False
    return True
 def _search_result_direct_image_urls(source_evidence: Evidence) -> list[str]:
    """Return the image URL a search result points at directly, if any.

    The result URL is read from ``data["result_url"]``, falling back to
    ``data["url"]``. A wrapped image URL in its query string wins; otherwise
    the URL itself is returned when it is an HTTP URL that looks like an
    image. The list holds at most one entry.
    """
    data = source_evidence.data
    result_url = str(data.get("result_url", data.get("url", "")))
    wrapped = _unwrapped_image_url(result_url)
    if wrapped:
        return [wrapped]
    is_direct_image = _is_http_url(result_url) and _url_looks_like_image(result_url)
    return [result_url] if is_direct_image else []
 def _normalized_image_url(base_url: str, url: str) -> str:
    """Resolve an image reference against ``base_url`` and unwrap redirects.

    Empty references and ``data:`` URIs yield ``""``. Scheme-less remote
    references are given an ``https://`` prefix before joining.
    """
    reference = _decoded_url_reference(str(url).strip())
    if not reference or reference.lower().startswith("data:"):
        return ""
    if _is_scheme_less_remote_image_url(reference):
        reference = f"https://{reference.lstrip('/')}"
    absolute = urljoin(base_url, reference)
    unwrapped = _unwrapped_image_url(absolute)
    return unwrapped or absolute
 def _normalized_remote_image_url(url: str) -> str:
    """Normalise an image reference without a base URL and unwrap redirects.

    Empty references and ``data:`` URIs yield ``""``. Scheme-less remote
    references are given an ``https://`` prefix.
    """
    reference = _decoded_url_reference(str(url).strip())
    if not reference or reference.lower().startswith("data:"):
        return ""
    if _is_scheme_less_remote_image_url(reference):
        reference = f"https://{reference.lstrip('/')}"
    unwrapped = _unwrapped_image_url(reference)
    return unwrapped or reference
 def _unwrapped_image_url(url: str) -> str:
    """Return the image URL wrapped in an HTTP URL's query string, or ``""``.

    Query parameters are scanned in order. A parameter with a strong image
    key returns its absolutised value; one with a weak redirect-style key
    returns it only when the value looks like an image.
    """
    if not _is_http_url(url):
        return ""
    strong_keys = {
        "imgurl",
        "imageurl",
        "image_url",
        "mediaurl",
        "media_url",
        "contenturl",
        "content_url",
        "photo",
        "photo_url",
        "src",
        "source",
        "image",
        "img",
    }
    weak_keys = {"url", "u", "target", "redirect", "redirect_url"}
    def absolutize(candidate: str) -> str:
        # Turn a decoded parameter value into an absolute URL; "" if unusable.
        if not candidate or _is_http_url(candidate):
            return candidate
        if candidate.startswith("//"):
            return f"https:{candidate}"
        if _is_scheme_less_remote_image_url(candidate):
            return f"https://{candidate.lstrip('/')}"
        if candidate.startswith("/") or _url_looks_like_image(candidate):
            return urljoin(url, candidate)
        return ""
    for name, encoded in parse_qsl(urlparse(url).query, keep_blank_values=False):
        target = absolutize(_decoded_nested_url(encoded))
        if not target:
            continue
        key_text = name.lower().replace("-", "_")
        if key_text in strong_keys:
            return target
        if key_text in weak_keys and _url_looks_like_image(target):
            return target
    return ""
 def _is_scheme_less_remote_image_url(value: str) -> bool:
    text = str(value).strip().lstrip("/")
    if not _url_looks_like_image(text):
        return False
    first_segment = text.split("/", 1)[0]
    if first_segment in {".", ".."} or first_segment.startswith("."):
        return False
    return "." in first_segment and " " not in first_segment
 def _decoded_url_reference(value: str) -> str:
    """Return the nested-URL-decoded reference when decoding gives a usable URL.

    The stripped original is returned when decoding changes nothing or when
    the decoded text is neither an HTTP URL, a path-style reference, a
    scheme-less remote image URL nor image-like.
    """
    original = str(value).strip()
    unwrapped = _decoded_nested_url(original)
    if unwrapped == original:
        return original
    usable = (
        _is_http_url(unwrapped)
        or unwrapped.startswith(("/", "//", "./", "../"))
        or _is_scheme_less_remote_image_url(unwrapped)
        or _url_looks_like_image(unwrapped)
    )
    return unwrapped if usable else original
--- a/src/rights_filter/server/store_text.py
+++ b/src/rights_filter/server/store_text.py
@ -0,0 +1,27 @@
 """Pure text-normalization helpers shared by the SQLite store and its
 extracted submodules. Extracted from sqlite_store.py; behavior unchanged.
 """
 from __future__ import annotations
 from typing import Any
 def _text_list(value: Any) -> list[str]:
    if value is None:
        return []
    if isinstance(value, list):
        return [str(item).strip() for item in value if str(item).strip()]
    return [item.strip() for item in str(value).split(",") if item.strip()]
 def _unique_texts(values: Any) -> list[str]:
    seen: set[str] = set()
    result: list[str] = []
    for value in values:
        text = str(value).strip()
        if not text or text in seen:
            continue
        seen.add(text)
        result.append(text)
    return result
--- a/tests/rights_filter/test_review_fixes.py
+++ b/tests/rights_filter/test_review_fixes.py
@ -16,7 +16,8 @@ from rights_filter.analysis.search_result_promoter import SearchResultPromoter
 from rights_filter.domain.records import Evidence, EvidenceSource
 from rights_filter.integrations.naver_search import NaverSearchAdapter
 from rights_filter.integrations.search_policy import SearchApiPolicy
-from rights_filter.server.sqlite_store import CopyrighterStore, _decode_javascript_string
+from rights_filter.server.sqlite_store import CopyrighterStore
 from rights_filter.server.store_page_scrape import _decode_javascript_string
 # --- #1 CRITICAL: constraint migration must not cascade-delete evidence -------