refactor: extract store constants and remaining domain helpers

Move the shared constants and _bounded_int_env into store_constants (a leaf module with no internal imports), and move the remaining module-level domain helpers (validation, query signatures, search-hint evidence, watchlist selection, knowledge type/provenance) into store_serialization. sqlite_store.py now contains only the CopyrighterStore class plus thin imports: 3613 -> 3368 lines in this commit (5333 -> 3368 overall, -37%). All changes are behavior-preserving.
2026-06-20 21:38:03 +09:00 · 2026-06-20 21:38:03 +09:00 · 3be7b016ce
commit 3be7b016ce
parent 8e53139029
3 changed files with 314 additions and 273 deletions
--- a/src/rights_filter/server/sqlite_store.py
+++ b/src/rights_filter/server/sqlite_store.py
@ -28,7 +28,6 @@ from rights_filter.domain.records import (
    EvidenceSource,
    InMemoryRightsFilterRepository,
    KnowledgeBaseEntry,
-    KnowledgeEntryType,
 )
 from rights_filter.integrations.cloud_vision_web_detection import (
    CloudVisionWebDetectionAdapter,
@ -38,6 +37,18 @@ from rights_filter.integrations.env_clients import ProviderRuntime, build_provid
 from rights_filter.integrations.external_policy import ExternalApiPolicy
 from rights_filter.jobs.batch_analyzer import BatchAnalyzer, SubmissionImage
 from rights_filter.server.image_store import LocalSubmissionImageStore, SUPPORTED_IMAGE_SUFFIXES
+from rights_filter.server.store_constants import (
+    DEFAULT_COVERAGE_GOOD_THRESHOLD,
+    DEFAULT_COVERAGE_WARN_THRESHOLD,
+    DEFAULT_FACE_CROP_RETENTION_DAYS,
+    DEFAULT_QUERY_COVERAGE_GOOD_THRESHOLD,
+    DEFAULT_QUERY_COVERAGE_WARN_THRESHOLD,
+    EVIDENCE_OPERATOR_STATUSES,
+    MAX_COVERAGE_THRESHOLD,
+    MIN_COVERAGE_THRESHOLD,
+    NON_CONTRIBUTING_OPERATOR_STATUSES,
+    _bounded_int_env,
+)
 from rights_filter.server.store_remote_fetch import (
    _fetch_page_url_bytes,
    _fetch_stylesheet_url_bytes,
@ -58,27 +69,42 @@ from rights_filter.server.store_schema import (
    _ensure_typed_columns,
 )
 from rights_filter.server.store_serialization import (
+    _default_evidence_contribution,
    _domain_evidence_from_ui,
    _evidence_id,
    _evidence_matches_provider,
    _evidence_payload,
+    _existing_google_custom_query_signatures,
+    _existing_naver_query_signatures,
    _external_provider_ids,
    _external_provider_state_for_submission,
    _face_crop_web_evidence,
+    _google_custom_image_query_signature,
+    _google_custom_web_query_signature,
    _google_weak_label_title,
    _image_size_from_bytes,
    _image_suffix_from_url,
    _is_google_weak_label_payload,
+    _knowledge_entry_type,
    _knowledge_provenance,
+    _knowledge_type_value,
+    _naver_blog_query_signature,
+    _naver_query_signature,
+    _naver_web_query_signature,
    _now_label,
    _provider_item_failed,
    _provider_item_has_result,
+    _query_history_status,
    _safe_filename,
    _safe_image_suffix,
    _stable_id,
    _strip_html,
    _submission_payload,
+    _submission_search_hint_evidence,
    _timestamp_id,
+    _validate_payload,
+    _validate_table,
+    _watchlist_source_evidence,
 )
 from rights_filter.server.store_text import _text_list, _unique_texts
 from rights_filter.server.store_url_utils import (
@ -90,116 +116,6 @@ from rights_filter.server.store_url_utils import (
 )


-EVIDENCE_OPERATOR_STATUSES = {
-    "used_for_judgment": "판단에 사용",
-    "irrelevant": "무관",
-    "false_positive": "오탐",
-    "pending": "보류",
-}
-NON_CONTRIBUTING_OPERATOR_STATUSES = {"irrelevant", "false_positive"}
-SUBMISSION_FINAL_STATUSES = {"approved", "rejected", "corrected"}
-QUEUE_ID_PLACEHOLDER = ""
-STORE_TABLES = {
-    "submissions",
-    "evidence",
-    "providers",
-    "knowledge_entries",
-    "collection_candidates",
-    "corrections",
-    "audit_events",
-    "submission_queues",
-}
-RISK_BANDS = ("low", "medium", "high", "failed", "pending")
-DECISION_STATUSES = ("unreviewed", "held", "rejected", "approved", "corrected")
-EVIDENCE_STATUSES = (
-    "active",
-    "auto",
-    "manual",
-    "queued",
-    "rerun",
-    "weak",
-    "used_for_judgment",
-    "irrelevant",
-    "false_positive",
-    "pending",
-)
-KNOWLEDGE_STATUSES = ("confirmed", "watchlist", "excluded")
-COLLECTION_STATUSES = ("candidate", "promoted")
-DEFAULT_COVERAGE_GOOD_THRESHOLD = 70
-DEFAULT_COVERAGE_WARN_THRESHOLD = 40
-DEFAULT_QUERY_COVERAGE_GOOD_THRESHOLD = 70
-DEFAULT_QUERY_COVERAGE_WARN_THRESHOLD = 40
-MIN_COVERAGE_THRESHOLD = 0
-MAX_COVERAGE_THRESHOLD = 100
-DEFAULT_FACE_CROP_RETENTION_DAYS = 90
-PAYLOAD_REQUIRED_FIELDS = {
-    "submissions": {
-        "id",
-        "title",
-        "asset",
-        "riskScore",
-        "riskBand",
-        "decisionStatus",
-        "providerState",
-        "fileFacts",
-        "evidence",
-    },
-    "evidence": {
-        "id",
-        "source",
-        "title",
-        "confidence",
-        "status",
-        "submission_id",
-    },
-    "providers": {
-        "id",
-        "name",
-        "enabled",
-        "usage",
-        "quota",
-        "lastSuccess",
-        "lastFailure",
-    },
-    "knowledge_entries": {
-        "id",
-        "name",
-        "type",
-        "provenance",
-        "active",
-        "entryStatus",
-        "sampleFingerprints",
-    },
-    "collection_candidates": {
-        "id",
-        "provider",
-        "query",
-        "title",
-        "status",
-        "sourceUrl",
-        "collectedEpoch",
-        "sampleFingerprints",
-    },
-    "corrections": {
-        "id",
-    },
-}
-
-
-
-
-def _bounded_int_env(value: str | None, default: int, minimum: int, maximum: int) -> int:
-    try:
-        parsed = int(value) if value is not None else default
-    except (TypeError, ValueError):
-        return default
-    if parsed < minimum:
-        return minimum
-    if parsed > maximum:
-        return maximum
-    return parsed
-
-
 class CopyrighterStore:
    def __init__(
        self,
@ -3450,164 +3366,3 @@ class CopyrighterStore:
        return grouped


-def _validate_table(table: str) -> None:
-    if table not in STORE_TABLES:
-        raise ValueError(f"unsupported store table: {table}")
-
-
-def _validate_payload(table: str, id_value: str, payload: dict[str, Any]) -> None:
-    _validate_table(table)
-    if not isinstance(payload, dict):
-        raise ValueError(f"{table} payload must be an object")
-
-    required = PAYLOAD_REQUIRED_FIELDS.get(table, set())
-    missing = sorted(field for field in required if field not in payload)
-    if missing:
-        raise ValueError(f"{table} payload missing required fields: {', '.join(missing)}")
-
-    if table != "audit_events" and "id" in payload and str(payload["id"]) != str(id_value):
-        raise ValueError(f"{table} payload id does not match storage id")
-
-
-def _existing_naver_query_signatures(evidence: list[dict[str, Any]]) -> set[str]:
-    return {
-        str(item.get("querySignature") or _naver_query_signature(str(item.get("query", ""))))
-        for item in evidence
-        if item.get("query") and item.get("source") in {"naver", "failure"}
-    }
-
-
-def _existing_google_custom_query_signatures(evidence: list[dict[str, Any]]) -> set[str]:
-    return {
-        str(item.get("querySignature") or "")
-        for item in evidence
-        if item.get("query")
-        and item.get("source") in {"google", "failure"}
-        and str(item.get("domain", "")) == "google_custom_search"
-        and item.get("querySignature")
-    }
-
-
-def _google_custom_image_query_signature(query: str) -> str:
-    return "google-custom-image:" + " ".join(query.lower().split())
-
-
-def _google_custom_web_query_signature(query: str) -> str:
-    return "google-custom-web:" + " ".join(query.lower().split())
-
-
-def _naver_query_signature(query: str) -> str:
-    return "naver:" + " ".join(query.lower().split())
-
-
-def _naver_blog_query_signature(query: str) -> str:
-    return "naver-blog:" + " ".join(query.lower().split())
-
-
-def _naver_web_query_signature(query: str) -> str:
-    return "naver-web:" + " ".join(query.lower().split())
-
-
-def _submission_search_hint_evidence(record: dict[str, Any]) -> list[Evidence]:
-    hints: list[Evidence] = []
-    title = str(record.get("title", "")).strip()
-    if title:
-        hints.append(_local_query_hint_evidence(title, "title"))
-
-    file_value = str(record.get("file", record.get("asset", ""))).strip()
-    file_stem = Path(urlparse(file_value).path).stem if file_value else ""
-    if file_stem and file_stem != title:
-        hints.append(_local_query_hint_evidence(file_stem, "file"))
-    return hints
-
-
-def _local_query_hint_evidence(query: str, hint_source: str) -> Evidence:
-    return Evidence(
-        source=EvidenceSource.FINGERPRINT,
-        reason="Local submission search hint",
-        confidence=0.0,
-        data={
-            "local_query_hint": True,
-            "query": query,
-            "hint_source": hint_source,
-        },
-    )
-
-
-def _query_history_status(evidence: list[Evidence]) -> str:
-    if any(item.source == EvidenceSource.ENRICHMENT_FAILURE for item in evidence):
-        return "failed"
-    if evidence and all(item.source == EvidenceSource.SEARCH_SKIPPED for item in evidence):
-        return "skipped"
-    return "auto"
-
-
-def _default_evidence_contribution(payload: dict[str, Any]) -> bool:
-    source = str(payload.get("source", ""))
-    if source in {"llm", "failure"}:
-        return False
-    if bool(payload.get("faceCropSearch", False)):
-        return False
-    if _is_google_weak_label_payload(payload):
-        return False
-    return True
-
-
-def _watchlist_source_evidence(evidence: list[dict[str, Any]]) -> list[dict[str, Any]]:
-    used = [
-        item
-        for item in evidence
-        if item.get("operatorStatus") == "used_for_judgment"
-        and item.get("id")
-    ]
-    if used:
-        return _sorted_evidence_for_watchlist(used)[:5]
-
-    candidates = [
-        item
-        for item in evidence
-        if item.get("id")
-        and item.get("operatorStatus") not in NON_CONTRIBUTING_OPERATOR_STATUSES
-        and item.get("source") not in {"llm", "failure"}
-        and item.get("title") != "Image fingerprints generated"
-    ]
-    if candidates:
-        return _sorted_evidence_for_watchlist(candidates)[:5]
-
-    fallback = [
-        item
-        for item in evidence
-        if item.get("id")
-        and item.get("operatorStatus") not in NON_CONTRIBUTING_OPERATOR_STATUSES
-    ]
-    return _sorted_evidence_for_watchlist(fallback)[:5]
-
-
-def _sorted_evidence_for_watchlist(evidence: list[dict[str, Any]]) -> list[dict[str, Any]]:
-    return sorted(
-        evidence,
-        key=lambda item: (
-            1 if item.get("contributed", True) else 0,
-            float(item.get("confidence", 0) or 0),
-            str(item.get("retrievedAt", "")),
-        ),
-        reverse=True,
-    )
-
-
-def _knowledge_type_value(value: str) -> str:
-    normalized = value.strip() or "other"
-    aliases = {
-        "public_figure": "celebrity",
-        "rejected_reference": "rejected_image",
-    }
-    return aliases.get(normalized, normalized)
-
-
-def _knowledge_entry_type(value: str) -> KnowledgeEntryType:
-    try:
-        return KnowledgeEntryType(_knowledge_type_value(value))
-    except ValueError:
-        return KnowledgeEntryType.OTHER
-
-
--- a/src/rights_filter/server/store_constants.py
+++ b/src/rights_filter/server/store_constants.py
@ -0,0 +1,113 @@
+"""Shared constants and small config helpers for the SQLite store and its
+extracted submodules. Leaf module (no internal imports) so any store module can
+depend on it without import cycles. Extracted from sqlite_store.py unchanged.
+"""
+
+from __future__ import annotations
+
+EVIDENCE_OPERATOR_STATUSES = {
+    "used_for_judgment": "판단에 사용",
+    "irrelevant": "무관",
+    "false_positive": "오탐",
+    "pending": "보류",
+}
+NON_CONTRIBUTING_OPERATOR_STATUSES = {"irrelevant", "false_positive"}
+SUBMISSION_FINAL_STATUSES = {"approved", "rejected", "corrected"}
+QUEUE_ID_PLACEHOLDER = ""
+STORE_TABLES = {
+    "submissions",
+    "evidence",
+    "providers",
+    "knowledge_entries",
+    "collection_candidates",
+    "corrections",
+    "audit_events",
+    "submission_queues",
+}
+RISK_BANDS = ("low", "medium", "high", "failed", "pending")
+DECISION_STATUSES = ("unreviewed", "held", "rejected", "approved", "corrected")
+EVIDENCE_STATUSES = (
+    "active",
+    "auto",
+    "manual",
+    "queued",
+    "rerun",
+    "weak",
+    "used_for_judgment",
+    "irrelevant",
+    "false_positive",
+    "pending",
+)
+KNOWLEDGE_STATUSES = ("confirmed", "watchlist", "excluded")
+COLLECTION_STATUSES = ("candidate", "promoted")
+DEFAULT_COVERAGE_GOOD_THRESHOLD = 70
+DEFAULT_COVERAGE_WARN_THRESHOLD = 40
+DEFAULT_QUERY_COVERAGE_GOOD_THRESHOLD = 70
+DEFAULT_QUERY_COVERAGE_WARN_THRESHOLD = 40
+MIN_COVERAGE_THRESHOLD = 0
+MAX_COVERAGE_THRESHOLD = 100
+DEFAULT_FACE_CROP_RETENTION_DAYS = 90
+PAYLOAD_REQUIRED_FIELDS = {
+    "submissions": {
+        "id",
+        "title",
+        "asset",
+        "riskScore",
+        "riskBand",
+        "decisionStatus",
+        "providerState",
+        "fileFacts",
+        "evidence",
+    },
+    "evidence": {
+        "id",
+        "source",
+        "title",
+        "confidence",
+        "status",
+        "submission_id",
+    },
+    "providers": {
+        "id",
+        "name",
+        "enabled",
+        "usage",
+        "quota",
+        "lastSuccess",
+        "lastFailure",
+    },
+    "knowledge_entries": {
+        "id",
+        "name",
+        "type",
+        "provenance",
+        "active",
+        "entryStatus",
+        "sampleFingerprints",
+    },
+    "collection_candidates": {
+        "id",
+        "provider",
+        "query",
+        "title",
+        "status",
+        "sourceUrl",
+        "collectedEpoch",
+        "sampleFingerprints",
+    },
+    "corrections": {
+        "id",
+    },
+}
+
+
+def _bounded_int_env(value: str | None, default: int, minimum: int, maximum: int) -> int:
+    try:
+        parsed = int(value) if value is not None else default
+    except (TypeError, ValueError):
+        return default
+    if parsed < minimum:
+        return minimum
+    if parsed > maximum:
+        return maximum
+    return parsed
--- a/src/rights_filter/server/store_serialization.py
+++ b/src/rights_filter/server/store_serialization.py
@ -18,8 +18,18 @@ from pathlib import Path
 from typing import Any
 from urllib.parse import urlparse

-from rights_filter.domain.records import Evidence, EvidenceSource, KnowledgeProvenance
+from rights_filter.domain.records import (
+    Evidence,
+    EvidenceSource,
+    KnowledgeEntryType,
+    KnowledgeProvenance,
+)
 from rights_filter.server.image_store import SUPPORTED_IMAGE_SUFFIXES
+from rights_filter.server.store_constants import (
+    NON_CONTRIBUTING_OPERATOR_STATUSES,
+    PAYLOAD_REQUIRED_FIELDS,
+    STORE_TABLES,
+)
 from rights_filter.server.store_text import _text_list, _unique_texts


@ -422,3 +432,166 @@ def _label_to_epoch(value: str) -> int:

 def _timestamp_id() -> str:
    return datetime.now().strftime("%Y%m%d%H%M%S%f")
+
+
+def _validate_table(table: str) -> None:
+    if table not in STORE_TABLES:
+        raise ValueError(f"unsupported store table: {table}")
+
+
+def _validate_payload(table: str, id_value: str, payload: dict[str, Any]) -> None:
+    _validate_table(table)
+    if not isinstance(payload, dict):
+        raise ValueError(f"{table} payload must be an object")
+
+    required = PAYLOAD_REQUIRED_FIELDS.get(table, set())
+    missing = sorted(field for field in required if field not in payload)
+    if missing:
+        raise ValueError(f"{table} payload missing required fields: {', '.join(missing)}")
+
+    if table != "audit_events" and "id" in payload and str(payload["id"]) != str(id_value):
+        raise ValueError(f"{table} payload id does not match storage id")
+
+
+def _existing_naver_query_signatures(evidence: list[dict[str, Any]]) -> set[str]:
+    return {
+        str(item.get("querySignature") or _naver_query_signature(str(item.get("query", ""))))
+        for item in evidence
+        if item.get("query") and item.get("source") in {"naver", "failure"}
+    }
+
+
+def _existing_google_custom_query_signatures(evidence: list[dict[str, Any]]) -> set[str]:
+    return {
+        str(item.get("querySignature") or "")
+        for item in evidence
+        if item.get("query")
+        and item.get("source") in {"google", "failure"}
+        and str(item.get("domain", "")) == "google_custom_search"
+        and item.get("querySignature")
+    }
+
+
+def _google_custom_image_query_signature(query: str) -> str:
+    return "google-custom-image:" + " ".join(query.lower().split())
+
+
+def _google_custom_web_query_signature(query: str) -> str:
+    return "google-custom-web:" + " ".join(query.lower().split())
+
+
+def _naver_query_signature(query: str) -> str:
+    return "naver:" + " ".join(query.lower().split())
+
+
+def _naver_blog_query_signature(query: str) -> str:
+    return "naver-blog:" + " ".join(query.lower().split())
+
+
+def _naver_web_query_signature(query: str) -> str:
+    return "naver-web:" + " ".join(query.lower().split())
+
+
+def _submission_search_hint_evidence(record: dict[str, Any]) -> list[Evidence]:
+    hints: list[Evidence] = []
+    title = str(record.get("title", "")).strip()
+    if title:
+        hints.append(_local_query_hint_evidence(title, "title"))
+
+    file_value = str(record.get("file", record.get("asset", ""))).strip()
+    file_stem = Path(urlparse(file_value).path).stem if file_value else ""
+    if file_stem and file_stem != title:
+        hints.append(_local_query_hint_evidence(file_stem, "file"))
+    return hints
+
+
+def _local_query_hint_evidence(query: str, hint_source: str) -> Evidence:
+    return Evidence(
+        source=EvidenceSource.FINGERPRINT,
+        reason="Local submission search hint",
+        confidence=0.0,
+        data={
+            "local_query_hint": True,
+            "query": query,
+            "hint_source": hint_source,
+        },
+    )
+
+
+def _query_history_status(evidence: list[Evidence]) -> str:
+    if any(item.source == EvidenceSource.ENRICHMENT_FAILURE for item in evidence):
+        return "failed"
+    if evidence and all(item.source == EvidenceSource.SEARCH_SKIPPED for item in evidence):
+        return "skipped"
+    return "auto"
+
+
+def _default_evidence_contribution(payload: dict[str, Any]) -> bool:
+    source = str(payload.get("source", ""))
+    if source in {"llm", "failure"}:
+        return False
+    if bool(payload.get("faceCropSearch", False)):
+        return False
+    if _is_google_weak_label_payload(payload):
+        return False
+    return True
+
+
+def _watchlist_source_evidence(evidence: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    used = [
+        item
+        for item in evidence
+        if item.get("operatorStatus") == "used_for_judgment"
+        and item.get("id")
+    ]
+    if used:
+        return _sorted_evidence_for_watchlist(used)[:5]
+
+    candidates = [
+        item
+        for item in evidence
+        if item.get("id")
+        and item.get("operatorStatus") not in NON_CONTRIBUTING_OPERATOR_STATUSES
+        and item.get("source") not in {"llm", "failure"}
+        and item.get("title") != "Image fingerprints generated"
+    ]
+    if candidates:
+        return _sorted_evidence_for_watchlist(candidates)[:5]
+
+    fallback = [
+        item
+        for item in evidence
+        if item.get("id")
+        and item.get("operatorStatus") not in NON_CONTRIBUTING_OPERATOR_STATUSES
+    ]
+    return _sorted_evidence_for_watchlist(fallback)[:5]
+
+
+def _sorted_evidence_for_watchlist(evidence: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    return sorted(
+        evidence,
+        key=lambda item: (
+            1 if item.get("contributed", True) else 0,
+            float(item.get("confidence", 0) or 0),
+            str(item.get("retrievedAt", "")),
+        ),
+        reverse=True,
+    )
+
+
+def _knowledge_type_value(value: str) -> str:
+    normalized = value.strip() or "other"
+    aliases = {
+        "public_figure": "celebrity",
+        "rejected_reference": "rejected_image",
+    }
+    return aliases.get(normalized, normalized)
+
+
+def _knowledge_entry_type(value: str) -> KnowledgeEntryType:
+    try:
+        return KnowledgeEntryType(_knowledge_type_value(value))
+    except ValueError:
+        return KnowledgeEntryType.OTHER
+
+