refactor: extract store constants and remaining domain helpers

Move shared constants + _bounded_int_env into store_constants (a leaf module), and the remaining module-level domain helpers (validation, query signatures, search-hint evidence, watchlist selection, knowledge type/provenance) into store_serialization. sqlite_store.py is now the CopyrighterStore class plus thin imports: 3613 -> 3368 lines (5333 -> 3368 overall, -37%). All behavior-preserving.
2026-06-20 21:38:03 +09:00 · 2026-06-20 21:38:03 +09:00 · 3be7b016ce
commit 3be7b016ce
parent 8e53139029
3 changed files with 314 additions and 273 deletions
--- a/src/rights_filter/server/sqlite_store.py
+++ b/src/rights_filter/server/sqlite_store.py
@ -28,7 +28,6 @@ from rights_filter.domain.records import (
    EvidenceSource,
    InMemoryRightsFilterRepository,
    KnowledgeBaseEntry,
    KnowledgeEntryType,
 )
 from rights_filter.integrations.cloud_vision_web_detection import (
    CloudVisionWebDetectionAdapter,
@ -38,6 +37,18 @@ from rights_filter.integrations.env_clients import ProviderRuntime, build_provid
 from rights_filter.integrations.external_policy import ExternalApiPolicy
 from rights_filter.jobs.batch_analyzer import BatchAnalyzer, SubmissionImage
 from rights_filter.server.image_store import LocalSubmissionImageStore, SUPPORTED_IMAGE_SUFFIXES
 from rights_filter.server.store_constants import (
    DEFAULT_COVERAGE_GOOD_THRESHOLD,
    DEFAULT_COVERAGE_WARN_THRESHOLD,
    DEFAULT_FACE_CROP_RETENTION_DAYS,
    DEFAULT_QUERY_COVERAGE_GOOD_THRESHOLD,
    DEFAULT_QUERY_COVERAGE_WARN_THRESHOLD,
    EVIDENCE_OPERATOR_STATUSES,
    MAX_COVERAGE_THRESHOLD,
    MIN_COVERAGE_THRESHOLD,
    NON_CONTRIBUTING_OPERATOR_STATUSES,
    _bounded_int_env,
 )
 from rights_filter.server.store_remote_fetch import (
    _fetch_page_url_bytes,
    _fetch_stylesheet_url_bytes,
@ -58,27 +69,42 @@ from rights_filter.server.store_schema import (
    _ensure_typed_columns,
 )
 from rights_filter.server.store_serialization import (
    _default_evidence_contribution,
    _domain_evidence_from_ui,
    _evidence_id,
    _evidence_matches_provider,
    _evidence_payload,
    _existing_google_custom_query_signatures,
    _existing_naver_query_signatures,
    _external_provider_ids,
    _external_provider_state_for_submission,
    _face_crop_web_evidence,
    _google_custom_image_query_signature,
    _google_custom_web_query_signature,
    _google_weak_label_title,
    _image_size_from_bytes,
    _image_suffix_from_url,
    _is_google_weak_label_payload,
    _knowledge_entry_type,
    _knowledge_provenance,
    _knowledge_type_value,
    _naver_blog_query_signature,
    _naver_query_signature,
    _naver_web_query_signature,
    _now_label,
    _provider_item_failed,
    _provider_item_has_result,
    _query_history_status,
    _safe_filename,
    _safe_image_suffix,
    _stable_id,
    _strip_html,
    _submission_payload,
    _submission_search_hint_evidence,
    _timestamp_id,
    _validate_payload,
    _validate_table,
    _watchlist_source_evidence,
 )
 from rights_filter.server.store_text import _text_list, _unique_texts
 from rights_filter.server.store_url_utils import (
@ -90,116 +116,6 @@ from rights_filter.server.store_url_utils import (
 )
 # Operator review statuses for evidence: status id -> Korean label string.
 # NOTE(review): the labels look like UI display text — confirm against callers.
 EVIDENCE_OPERATOR_STATUSES = {
    "used_for_judgment": "판단에 사용",
    "irrelevant": "무관",
    "false_positive": "오탐",
    "pending": "보류",
 }
 # Operator statuses that _watchlist_source_evidence excludes from its
 # candidate and fallback selections.
 NON_CONTRIBUTING_OPERATOR_STATUSES = {"irrelevant", "false_positive"}
 # NOTE(review): presumably the decision statuses that end a review — usage is
 # not visible in this hunk; confirm.
 SUBMISSION_FINAL_STATUSES = {"approved", "rejected", "corrected"}
 QUEUE_ID_PLACEHOLDER = ""
 # Table names accepted by _validate_table; any other name raises ValueError.
 STORE_TABLES = {
    "submissions",
    "evidence",
    "providers",
    "knowledge_entries",
    "collection_candidates",
    "corrections",
    "audit_events",
    "submission_queues",
 }
 # NOTE(review): the five tuples below look like the allowed values of the
 # matching status fields — their consumers are not visible here; confirm.
 RISK_BANDS = ("low", "medium", "high", "failed", "pending")
 DECISION_STATUSES = ("unreviewed", "held", "rejected", "approved", "corrected")
 EVIDENCE_STATUSES = (
    "active",
    "auto",
    "manual",
    "queued",
    "rerun",
    "weak",
    "used_for_judgment",
    "irrelevant",
    "false_positive",
    "pending",
 )
 KNOWLEDGE_STATUSES = ("confirmed", "watchlist", "excluded")
 COLLECTION_STATUSES = ("candidate", "promoted")
 # Coverage threshold defaults and limits.
 # NOTE(review): presumably fed to _bounded_int_env as default/minimum/maximum
 # — confirm against the store's configuration code.
 DEFAULT_COVERAGE_GOOD_THRESHOLD = 70
 DEFAULT_COVERAGE_WARN_THRESHOLD = 40
 DEFAULT_QUERY_COVERAGE_GOOD_THRESHOLD = 70
 DEFAULT_QUERY_COVERAGE_WARN_THRESHOLD = 40
 MIN_COVERAGE_THRESHOLD = 0
 MAX_COVERAGE_THRESHOLD = 100
 DEFAULT_FACE_CROP_RETENTION_DAYS = 90
 # Per-table field names that _validate_payload requires to be present in a
 # payload. Tables without an entry here have no required fields.
 PAYLOAD_REQUIRED_FIELDS = {
    "submissions": {
        "id",
        "title",
        "asset",
        "riskScore",
        "riskBand",
        "decisionStatus",
        "providerState",
        "fileFacts",
        "evidence",
    },
    "evidence": {
        "id",
        "source",
        "title",
        "confidence",
        "status",
        "submission_id",
    },
    "providers": {
        "id",
        "name",
        "enabled",
        "usage",
        "quota",
        "lastSuccess",
        "lastFailure",
    },
    "knowledge_entries": {
        "id",
        "name",
        "type",
        "provenance",
        "active",
        "entryStatus",
        "sampleFingerprints",
    },
    "collection_candidates": {
        "id",
        "provider",
        "query",
        "title",
        "status",
        "sourceUrl",
        "collectedEpoch",
        "sampleFingerprints",
    },
    "corrections": {
        "id",
    },
 }
 def _bounded_int_env(value: str | None, default: int, minimum: int, maximum: int) -> int:
    try:
        parsed = int(value) if value is not None else default
    except (TypeError, ValueError):
        return default
    if parsed < minimum:
        return minimum
    if parsed > maximum:
        return maximum
    return parsed
 class CopyrighterStore:
    def __init__(
        self,
@ -3450,164 +3366,3 @@ class CopyrighterStore:
        return grouped
 def _validate_table(table: str) -> None:
    """Raise ``ValueError`` unless ``table`` is a member of ``STORE_TABLES``."""
    if table in STORE_TABLES:
        return
    raise ValueError(f"unsupported store table: {table}")
 def _validate_payload(table: str, id_value: str, payload: dict[str, Any]) -> None:
    """Check a payload before it is stored in ``table`` under ``id_value``.

    Raises:
        ValueError: If the table is unsupported, the payload is not a dict, a
            required field for the table is absent, or the payload carries an
            ``id`` whose string form differs from ``id_value`` (the id check
            is not applied to ``audit_events``).
    """
    _validate_table(table)
    if not isinstance(payload, dict):
        raise ValueError(f"{table} payload must be an object")
    missing = [
        name
        for name in PAYLOAD_REQUIRED_FIELDS.get(table, set())
        if name not in payload
    ]
    if missing:
        # Sorted so the error message is stable regardless of set ordering.
        missing.sort()
        raise ValueError(f"{table} payload missing required fields: {', '.join(missing)}")
    if table == "audit_events" or "id" not in payload:
        return
    if str(payload["id"]) != str(id_value):
        raise ValueError(f"{table} payload id does not match storage id")
 def _existing_naver_query_signatures(evidence: list[dict[str, Any]]) -> set[str]:
    return {
        str(item.get("querySignature") or _naver_query_signature(str(item.get("query", ""))))
        for item in evidence
        if item.get("query") and item.get("source") in {"naver", "failure"}
    }
 def _existing_google_custom_query_signatures(evidence: list[dict[str, Any]]) -> set[str]:
    return {
        str(item.get("querySignature") or "")
        for item in evidence
        if item.get("query")
        and item.get("source") in {"google", "failure"}
        and str(item.get("domain", "")) == "google_custom_search"
        and item.get("querySignature")
    }
 def _google_custom_image_query_signature(query: str) -> str:
    return "google-custom-image:" + " ".join(query.lower().split())
 def _google_custom_web_query_signature(query: str) -> str:
    return "google-custom-web:" + " ".join(query.lower().split())
 def _naver_query_signature(query: str) -> str:
    return "naver:" + " ".join(query.lower().split())
 def _naver_blog_query_signature(query: str) -> str:
    return "naver-blog:" + " ".join(query.lower().split())
 def _naver_web_query_signature(query: str) -> str:
    return "naver-web:" + " ".join(query.lower().split())
 def _submission_search_hint_evidence(record: dict[str, Any]) -> list[Evidence]:
    hints: list[Evidence] = []
    title = str(record.get("title", "")).strip()
    if title:
        hints.append(_local_query_hint_evidence(title, "title"))
    file_value = str(record.get("file", record.get("asset", ""))).strip()
    file_stem = Path(urlparse(file_value).path).stem if file_value else ""
    if file_stem and file_stem != title:
        hints.append(_local_query_hint_evidence(file_stem, "file"))
    return hints
 def _local_query_hint_evidence(query: str, hint_source: str) -> Evidence:
    """Wrap a locally derived search query as zero-confidence fingerprint evidence.

    ``query`` and ``hint_source`` are stored verbatim in the evidence ``data``
    alongside a ``local_query_hint`` marker.
    """
    hint_data = {
        "local_query_hint": True,
        "query": query,
        "hint_source": hint_source,
    }
    return Evidence(
        source=EvidenceSource.FINGERPRINT,
        reason="Local submission search hint",
        confidence=0.0,
        data=hint_data,
    )
 def _query_history_status(evidence: list[Evidence]) -> str:
    if any(item.source == EvidenceSource.ENRICHMENT_FAILURE for item in evidence):
        return "failed"
    if evidence and all(item.source == EvidenceSource.SEARCH_SKIPPED for item in evidence):
        return "skipped"
    return "auto"
 def _default_evidence_contribution(payload: dict[str, Any]) -> bool:
    source = str(payload.get("source", ""))
    if source in {"llm", "failure"}:
        return False
    if bool(payload.get("faceCropSearch", False)):
        return False
    if _is_google_weak_label_payload(payload):
        return False
    return True
 def _watchlist_source_evidence(evidence: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Pick up to five evidence items to cite as the source of a watchlist entry.

    Items without a truthy ``id`` are never selected. Selection tiers, first
    non-empty tier wins:

    1. Items whose ``operatorStatus`` is ``"used_for_judgment"``.
    2. Items whose status is not non-contributing, whose source is neither
       ``llm`` nor ``failure``, and whose title is not the fingerprint notice.
    3. Any item whose status is not non-contributing.

    The chosen tier is ordered by ``_sorted_evidence_for_watchlist``.
    """
    identified = [entry for entry in evidence if entry.get("id")]
    judged = [
        entry
        for entry in identified
        if entry.get("operatorStatus") == "used_for_judgment"
    ]
    if judged:
        return _sorted_evidence_for_watchlist(judged)[:5]
    contributing = [
        entry
        for entry in identified
        if entry.get("operatorStatus") not in NON_CONTRIBUTING_OPERATOR_STATUSES
    ]
    preferred = [
        entry
        for entry in contributing
        if entry.get("source") not in {"llm", "failure"}
        and entry.get("title") != "Image fingerprints generated"
    ]
    chosen = preferred if preferred else contributing
    return _sorted_evidence_for_watchlist(chosen)[:5]
 def _sorted_evidence_for_watchlist(evidence: list[dict[str, Any]]) -> list[dict[str, Any]]:
    return sorted(
        evidence,
        key=lambda item: (
            1 if item.get("contributed", True) else 0,
            float(item.get("confidence", 0) or 0),
            str(item.get("retrievedAt", "")),
        ),
        reverse=True,
    )
 def _knowledge_type_value(value: str) -> str:
    normalized = value.strip() or "other"
    aliases = {
        "public_figure": "celebrity",
        "rejected_reference": "rejected_image",
    }
    return aliases.get(normalized, normalized)
 def _knowledge_entry_type(value: str) -> KnowledgeEntryType:
    """Convert a type string to ``KnowledgeEntryType``.

    The string is normalised with ``_knowledge_type_value`` first. Values the
    enum does not recognise fall back to ``KnowledgeEntryType.OTHER``.
    """
    canonical = _knowledge_type_value(value)
    try:
        return KnowledgeEntryType(canonical)
    except ValueError:
        return KnowledgeEntryType.OTHER
--- a/src/rights_filter/server/store_constants.py
+++ b/src/rights_filter/server/store_constants.py
@ -0,0 +1,113 @@
 """Shared constants and small config helpers for the SQLite store and its
 extracted submodules. Leaf module (no internal imports) so any store module can
 depend on it without import cycles. Extracted from sqlite_store.py unchanged.
 """
 from __future__ import annotations
 # Operator review statuses for evidence: status id -> Korean label string.
 # NOTE(review): the labels look like UI display text — confirm against callers.
 EVIDENCE_OPERATOR_STATUSES = {
    "used_for_judgment": "판단에 사용",
    "irrelevant": "무관",
    "false_positive": "오탐",
    "pending": "보류",
 }
 # Operator statuses that _watchlist_source_evidence (store_serialization)
 # excludes from its candidate and fallback selections.
 NON_CONTRIBUTING_OPERATOR_STATUSES = {"irrelevant", "false_positive"}
 # NOTE(review): presumably the decision statuses that end a review — usage is
 # not visible here; confirm.
 SUBMISSION_FINAL_STATUSES = {"approved", "rejected", "corrected"}
 QUEUE_ID_PLACEHOLDER = ""
 # Table names accepted by _validate_table (store_serialization); any other
 # name raises ValueError there.
 STORE_TABLES = {
    "submissions",
    "evidence",
    "providers",
    "knowledge_entries",
    "collection_candidates",
    "corrections",
    "audit_events",
    "submission_queues",
 }
 # NOTE(review): the five tuples below look like the allowed values of the
 # matching status fields — their consumers are not visible here; confirm.
 RISK_BANDS = ("low", "medium", "high", "failed", "pending")
 DECISION_STATUSES = ("unreviewed", "held", "rejected", "approved", "corrected")
 EVIDENCE_STATUSES = (
    "active",
    "auto",
    "manual",
    "queued",
    "rerun",
    "weak",
    "used_for_judgment",
    "irrelevant",
    "false_positive",
    "pending",
 )
 KNOWLEDGE_STATUSES = ("confirmed", "watchlist", "excluded")
 COLLECTION_STATUSES = ("candidate", "promoted")
 # Coverage threshold defaults and limits.
 # NOTE(review): presumably fed to _bounded_int_env as default/minimum/maximum
 # — confirm against the store's configuration code.
 DEFAULT_COVERAGE_GOOD_THRESHOLD = 70
 DEFAULT_COVERAGE_WARN_THRESHOLD = 40
 DEFAULT_QUERY_COVERAGE_GOOD_THRESHOLD = 70
 DEFAULT_QUERY_COVERAGE_WARN_THRESHOLD = 40
 MIN_COVERAGE_THRESHOLD = 0
 MAX_COVERAGE_THRESHOLD = 100
 DEFAULT_FACE_CROP_RETENTION_DAYS = 90
 # Per-table field names that _validate_payload (store_serialization) requires
 # to be present in a payload. Tables without an entry have no required fields.
 PAYLOAD_REQUIRED_FIELDS = {
    "submissions": {
        "id",
        "title",
        "asset",
        "riskScore",
        "riskBand",
        "decisionStatus",
        "providerState",
        "fileFacts",
        "evidence",
    },
    "evidence": {
        "id",
        "source",
        "title",
        "confidence",
        "status",
        "submission_id",
    },
    "providers": {
        "id",
        "name",
        "enabled",
        "usage",
        "quota",
        "lastSuccess",
        "lastFailure",
    },
    "knowledge_entries": {
        "id",
        "name",
        "type",
        "provenance",
        "active",
        "entryStatus",
        "sampleFingerprints",
    },
    "collection_candidates": {
        "id",
        "provider",
        "query",
        "title",
        "status",
        "sourceUrl",
        "collectedEpoch",
        "sampleFingerprints",
    },
    "corrections": {
        "id",
    },
 }
 def _bounded_int_env(value: str | None, default: int, minimum: int, maximum: int) -> int:
    try:
        parsed = int(value) if value is not None else default
    except (TypeError, ValueError):
        return default
    if parsed < minimum:
        return minimum
    if parsed > maximum:
        return maximum
    return parsed
--- a/src/rights_filter/server/store_serialization.py
+++ b/src/rights_filter/server/store_serialization.py
@ -18,8 +18,18 @@ from pathlib import Path
 from typing import Any
 from urllib.parse import urlparse
-from rights_filter.domain.records import Evidence, EvidenceSource, KnowledgeProvenance
+from rights_filter.domain.records import (
    Evidence,
    EvidenceSource,
    KnowledgeEntryType,
    KnowledgeProvenance,
 )
 from rights_filter.server.image_store import SUPPORTED_IMAGE_SUFFIXES
 from rights_filter.server.store_constants import (
    NON_CONTRIBUTING_OPERATOR_STATUSES,
    PAYLOAD_REQUIRED_FIELDS,
    STORE_TABLES,
 )
 from rights_filter.server.store_text import _text_list, _unique_texts
@ -422,3 +432,166 @@ def _label_to_epoch(value: str) -> int:
 def _timestamp_id() -> str:
    return datetime.now().strftime("%Y%m%d%H%M%S%f")
 def _validate_table(table: str) -> None:
    """Raise ``ValueError`` unless ``table`` is a member of ``STORE_TABLES``."""
    if table in STORE_TABLES:
        return
    raise ValueError(f"unsupported store table: {table}")
 def _validate_payload(table: str, id_value: str, payload: dict[str, Any]) -> None:
    """Check a payload before it is stored in ``table`` under ``id_value``.

    Raises:
        ValueError: If the table is unsupported, the payload is not a dict, a
            required field for the table is absent, or the payload carries an
            ``id`` whose string form differs from ``id_value`` (the id check
            is not applied to ``audit_events``).
    """
    _validate_table(table)
    if not isinstance(payload, dict):
        raise ValueError(f"{table} payload must be an object")
    missing = [
        name
        for name in PAYLOAD_REQUIRED_FIELDS.get(table, set())
        if name not in payload
    ]
    if missing:
        # Sorted so the error message is stable regardless of set ordering.
        missing.sort()
        raise ValueError(f"{table} payload missing required fields: {', '.join(missing)}")
    if table == "audit_events" or "id" not in payload:
        return
    if str(payload["id"]) != str(id_value):
        raise ValueError(f"{table} payload id does not match storage id")
 def _existing_naver_query_signatures(evidence: list[dict[str, Any]]) -> set[str]:
    return {
        str(item.get("querySignature") or _naver_query_signature(str(item.get("query", ""))))
        for item in evidence
        if item.get("query") and item.get("source") in {"naver", "failure"}
    }
 def _existing_google_custom_query_signatures(evidence: list[dict[str, Any]]) -> set[str]:
    return {
        str(item.get("querySignature") or "")
        for item in evidence
        if item.get("query")
        and item.get("source") in {"google", "failure"}
        and str(item.get("domain", "")) == "google_custom_search"
        and item.get("querySignature")
    }
 def _google_custom_image_query_signature(query: str) -> str:
    return "google-custom-image:" + " ".join(query.lower().split())
 def _google_custom_web_query_signature(query: str) -> str:
    return "google-custom-web:" + " ".join(query.lower().split())
 def _naver_query_signature(query: str) -> str:
    return "naver:" + " ".join(query.lower().split())
 def _naver_blog_query_signature(query: str) -> str:
    return "naver-blog:" + " ".join(query.lower().split())
 def _naver_web_query_signature(query: str) -> str:
    return "naver-web:" + " ".join(query.lower().split())
 def _submission_search_hint_evidence(record: dict[str, Any]) -> list[Evidence]:
    hints: list[Evidence] = []
    title = str(record.get("title", "")).strip()
    if title:
        hints.append(_local_query_hint_evidence(title, "title"))
    file_value = str(record.get("file", record.get("asset", ""))).strip()
    file_stem = Path(urlparse(file_value).path).stem if file_value else ""
    if file_stem and file_stem != title:
        hints.append(_local_query_hint_evidence(file_stem, "file"))
    return hints
 def _local_query_hint_evidence(query: str, hint_source: str) -> Evidence:
    """Wrap a locally derived search query as zero-confidence fingerprint evidence.

    ``query`` and ``hint_source`` are stored verbatim in the evidence ``data``
    alongside a ``local_query_hint`` marker.
    """
    hint_data = {
        "local_query_hint": True,
        "query": query,
        "hint_source": hint_source,
    }
    return Evidence(
        source=EvidenceSource.FINGERPRINT,
        reason="Local submission search hint",
        confidence=0.0,
        data=hint_data,
    )
 def _query_history_status(evidence: list[Evidence]) -> str:
    if any(item.source == EvidenceSource.ENRICHMENT_FAILURE for item in evidence):
        return "failed"
    if evidence and all(item.source == EvidenceSource.SEARCH_SKIPPED for item in evidence):
        return "skipped"
    return "auto"
 def _default_evidence_contribution(payload: dict[str, Any]) -> bool:
    source = str(payload.get("source", ""))
    if source in {"llm", "failure"}:
        return False
    if bool(payload.get("faceCropSearch", False)):
        return False
    if _is_google_weak_label_payload(payload):
        return False
    return True
 def _watchlist_source_evidence(evidence: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Pick up to five evidence items to cite as the source of a watchlist entry.

    Items without a truthy ``id`` are never selected. Selection tiers, first
    non-empty tier wins:

    1. Items whose ``operatorStatus`` is ``"used_for_judgment"``.
    2. Items whose status is not non-contributing, whose source is neither
       ``llm`` nor ``failure``, and whose title is not the fingerprint notice.
    3. Any item whose status is not non-contributing.

    The chosen tier is ordered by ``_sorted_evidence_for_watchlist``.
    """
    identified = [entry for entry in evidence if entry.get("id")]
    judged = [
        entry
        for entry in identified
        if entry.get("operatorStatus") == "used_for_judgment"
    ]
    if judged:
        return _sorted_evidence_for_watchlist(judged)[:5]
    contributing = [
        entry
        for entry in identified
        if entry.get("operatorStatus") not in NON_CONTRIBUTING_OPERATOR_STATUSES
    ]
    preferred = [
        entry
        for entry in contributing
        if entry.get("source") not in {"llm", "failure"}
        and entry.get("title") != "Image fingerprints generated"
    ]
    chosen = preferred if preferred else contributing
    return _sorted_evidence_for_watchlist(chosen)[:5]
 def _sorted_evidence_for_watchlist(evidence: list[dict[str, Any]]) -> list[dict[str, Any]]:
    return sorted(
        evidence,
        key=lambda item: (
            1 if item.get("contributed", True) else 0,
            float(item.get("confidence", 0) or 0),
            str(item.get("retrievedAt", "")),
        ),
        reverse=True,
    )
 def _knowledge_type_value(value: str) -> str:
    normalized = value.strip() or "other"
    aliases = {
        "public_figure": "celebrity",
        "rejected_reference": "rejected_image",
    }
    return aliases.get(normalized, normalized)
 def _knowledge_entry_type(value: str) -> KnowledgeEntryType:
    """Convert a type string to ``KnowledgeEntryType``.

    The string is normalised with ``_knowledge_type_value`` first. Values the
    enum does not recognise fall back to ``KnowledgeEntryType.OTHER``.
    """
    canonical = _knowledge_type_value(value)
    try:
        return KnowledgeEntryType(canonical)
    except ValueError:
        return KnowledgeEntryType.OTHER