refactor: extract search-result similarity and candidate storage into mixin
Move the search-result image similarity, candidate-image storage, in-memory knowledge repository, and rescoring methods into StoreSearchCandidatesMixin; CopyrighterStore inherits it. Drop now-unused imports. sqlite_store.py 3072 -> 2358 lines (5333 -> 2358 overall, -56%). Behavior-preserving.
This commit is contained in:
parent
40501e13f1
commit
8e0a8c307d
2 changed files with 745 additions and 718 deletions
|
|
@ -1,10 +1,8 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import base64
|
|
||||||
import hashlib
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import re
|
|
||||||
import shutil
|
import shutil
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import threading
|
import threading
|
||||||
|
|
@ -52,15 +50,8 @@ from rights_filter.server.store_remote_fetch import (
|
||||||
_fetch_stylesheet_url_bytes,
|
_fetch_stylesheet_url_bytes,
|
||||||
_fetch_url_bytes,
|
_fetch_url_bytes,
|
||||||
)
|
)
|
||||||
from rights_filter.server.store_page_scrape import (
|
|
||||||
_content_has_comparable_image_fingerprint,
|
|
||||||
_extract_css_image_urls,
|
|
||||||
_extract_page_image_urls,
|
|
||||||
_extract_page_stylesheet_urls,
|
|
||||||
_normalized_remote_image_url,
|
|
||||||
_search_result_direct_image_urls,
|
|
||||||
)
|
|
||||||
from rights_filter.server.store_persistence import StorePersistenceMixin
|
from rights_filter.server.store_persistence import StorePersistenceMixin
|
||||||
|
from rights_filter.server.store_search_candidates import StoreSearchCandidatesMixin
|
||||||
from rights_filter.server.store_schema import (
|
from rights_filter.server.store_schema import (
|
||||||
_ensure_constrained_schema,
|
_ensure_constrained_schema,
|
||||||
_ensure_queue_schema,
|
_ensure_queue_schema,
|
||||||
|
|
@ -81,11 +72,7 @@ from rights_filter.server.store_serialization import (
|
||||||
_google_custom_image_query_signature,
|
_google_custom_image_query_signature,
|
||||||
_google_custom_web_query_signature,
|
_google_custom_web_query_signature,
|
||||||
_google_weak_label_title,
|
_google_weak_label_title,
|
||||||
_image_size_from_bytes,
|
|
||||||
_image_suffix_from_url,
|
|
||||||
_is_google_weak_label_payload,
|
_is_google_weak_label_payload,
|
||||||
_knowledge_entry_type,
|
|
||||||
_knowledge_provenance,
|
|
||||||
_knowledge_type_value,
|
_knowledge_type_value,
|
||||||
_naver_blog_query_signature,
|
_naver_blog_query_signature,
|
||||||
_naver_query_signature,
|
_naver_query_signature,
|
||||||
|
|
@ -94,28 +81,16 @@ from rights_filter.server.store_serialization import (
|
||||||
_provider_item_failed,
|
_provider_item_failed,
|
||||||
_provider_item_has_result,
|
_provider_item_has_result,
|
||||||
_query_history_status,
|
_query_history_status,
|
||||||
_safe_filename,
|
|
||||||
_safe_image_suffix,
|
|
||||||
_stable_id,
|
_stable_id,
|
||||||
_strip_html,
|
|
||||||
_submission_payload,
|
_submission_payload,
|
||||||
_submission_search_hint_evidence,
|
_submission_search_hint_evidence,
|
||||||
_timestamp_id,
|
_timestamp_id,
|
||||||
_validate_payload,
|
|
||||||
_validate_table,
|
|
||||||
_watchlist_source_evidence,
|
_watchlist_source_evidence,
|
||||||
)
|
)
|
||||||
from rights_filter.server.store_text import _text_list, _unique_texts
|
from rights_filter.server.store_text import _text_list, _unique_texts
|
||||||
from rights_filter.server.store_url_utils import (
|
|
||||||
_decoded_nested_url,
|
|
||||||
_is_http_url,
|
|
||||||
_url_has_image_format_hint,
|
|
||||||
_url_looks_like_image,
|
|
||||||
_url_path_has_image_suffix,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class CopyrighterStore(StorePersistenceMixin):
|
class CopyrighterStore(StorePersistenceMixin, StoreSearchCandidatesMixin):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
db_path: Path | str,
|
db_path: Path | str,
|
||||||
|
|
@ -2379,694 +2354,3 @@ class CopyrighterStore(StorePersistenceMixin):
|
||||||
f"auto text query batch: {', '.join(item['query'] for item in history_entries)}",
|
f"auto text query batch: {', '.join(item['query'] for item in history_entries)}",
|
||||||
)
|
)
|
||||||
|
|
||||||
def _knowledge_repository(self) -> InMemoryRightsFilterRepository:
|
|
||||||
repository = InMemoryRightsFilterRepository()
|
|
||||||
for payload in self._all("knowledge_entries"):
|
|
||||||
if not payload.get("active", True):
|
|
||||||
continue
|
|
||||||
if payload.get("entryStatus") == "excluded":
|
|
||||||
continue
|
|
||||||
sample_fingerprints = _text_list(
|
|
||||||
payload.get("sampleFingerprints", payload.get("sample_fingerprints", []))
|
|
||||||
)
|
|
||||||
if not sample_fingerprints:
|
|
||||||
continue
|
|
||||||
repository.save_knowledge_entry(
|
|
||||||
KnowledgeBaseEntry(
|
|
||||||
id=str(payload.get("id", "")),
|
|
||||||
entry_type=_knowledge_entry_type(str(payload.get("type", "other"))),
|
|
||||||
name=str(payload.get("name", "")),
|
|
||||||
provenance=_knowledge_provenance(str(payload.get("provenance", "manual"))),
|
|
||||||
aliases=_text_list(payload.get("aliases")),
|
|
||||||
related_keywords=_text_list(payload.get("keywords")),
|
|
||||||
policy_memo=str(payload.get("memo", "")),
|
|
||||||
sample_fingerprints=sample_fingerprints,
|
|
||||||
source_decision_id=str(payload.get("sourceDecision", "")) or None,
|
|
||||||
entry_status=str(payload.get("entryStatus", "confirmed")),
|
|
||||||
source_submission_id=str(payload.get("sourceSubmissionId", "")),
|
|
||||||
active=bool(payload.get("active", True)),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
return repository
|
|
||||||
|
|
||||||
def _sync_similar_reference_images(
|
|
||||||
self,
|
|
||||||
submission_id: str,
|
|
||||||
evidence: list[Evidence],
|
|
||||||
) -> None:
|
|
||||||
matched_entry_ids = [
|
|
||||||
str(item.data.get("knowledge_entry_id", ""))
|
|
||||||
for item in evidence
|
|
||||||
if item.source == EvidenceSource.FINGERPRINT and item.data.get("knowledge_entry_id")
|
|
||||||
]
|
|
||||||
if not matched_entry_ids:
|
|
||||||
return
|
|
||||||
|
|
||||||
submission = self._get("submissions", submission_id)
|
|
||||||
similar = list(submission.get("similar", []))
|
|
||||||
existing_assets = {str(item.get("asset", "")) for item in similar}
|
|
||||||
for entry_id in matched_entry_ids:
|
|
||||||
try:
|
|
||||||
entry = self._get("knowledge_entries", entry_id)
|
|
||||||
except KeyError:
|
|
||||||
continue
|
|
||||||
asset = str(entry.get("imageAsset", ""))
|
|
||||||
if not asset or asset in existing_assets:
|
|
||||||
continue
|
|
||||||
similar.append(
|
|
||||||
{
|
|
||||||
"asset": asset,
|
|
||||||
"label": f"{entry.get('name', entry_id)} / internal match",
|
|
||||||
}
|
|
||||||
)
|
|
||||||
existing_assets.add(asset)
|
|
||||||
submission["similar"] = similar
|
|
||||||
self._put("submissions", submission_id, submission)
|
|
||||||
|
|
||||||
def _sync_search_result_image_similarity(
|
|
||||||
self,
|
|
||||||
submission_id: str,
|
|
||||||
evidence: list[Evidence],
|
|
||||||
image_store: LocalSubmissionImageStore,
|
|
||||||
status: str = "active",
|
|
||||||
max_matches: int | None = None,
|
|
||||||
) -> list[Evidence]:
|
|
||||||
submission_fingerprint = self._submission_perceptual_fingerprint(
|
|
||||||
submission_id,
|
|
||||||
image_store,
|
|
||||||
)
|
|
||||||
if submission_fingerprint is None:
|
|
||||||
return []
|
|
||||||
|
|
||||||
if max_matches is None:
|
|
||||||
max_matches = self.provider_runtime.search_result_compare_limit
|
|
||||||
else:
|
|
||||||
max_matches = min(
|
|
||||||
max_matches,
|
|
||||||
self.provider_runtime.search_result_compare_limit,
|
|
||||||
)
|
|
||||||
if max_matches <= 0:
|
|
||||||
return []
|
|
||||||
|
|
||||||
similarity_evidence: list[Evidence] = []
|
|
||||||
for item in evidence:
|
|
||||||
if len(similarity_evidence) >= max_matches:
|
|
||||||
break
|
|
||||||
matches = self._search_result_image_similarity_evidence(
|
|
||||||
submission_id,
|
|
||||||
submission_fingerprint,
|
|
||||||
item,
|
|
||||||
)
|
|
||||||
if not matches:
|
|
||||||
continue
|
|
||||||
for match in matches:
|
|
||||||
if len(similarity_evidence) >= max_matches:
|
|
||||||
break
|
|
||||||
payload = _evidence_payload(submission_id, match)
|
|
||||||
payload["status"] = status
|
|
||||||
self._put("evidence", payload["id"], payload)
|
|
||||||
similarity_evidence.append(match)
|
|
||||||
if similarity_evidence:
|
|
||||||
self._rescore_submission(submission_id)
|
|
||||||
return similarity_evidence
|
|
||||||
|
|
||||||
def _can_compare_search_result_images(
|
|
||||||
self,
|
|
||||||
submission_id: str,
|
|
||||||
image_store: LocalSubmissionImageStore | None,
|
|
||||||
) -> bool:
|
|
||||||
if image_store is None:
|
|
||||||
return False
|
|
||||||
return self._submission_perceptual_fingerprint(submission_id, image_store) is not None
|
|
||||||
|
|
||||||
def _search_result_similarity_count(self, submission_id: str) -> int:
|
|
||||||
return sum(
|
|
||||||
1
|
|
||||||
for item in self._evidence_by_submission().get(submission_id, [])
|
|
||||||
if item.get("source") == "fingerprint"
|
|
||||||
and str(item.get("matchType") or "").startswith("search_result")
|
|
||||||
)
|
|
||||||
|
|
||||||
def _search_result_similarity_remaining_budget(
|
|
||||||
self,
|
|
||||||
submission_id: str,
|
|
||||||
image_store: LocalSubmissionImageStore | None,
|
|
||||||
) -> int:
|
|
||||||
if not self._can_compare_search_result_images(submission_id, image_store):
|
|
||||||
return 0
|
|
||||||
return max(
|
|
||||||
0,
|
|
||||||
self.provider_runtime.search_result_compare_limit
|
|
||||||
- self._search_result_similarity_count(submission_id),
|
|
||||||
)
|
|
||||||
|
|
||||||
def _submission_perceptual_fingerprint(
|
|
||||||
self,
|
|
||||||
submission_id: str,
|
|
||||||
image_store: LocalSubmissionImageStore,
|
|
||||||
) -> str | None:
|
|
||||||
try:
|
|
||||||
fingerprint = FingerprintService().fingerprints_for(
|
|
||||||
image_store.image_payload(submission_id).content
|
|
||||||
).perceptual
|
|
||||||
except Exception:
|
|
||||||
return None
|
|
||||||
if fingerprint.startswith("phash:unavailable:"):
|
|
||||||
return None
|
|
||||||
return fingerprint
|
|
||||||
|
|
||||||
def _search_result_image_similarity_evidence(
|
|
||||||
self,
|
|
||||||
submission_id: str,
|
|
||||||
submission_fingerprint: str,
|
|
||||||
source_evidence: Evidence,
|
|
||||||
) -> list[Evidence]:
|
|
||||||
if source_evidence.source not in {EvidenceSource.NAVER_SEARCH, EvidenceSource.WEB_DETECTION}:
|
|
||||||
return []
|
|
||||||
if source_evidence.data.get("weak_hint"):
|
|
||||||
return []
|
|
||||||
|
|
||||||
matches: list[Evidence] = []
|
|
||||||
|
|
||||||
for image_url in _unique_texts(
|
|
||||||
[
|
|
||||||
str(source_evidence.data.get("image_url", "")),
|
|
||||||
str(source_evidence.data.get("thumbnail_url", "")),
|
|
||||||
]
|
|
||||||
):
|
|
||||||
match = self._search_result_candidate_image_evidence(
|
|
||||||
submission_id,
|
|
||||||
submission_fingerprint,
|
|
||||||
source_evidence,
|
|
||||||
image_url,
|
|
||||||
match_type="search_result_image",
|
|
||||||
candidate_source="result_image_url",
|
|
||||||
)
|
|
||||||
if match is not None:
|
|
||||||
return [match]
|
|
||||||
|
|
||||||
for image_url in _search_result_direct_image_urls(source_evidence):
|
|
||||||
match = self._search_result_candidate_image_evidence(
|
|
||||||
submission_id,
|
|
||||||
submission_fingerprint,
|
|
||||||
source_evidence,
|
|
||||||
image_url,
|
|
||||||
match_type="search_result_page_image",
|
|
||||||
candidate_source="result_page_direct_image",
|
|
||||||
)
|
|
||||||
if match is not None:
|
|
||||||
return [match]
|
|
||||||
|
|
||||||
for image_url in _unique_texts(source_evidence.data.get("page_image_urls", [])):
|
|
||||||
match = self._search_result_candidate_image_evidence(
|
|
||||||
submission_id,
|
|
||||||
submission_fingerprint,
|
|
||||||
source_evidence,
|
|
||||||
image_url,
|
|
||||||
match_type="search_result_page_image",
|
|
||||||
candidate_source="provider_page_image",
|
|
||||||
)
|
|
||||||
if match is not None:
|
|
||||||
return [match]
|
|
||||||
|
|
||||||
for image_url, candidate_source in self._search_result_page_image_candidates(source_evidence):
|
|
||||||
match = self._search_result_candidate_image_evidence(
|
|
||||||
submission_id,
|
|
||||||
submission_fingerprint,
|
|
||||||
source_evidence,
|
|
||||||
image_url,
|
|
||||||
match_type="search_result_page_image",
|
|
||||||
candidate_source=candidate_source,
|
|
||||||
)
|
|
||||||
if match is not None:
|
|
||||||
return [match]
|
|
||||||
return matches
|
|
||||||
|
|
||||||
def _face_crop_search_result_similarity_evidence(
|
|
||||||
self,
|
|
||||||
submission_id: str,
|
|
||||||
crop_index: int,
|
|
||||||
crop: Any,
|
|
||||||
source_evidence: Evidence,
|
|
||||||
) -> list[Evidence]:
|
|
||||||
try:
|
|
||||||
crop_fingerprint = FingerprintService().fingerprints_for(crop.content).perceptual
|
|
||||||
except Exception:
|
|
||||||
return []
|
|
||||||
if crop_fingerprint.startswith("phash:unavailable:"):
|
|
||||||
return []
|
|
||||||
|
|
||||||
matches: list[Evidence] = []
|
|
||||||
|
|
||||||
extra_data = {
|
|
||||||
"face_crop_search": True,
|
|
||||||
"crop_index": crop_index,
|
|
||||||
"weak_hint": True,
|
|
||||||
"privacy_note": "얼굴 영역만 웹 탐지한 참고 근거이며 동일인 판정이 아닙니다.",
|
|
||||||
}
|
|
||||||
for image_url in _unique_texts(
|
|
||||||
[
|
|
||||||
str(source_evidence.data.get("image_url", "")),
|
|
||||||
str(source_evidence.data.get("thumbnail_url", "")),
|
|
||||||
]
|
|
||||||
):
|
|
||||||
match = self._search_result_candidate_image_evidence(
|
|
||||||
submission_id,
|
|
||||||
crop_fingerprint,
|
|
||||||
source_evidence,
|
|
||||||
image_url,
|
|
||||||
match_type="face_crop_search_result_image",
|
|
||||||
candidate_source="face_crop_result_image_url",
|
|
||||||
extra_data=extra_data,
|
|
||||||
)
|
|
||||||
if match is not None:
|
|
||||||
return [match]
|
|
||||||
|
|
||||||
for image_url in _search_result_direct_image_urls(source_evidence):
|
|
||||||
match = self._search_result_candidate_image_evidence(
|
|
||||||
submission_id,
|
|
||||||
crop_fingerprint,
|
|
||||||
source_evidence,
|
|
||||||
image_url,
|
|
||||||
match_type="face_crop_search_result_page_image",
|
|
||||||
candidate_source="face_crop_result_page_direct_image",
|
|
||||||
extra_data=extra_data,
|
|
||||||
)
|
|
||||||
if match is not None:
|
|
||||||
return [match]
|
|
||||||
|
|
||||||
for image_url in _unique_texts(source_evidence.data.get("page_image_urls", [])):
|
|
||||||
match = self._search_result_candidate_image_evidence(
|
|
||||||
submission_id,
|
|
||||||
crop_fingerprint,
|
|
||||||
source_evidence,
|
|
||||||
image_url,
|
|
||||||
match_type="face_crop_search_result_page_image",
|
|
||||||
candidate_source="face_crop_provider_page_image",
|
|
||||||
extra_data=extra_data,
|
|
||||||
)
|
|
||||||
if match is not None:
|
|
||||||
return [match]
|
|
||||||
|
|
||||||
for image_url, candidate_source in self._search_result_page_image_candidates(source_evidence):
|
|
||||||
match = self._search_result_candidate_image_evidence(
|
|
||||||
submission_id,
|
|
||||||
crop_fingerprint,
|
|
||||||
source_evidence,
|
|
||||||
image_url,
|
|
||||||
match_type="face_crop_search_result_page_image",
|
|
||||||
candidate_source=f"face_crop_{candidate_source}",
|
|
||||||
extra_data=extra_data,
|
|
||||||
)
|
|
||||||
if match is not None:
|
|
||||||
return [match]
|
|
||||||
return matches
|
|
||||||
|
|
||||||
def _search_result_candidate_image_evidence(
|
|
||||||
self,
|
|
||||||
submission_id: str,
|
|
||||||
submission_fingerprint: str,
|
|
||||||
source_evidence: Evidence,
|
|
||||||
image_url: str,
|
|
||||||
match_type: str,
|
|
||||||
candidate_source: str,
|
|
||||||
extra_data: dict[str, Any] | None = None,
|
|
||||||
) -> Evidence | None:
|
|
||||||
image_url = _normalized_remote_image_url(image_url)
|
|
||||||
result_url = str(
|
|
||||||
source_evidence.data.get("result_url", source_evidence.data.get("url", ""))
|
|
||||||
or image_url
|
|
||||||
)
|
|
||||||
image_id = _stable_id(
|
|
||||||
"searchimg",
|
|
||||||
submission_id,
|
|
||||||
str(source_evidence.source),
|
|
||||||
match_type,
|
|
||||||
image_url,
|
|
||||||
str(source_evidence.data.get("query", "")),
|
|
||||||
)
|
|
||||||
image_record = self._store_candidate_image(image_id, image_url, referer_url=result_url)
|
|
||||||
if not image_record:
|
|
||||||
return None
|
|
||||||
|
|
||||||
similarity = FingerprintService().similarity(
|
|
||||||
submission_fingerprint,
|
|
||||||
str(image_record["perceptualFingerprint"]),
|
|
||||||
)
|
|
||||||
if similarity < self.provider_runtime.search_result_similarity_threshold:
|
|
||||||
return None
|
|
||||||
return Evidence(
|
|
||||||
source=EvidenceSource.FINGERPRINT,
|
|
||||||
reason=f"Search result image similarity {similarity:.2f}",
|
|
||||||
confidence=similarity,
|
|
||||||
data={
|
|
||||||
"submission_id": submission_id,
|
|
||||||
"provider": source_evidence.data.get("provider", ""),
|
|
||||||
"query": source_evidence.data.get("query", ""),
|
|
||||||
"query_signature": source_evidence.data.get("query_signature", ""),
|
|
||||||
"query_strategy": source_evidence.data.get("query_strategy", ""),
|
|
||||||
"query_source": source_evidence.data.get("query_source", ""),
|
|
||||||
"url": result_url,
|
|
||||||
"result_url": result_url,
|
|
||||||
"image_url": image_record["asset"],
|
|
||||||
"thumbnail_url": image_record["asset"],
|
|
||||||
"remote_image_url": image_url,
|
|
||||||
"source_page_url": result_url,
|
|
||||||
"image_candidate_source": candidate_source,
|
|
||||||
"page_title": source_evidence.data.get("page_title", source_evidence.data.get("title", "")),
|
|
||||||
"match": match_type,
|
|
||||||
"similarity": similarity,
|
|
||||||
"source_evidence_ids": [_evidence_id(submission_id, source_evidence)],
|
|
||||||
"contributed": True,
|
|
||||||
**(extra_data or {}),
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
def _search_result_page_image_candidates(self, source_evidence: Evidence) -> list[tuple[str, str]]:
|
|
||||||
page_url = str(
|
|
||||||
source_evidence.data.get("result_url", source_evidence.data.get("url", ""))
|
|
||||||
)
|
|
||||||
limit = getattr(self.provider_runtime, "search_result_page_image_limit", 3)
|
|
||||||
if not page_url or limit <= 0 or not _is_http_url(page_url):
|
|
||||||
return []
|
|
||||||
if _url_looks_like_image(page_url):
|
|
||||||
return []
|
|
||||||
try:
|
|
||||||
content = self.page_fetcher(page_url)
|
|
||||||
except Exception:
|
|
||||||
return []
|
|
||||||
if _content_has_comparable_image_fingerprint(content):
|
|
||||||
return [(page_url, "result_page_direct_image")]
|
|
||||||
image_urls = _extract_page_image_urls(content, page_url, limit)
|
|
||||||
if len(image_urls) < limit:
|
|
||||||
image_urls.extend(
|
|
||||||
self._search_result_stylesheet_image_urls(
|
|
||||||
content,
|
|
||||||
page_url,
|
|
||||||
limit - len(image_urls),
|
|
||||||
)
|
|
||||||
)
|
|
||||||
return [(image_url, "html_page_image") for image_url in _unique_texts(image_urls)[:limit]]
|
|
||||||
|
|
||||||
def _search_result_stylesheet_image_urls(
|
|
||||||
self,
|
|
||||||
page_content: bytes,
|
|
||||||
page_url: str,
|
|
||||||
limit: int,
|
|
||||||
) -> list[str]:
|
|
||||||
if limit <= 0:
|
|
||||||
return []
|
|
||||||
image_urls: list[str] = []
|
|
||||||
for stylesheet_url in _extract_page_stylesheet_urls(page_content, page_url, limit):
|
|
||||||
try:
|
|
||||||
stylesheet_content = self.stylesheet_fetcher(stylesheet_url)
|
|
||||||
except Exception:
|
|
||||||
continue
|
|
||||||
for image_url in _extract_css_image_urls(stylesheet_content, stylesheet_url, limit - len(image_urls)):
|
|
||||||
image_urls.append(image_url)
|
|
||||||
if len(image_urls) >= limit:
|
|
||||||
return image_urls
|
|
||||||
return image_urls
|
|
||||||
|
|
||||||
def _search_result_page_image_urls(self, source_evidence: Evidence) -> list[str]:
|
|
||||||
return [
|
|
||||||
image_url
|
|
||||||
for image_url, _candidate_source in self._search_result_page_image_candidates(source_evidence)
|
|
||||||
]
|
|
||||||
|
|
||||||
def _increment_knowledge_contribution_counts(
|
|
||||||
self,
|
|
||||||
submission_id: str,
|
|
||||||
evidence: list[Evidence],
|
|
||||||
) -> None:
|
|
||||||
matched_entry_ids = _unique_texts(
|
|
||||||
str(item.data.get("knowledge_entry_id", ""))
|
|
||||||
for item in evidence
|
|
||||||
if item.source == EvidenceSource.FINGERPRINT
|
|
||||||
and item.data.get("knowledge_entry_status") == "watchlist"
|
|
||||||
and item.data.get("knowledge_entry_id")
|
|
||||||
)
|
|
||||||
for entry_id in matched_entry_ids:
|
|
||||||
try:
|
|
||||||
entry = self._get("knowledge_entries", entry_id)
|
|
||||||
except KeyError:
|
|
||||||
continue
|
|
||||||
if entry.get("entryStatus") != "watchlist":
|
|
||||||
continue
|
|
||||||
if str(entry.get("sourceSubmissionId", "")) == submission_id:
|
|
||||||
continue
|
|
||||||
matched_submission_ids = _text_list(entry.get("matchedSubmissionIds"))
|
|
||||||
if submission_id in matched_submission_ids:
|
|
||||||
continue
|
|
||||||
matched_submission_ids.append(submission_id)
|
|
||||||
entry["matchedSubmissionIds"] = matched_submission_ids
|
|
||||||
entry["contributionCount"] = int(entry.get("contributionCount", 0) or 0) + 1
|
|
||||||
entry["lastMatchedSubmissionId"] = submission_id
|
|
||||||
entry["lastMatchedAt"] = _now_label()
|
|
||||||
self._put("knowledge_entries", entry_id, entry)
|
|
||||||
|
|
||||||
def _store_manual_knowledge_image(
|
|
||||||
self,
|
|
||||||
entry_id: str,
|
|
||||||
image_payload: Any,
|
|
||||||
) -> dict[str, Any] | None:
|
|
||||||
if not image_payload:
|
|
||||||
return None
|
|
||||||
if not isinstance(image_payload, dict):
|
|
||||||
raise ValueError("knowledge image must be an object")
|
|
||||||
|
|
||||||
data = str(image_payload.get("data", ""))
|
|
||||||
if not data:
|
|
||||||
raise ValueError("knowledge image data required")
|
|
||||||
if "," in data and data.split(",", 1)[0].startswith("data:"):
|
|
||||||
data = data.split(",", 1)[1]
|
|
||||||
try:
|
|
||||||
content = base64.b64decode(data, validate=True)
|
|
||||||
except Exception as exc:
|
|
||||||
raise ValueError("knowledge image data must be base64") from exc
|
|
||||||
if not content:
|
|
||||||
raise ValueError("knowledge image is empty")
|
|
||||||
|
|
||||||
filename = str(image_payload.get("filename", "reference")).strip() or "reference"
|
|
||||||
suffix = _safe_image_suffix(filename, str(image_payload.get("content_type", "")))
|
|
||||||
safe_stem = _safe_filename(Path(filename).stem) or "reference"
|
|
||||||
target_name = f"{entry_id}-{safe_stem}{suffix}"
|
|
||||||
self.knowledge_image_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
root = self.knowledge_image_dir.resolve()
|
|
||||||
target = (root / target_name).resolve()
|
|
||||||
if target != root and root not in target.parents:
|
|
||||||
raise ValueError("knowledge image path points outside image store")
|
|
||||||
target.write_bytes(content)
|
|
||||||
|
|
||||||
width, height = _image_size_from_bytes(content)
|
|
||||||
fingerprints = FingerprintService().fingerprints_for(content)
|
|
||||||
return {
|
|
||||||
"asset": f"{self.knowledge_public_prefix}/{target_name}",
|
|
||||||
"perceptualFingerprint": fingerprints.perceptual,
|
|
||||||
"facts": {
|
|
||||||
"filename": filename,
|
|
||||||
"format": suffix.lstrip(".").upper(),
|
|
||||||
"size": f"{width} x {height}",
|
|
||||||
"fingerprints": 1,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
def _collection_candidates_from_evidence(
|
|
||||||
self,
|
|
||||||
query: str,
|
|
||||||
evidence: list[Evidence],
|
|
||||||
provider: str,
|
|
||||||
) -> list[dict[str, Any]]:
|
|
||||||
candidates: list[dict[str, Any]] = []
|
|
||||||
for item in evidence:
|
|
||||||
if item.source not in {EvidenceSource.NAVER_SEARCH, EvidenceSource.WEB_DETECTION}:
|
|
||||||
continue
|
|
||||||
if item.data.get("image_url"):
|
|
||||||
candidate = self._candidate_payload_from_evidence(
|
|
||||||
query,
|
|
||||||
item,
|
|
||||||
provider,
|
|
||||||
source_candidate_type="search_result_image",
|
|
||||||
)
|
|
||||||
if candidate is not None:
|
|
||||||
candidates.append(candidate)
|
|
||||||
continue
|
|
||||||
candidate_count = len(candidates)
|
|
||||||
for image_url in _unique_texts(item.data.get("page_image_urls", [])):
|
|
||||||
candidate = self._candidate_payload_from_evidence(
|
|
||||||
query,
|
|
||||||
item,
|
|
||||||
provider,
|
|
||||||
image_url=image_url,
|
|
||||||
thumbnail_url=image_url,
|
|
||||||
source_candidate_type="provider_page_image",
|
|
||||||
)
|
|
||||||
if candidate is not None:
|
|
||||||
candidates.append(candidate)
|
|
||||||
if len(candidates) > candidate_count:
|
|
||||||
continue
|
|
||||||
for image_url in _search_result_direct_image_urls(item):
|
|
||||||
candidate = self._candidate_payload_from_evidence(
|
|
||||||
query,
|
|
||||||
item,
|
|
||||||
provider,
|
|
||||||
image_url=image_url,
|
|
||||||
thumbnail_url=image_url,
|
|
||||||
source_candidate_type="result_page_direct_image",
|
|
||||||
)
|
|
||||||
if candidate is not None:
|
|
||||||
candidates.append(candidate)
|
|
||||||
if len(candidates) > candidate_count:
|
|
||||||
continue
|
|
||||||
for image_url, source_candidate_type in self._search_result_page_image_candidates(item):
|
|
||||||
candidate = self._candidate_payload_from_evidence(
|
|
||||||
query,
|
|
||||||
item,
|
|
||||||
provider,
|
|
||||||
image_url=image_url,
|
|
||||||
thumbnail_url=image_url,
|
|
||||||
source_candidate_type=source_candidate_type,
|
|
||||||
)
|
|
||||||
if candidate is not None:
|
|
||||||
candidates.append(candidate)
|
|
||||||
break
|
|
||||||
return candidates
|
|
||||||
|
|
||||||
def _candidate_payload_from_evidence(
|
|
||||||
self,
|
|
||||||
query: str,
|
|
||||||
evidence: Evidence,
|
|
||||||
provider: str = "naver",
|
|
||||||
image_url: str | None = None,
|
|
||||||
thumbnail_url: str | None = None,
|
|
||||||
source_candidate_type: str = "search_result_image",
|
|
||||||
) -> dict[str, Any] | None:
|
|
||||||
image_url = _normalized_remote_image_url(
|
|
||||||
str(image_url if image_url is not None else evidence.data.get("image_url", ""))
|
|
||||||
)
|
|
||||||
thumbnail_url = _normalized_remote_image_url(
|
|
||||||
str(thumbnail_url if thumbnail_url is not None else evidence.data.get("thumbnail_url", ""))
|
|
||||||
)
|
|
||||||
result_url = str(evidence.data.get("result_url", ""))
|
|
||||||
candidate_id = _stable_id("cand", provider, source_candidate_type, query, image_url, thumbnail_url, result_url)
|
|
||||||
image_record = None
|
|
||||||
stored_image_url = ""
|
|
||||||
for candidate_url in _unique_texts([image_url, thumbnail_url]):
|
|
||||||
image_record = self._store_candidate_image(
|
|
||||||
candidate_id,
|
|
||||||
candidate_url,
|
|
||||||
referer_url=result_url,
|
|
||||||
)
|
|
||||||
if image_record is not None:
|
|
||||||
stored_image_url = candidate_url
|
|
||||||
break
|
|
||||||
if image_record is None:
|
|
||||||
return None
|
|
||||||
display_image_url = stored_image_url or image_url
|
|
||||||
return {
|
|
||||||
"id": candidate_id,
|
|
||||||
"provider": provider,
|
|
||||||
"query": query,
|
|
||||||
"title": _strip_html(str(evidence.data.get("title", ""))),
|
|
||||||
"status": "candidate",
|
|
||||||
"rank": evidence.data.get("rank", ""),
|
|
||||||
"imageUrl": display_image_url,
|
|
||||||
"thumbnailUrl": thumbnail_url,
|
|
||||||
"resultUrl": result_url,
|
|
||||||
"sourceUrl": result_url or display_image_url,
|
|
||||||
"sourceCandidateType": source_candidate_type,
|
|
||||||
"imageAsset": image_record["asset"],
|
|
||||||
"sampleFingerprints": [image_record["perceptualFingerprint"]],
|
|
||||||
"imageFacts": image_record["facts"],
|
|
||||||
"collectedAt": _now_label(),
|
|
||||||
"collectedEpoch": int(datetime.now().timestamp()),
|
|
||||||
"promotedKnowledgeId": "",
|
|
||||||
}
|
|
||||||
|
|
||||||
def _store_candidate_image(
|
|
||||||
self,
|
|
||||||
candidate_id: str,
|
|
||||||
url: str,
|
|
||||||
referer_url: str = "",
|
|
||||||
) -> dict[str, Any] | None:
|
|
||||||
if not url:
|
|
||||||
return None
|
|
||||||
suffix = _image_suffix_from_url(url)
|
|
||||||
target_name = f"{candidate_id}{suffix}"
|
|
||||||
root = self.collection_image_dir.resolve()
|
|
||||||
target = (root / target_name).resolve()
|
|
||||||
if target != root and root not in target.parents:
|
|
||||||
raise ValueError("candidate image path points outside image store")
|
|
||||||
if target.exists() and target.is_file():
|
|
||||||
try:
|
|
||||||
record = self._candidate_image_record_from_content(
|
|
||||||
target_name,
|
|
||||||
url,
|
|
||||||
suffix,
|
|
||||||
target.read_bytes(),
|
|
||||||
)
|
|
||||||
except Exception:
|
|
||||||
record = None
|
|
||||||
if record is not None:
|
|
||||||
return record
|
|
||||||
try:
|
|
||||||
content = self._fetch_candidate_image_content(url, referer_url)
|
|
||||||
except Exception:
|
|
||||||
return None
|
|
||||||
image_record = self._candidate_image_record_from_content(
|
|
||||||
target_name,
|
|
||||||
url,
|
|
||||||
suffix,
|
|
||||||
content,
|
|
||||||
)
|
|
||||||
if image_record is None:
|
|
||||||
return None
|
|
||||||
self.collection_image_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
target.write_bytes(content)
|
|
||||||
return image_record
|
|
||||||
|
|
||||||
def _candidate_image_record_from_content(
|
|
||||||
self,
|
|
||||||
target_name: str,
|
|
||||||
url: str,
|
|
||||||
suffix: str,
|
|
||||||
content: bytes,
|
|
||||||
) -> dict[str, Any] | None:
|
|
||||||
if not content:
|
|
||||||
return None
|
|
||||||
width, height = _image_size_from_bytes(content)
|
|
||||||
fingerprints = FingerprintService().fingerprints_for(content)
|
|
||||||
if fingerprints.perceptual.startswith("phash:unavailable:"):
|
|
||||||
return None
|
|
||||||
return {
|
|
||||||
"asset": f"{self.collection_public_prefix}/{target_name}",
|
|
||||||
"perceptualFingerprint": fingerprints.perceptual,
|
|
||||||
"facts": {
|
|
||||||
"source": url,
|
|
||||||
"format": suffix.lstrip(".").upper(),
|
|
||||||
"size": f"{width} x {height}",
|
|
||||||
"fingerprints": 1,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
def _fetch_candidate_image_content(self, url: str, referer_url: str = "") -> bytes:
|
|
||||||
if self._custom_candidate_image_fetcher is not None:
|
|
||||||
return self._custom_candidate_image_fetcher(url)
|
|
||||||
return _fetch_url_bytes(url, referer_url=referer_url)
|
|
||||||
|
|
||||||
def _rescore_submission(self, submission_id: str) -> None:
|
|
||||||
submission = self._get("submissions", submission_id)
|
|
||||||
evidence = [
|
|
||||||
_domain_evidence_from_ui(item)
|
|
||||||
for item in self._evidence_for_submission(submission_id)
|
|
||||||
]
|
|
||||||
score = RiskScorer().score(evidence)
|
|
||||||
submission["riskScore"] = score.score
|
|
||||||
submission["riskBand"] = score.band
|
|
||||||
submission["reasons"] = score.reasons or ["분석 근거 없음"]
|
|
||||||
self._put("submissions", submission_id, submission)
|
|
||||||
|
|
||||||
def _rescore_all_submissions(self, queue_id: str | None = None) -> None:
|
|
||||||
for submission in self._all("submissions", queue_id=queue_id):
|
|
||||||
self._rescore_submission(str(submission["id"]))
|
|
||||||
|
|
||||||
|
|
|
||||||
743
src/rights_filter/server/store_search_candidates.py
Normal file
743
src/rights_filter/server/store_search_candidates.py
Normal file
|
|
@ -0,0 +1,743 @@
|
||||||
|
"""Search-result image similarity, candidate-image storage, the in-memory
|
||||||
|
knowledge repository, and rescoring — as a mixin for CopyrighterStore.
|
||||||
|
|
||||||
|
Mixed into CopyrighterStore; relies on persistence methods (self._put/_get/...),
|
||||||
|
self.* attributes, and the extracted helper modules. Behavior unchanged.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from rights_filter.analysis.fingerprints import FingerprintService
|
||||||
|
from rights_filter.analysis.risk_scoring import RiskScorer
|
||||||
|
from rights_filter.domain.records import (
|
||||||
|
Evidence,
|
||||||
|
EvidenceSource,
|
||||||
|
InMemoryRightsFilterRepository,
|
||||||
|
KnowledgeBaseEntry,
|
||||||
|
)
|
||||||
|
from rights_filter.server.image_store import LocalSubmissionImageStore
|
||||||
|
from rights_filter.server.store_page_scrape import (
|
||||||
|
_content_has_comparable_image_fingerprint,
|
||||||
|
_extract_css_image_urls,
|
||||||
|
_extract_page_image_urls,
|
||||||
|
_extract_page_stylesheet_urls,
|
||||||
|
_normalized_remote_image_url,
|
||||||
|
_search_result_direct_image_urls,
|
||||||
|
)
|
||||||
|
from rights_filter.server.store_remote_fetch import _fetch_url_bytes
|
||||||
|
from rights_filter.server.store_serialization import (
|
||||||
|
_domain_evidence_from_ui,
|
||||||
|
_evidence_id,
|
||||||
|
_evidence_payload,
|
||||||
|
_image_size_from_bytes,
|
||||||
|
_image_suffix_from_url,
|
||||||
|
_knowledge_entry_type,
|
||||||
|
_knowledge_provenance,
|
||||||
|
_now_label,
|
||||||
|
_safe_filename,
|
||||||
|
_safe_image_suffix,
|
||||||
|
_stable_id,
|
||||||
|
_strip_html,
|
||||||
|
)
|
||||||
|
from rights_filter.server.store_text import _text_list, _unique_texts
|
||||||
|
from rights_filter.server.store_url_utils import _is_http_url, _url_looks_like_image
|
||||||
|
|
||||||
|
|
||||||
|
class StoreSearchCandidatesMixin:
|
||||||
|
def _knowledge_repository(self) -> InMemoryRightsFilterRepository:
    """Build an in-memory repository from the stored knowledge entries.

    Entries that are inactive, marked ``excluded``, or carry no sample
    fingerprints are left out. Fingerprints are read from
    ``sampleFingerprints`` with ``sample_fingerprints`` as the fallback key.
    """
    repo = InMemoryRightsFilterRepository()
    for row in self._all("knowledge_entries"):
        if not row.get("active", True) or row.get("entryStatus") == "excluded":
            continue
        fingerprints = _text_list(
            row.get("sampleFingerprints", row.get("sample_fingerprints", []))
        )
        if not fingerprints:
            continue
        entry = KnowledgeBaseEntry(
            id=str(row.get("id", "")),
            entry_type=_knowledge_entry_type(str(row.get("type", "other"))),
            name=str(row.get("name", "")),
            provenance=_knowledge_provenance(str(row.get("provenance", "manual"))),
            aliases=_text_list(row.get("aliases")),
            related_keywords=_text_list(row.get("keywords")),
            policy_memo=str(row.get("memo", "")),
            sample_fingerprints=fingerprints,
            source_decision_id=str(row.get("sourceDecision", "")) or None,
            entry_status=str(row.get("entryStatus", "confirmed")),
            source_submission_id=str(row.get("sourceSubmissionId", "")),
            active=bool(row.get("active", True)),
        )
        repo.save_knowledge_entry(entry)
    return repo
|
||||||
|
|
||||||
|
def _sync_similar_reference_images(
    self,
    submission_id: str,
    evidence: list[Evidence],
) -> None:
    """Add reference images of matched knowledge entries to a submission.

    Fingerprint evidence carrying a ``knowledge_entry_id`` selects the
    entries; each entry's ``imageAsset`` is appended to the submission's
    ``similar`` list unless that asset is already listed. Nothing is read or
    written when no evidence item references a knowledge entry.
    """
    entry_ids: list[str] = []
    for finding in evidence:
        if finding.source != EvidenceSource.FINGERPRINT:
            continue
        if finding.data.get("knowledge_entry_id"):
            entry_ids.append(str(finding.data.get("knowledge_entry_id", "")))
    if not entry_ids:
        return

    record = self._get("submissions", submission_id)
    gallery = list(record.get("similar", []))
    known_assets = {str(shown.get("asset", "")) for shown in gallery}
    for entry_id in entry_ids:
        try:
            entry = self._get("knowledge_entries", entry_id)
        except KeyError:
            # The matched entry no longer exists; nothing to show for it.
            continue
        asset = str(entry.get("imageAsset", ""))
        if asset and asset not in known_assets:
            gallery.append(
                {
                    "asset": asset,
                    "label": f"{entry.get('name', entry_id)} / internal match",
                }
            )
            known_assets.add(asset)
    record["similar"] = gallery
    self._put("submissions", submission_id, record)
|
||||||
|
|
||||||
|
def _sync_search_result_image_similarity(
    self,
    submission_id: str,
    evidence: list[Evidence],
    image_store: LocalSubmissionImageStore,
    status: str = "active",
    max_matches: int | None = None,
) -> list[Evidence]:
    """Persist image-similarity evidence derived from search results.

    Returns the similarity evidence that was stored (possibly empty). The
    number of stored matches never exceeds the runtime's
    ``search_result_compare_limit``; ``max_matches`` can only lower that cap.
    The submission is rescored when at least one match was stored.
    """
    fingerprint = self._submission_perceptual_fingerprint(submission_id, image_store)
    if fingerprint is None:
        return []

    configured_cap = self.provider_runtime.search_result_compare_limit
    budget = configured_cap if max_matches is None else min(max_matches, configured_cap)
    if budget <= 0:
        return []

    accepted: list[Evidence] = []
    for finding in evidence:
        if len(accepted) >= budget:
            break
        found = self._search_result_image_similarity_evidence(
            submission_id,
            fingerprint,
            finding,
        )
        for match in found or ():
            if len(accepted) >= budget:
                break
            row = _evidence_payload(submission_id, match)
            row["status"] = status
            self._put("evidence", row["id"], row)
            accepted.append(match)
    if accepted:
        self._rescore_submission(submission_id)
    return accepted
|
||||||
|
|
||||||
|
def _can_compare_search_result_images(
    self,
    submission_id: str,
    image_store: LocalSubmissionImageStore | None,
) -> bool:
    """Tell whether the submission has a usable perceptual fingerprint.

    Always ``False`` without an image store; otherwise ``True`` exactly when
    ``_submission_perceptual_fingerprint`` yields a value.
    """
    return (
        image_store is not None
        and self._submission_perceptual_fingerprint(submission_id, image_store) is not None
    )
|
||||||
|
|
||||||
|
def _search_result_similarity_count(self, submission_id: str) -> int:
    """Count stored fingerprint evidence that came from search results.

    An evidence row counts when its ``source`` is ``"fingerprint"`` and its
    ``matchType`` starts with ``"search_result"``.
    """
    total = 0
    for row in self._evidence_by_submission().get(submission_id, []):
        if row.get("source") != "fingerprint":
            continue
        match_type = str(row.get("matchType") or "")
        if match_type.startswith("search_result"):
            total += 1
    return total
|
||||||
|
|
||||||
|
def _search_result_similarity_remaining_budget(
    self,
    submission_id: str,
    image_store: LocalSubmissionImageStore | None,
) -> int:
    """Return how many more search-result similarity matches may be stored.

    Zero when the submission image cannot be compared; otherwise the
    configured compare limit minus the matches already stored, floored at 0.
    """
    if not self._can_compare_search_result_images(submission_id, image_store):
        return 0
    compare_limit = self.provider_runtime.search_result_compare_limit
    remaining = compare_limit - self._search_result_similarity_count(submission_id)
    return max(0, remaining)
|
||||||
|
|
||||||
|
def _submission_perceptual_fingerprint(
    self,
    submission_id: str,
    image_store: LocalSubmissionImageStore,
) -> str | None:
    """Return the perceptual fingerprint of the stored submission image.

    ``None`` is returned when loading or fingerprinting the image fails for
    any reason, or when the fingerprint is a ``phash:unavailable:`` marker.
    """
    try:
        service = FingerprintService()
        image_bytes = image_store.image_payload(submission_id).content
        perceptual = service.fingerprints_for(image_bytes).perceptual
    except Exception:
        # Best effort: a missing or unreadable image just disables comparison.
        return None
    return None if perceptual.startswith("phash:unavailable:") else perceptual
|
||||||
|
|
||||||
|
def _search_result_image_similarity_evidence(
    self,
    submission_id: str,
    submission_fingerprint: str,
    source_evidence: Evidence,
    image_url: None = None,
) -> list[Evidence]:
    """Find the first search-result image similar to the submission.

    Only Naver search and web detection evidence that is not flagged
    ``weak_hint`` is considered. Candidate images are tried in this order and
    the first one that produces evidence is returned as a one-element list:
    the result's own image/thumbnail URL, direct image URLs derived from the
    result, provider-supplied ``page_image_urls``, and finally images found
    on the result page. An empty list means nothing matched.
    """
    if source_evidence.source not in {EvidenceSource.NAVER_SEARCH, EvidenceSource.WEB_DETECTION}:
        return []
    if source_evidence.data.get("weak_hint"):
        return []

    def candidate_plan():
        # Lazy on purpose: later, more expensive sources (page fetches) are
        # only consulted once the earlier ones are exhausted.
        own_urls = [
            str(source_evidence.data.get("image_url", "")),
            str(source_evidence.data.get("thumbnail_url", "")),
        ]
        for url in _unique_texts(own_urls):
            yield url, "search_result_image", "result_image_url"
        for url in _search_result_direct_image_urls(source_evidence):
            yield url, "search_result_page_image", "result_page_direct_image"
        for url in _unique_texts(source_evidence.data.get("page_image_urls", [])):
            yield url, "search_result_page_image", "provider_page_image"
        for url, origin in self._search_result_page_image_candidates(source_evidence):
            yield url, "search_result_page_image", origin

    for candidate_url, kind, origin in candidate_plan():
        match = self._search_result_candidate_image_evidence(
            submission_id,
            submission_fingerprint,
            source_evidence,
            candidate_url,
            match_type=kind,
            candidate_source=origin,
        )
        if match is not None:
            return [match]
    return []
|
||||||
|
|
||||||
|
def _face_crop_search_result_similarity_evidence(
    self,
    submission_id: str,
    crop_index: int,
    crop: Any,
    source_evidence: Evidence,
) -> list[Evidence]:
    """Find the first search-result image similar to one face crop.

    The crop's perceptual fingerprint is compared against candidate images in
    the same order as for whole-image matching. Every produced evidence item
    is tagged as a weak, face-crop based hint. Returns a one-element list for
    the first match, or an empty list when the crop cannot be fingerprinted
    or nothing matches.
    """
    try:
        crop_fingerprint = FingerprintService().fingerprints_for(crop.content).perceptual
    except Exception:
        return []
    if crop_fingerprint.startswith("phash:unavailable:"):
        return []

    crop_tags = {
        "face_crop_search": True,
        "crop_index": crop_index,
        "weak_hint": True,
        "privacy_note": "얼굴 영역만 웹 탐지한 참고 근거이며 동일인 판정이 아닙니다.",
    }

    def candidate_plan():
        # Lazy on purpose: page fetches only happen after cheaper sources fail.
        own_urls = [
            str(source_evidence.data.get("image_url", "")),
            str(source_evidence.data.get("thumbnail_url", "")),
        ]
        for url in _unique_texts(own_urls):
            yield url, "face_crop_search_result_image", "face_crop_result_image_url"
        for url in _search_result_direct_image_urls(source_evidence):
            yield url, "face_crop_search_result_page_image", "face_crop_result_page_direct_image"
        for url in _unique_texts(source_evidence.data.get("page_image_urls", [])):
            yield url, "face_crop_search_result_page_image", "face_crop_provider_page_image"
        for url, origin in self._search_result_page_image_candidates(source_evidence):
            yield url, "face_crop_search_result_page_image", f"face_crop_{origin}"

    for candidate_url, kind, origin in candidate_plan():
        match = self._search_result_candidate_image_evidence(
            submission_id,
            crop_fingerprint,
            source_evidence,
            candidate_url,
            match_type=kind,
            candidate_source=origin,
            extra_data=crop_tags,
        )
        if match is not None:
            return [match]
    return []
|
||||||
|
|
||||||
|
def _search_result_candidate_image_evidence(
    self,
    submission_id: str,
    submission_fingerprint: str,
    source_evidence: Evidence,
    image_url: str,
    match_type: str,
    candidate_source: str,
    extra_data: dict[str, Any] | None = None,
) -> Evidence | None:
    """Compare one candidate image with a fingerprint and build evidence.

    The candidate is stored locally via ``_store_candidate_image`` and its
    perceptual fingerprint compared with ``submission_fingerprint``. Returns
    fingerprint evidence when the similarity reaches the runtime threshold,
    and ``None`` when the image cannot be stored or is not similar enough.
    ``extra_data`` entries are merged last and may override the defaults.
    """
    origin = source_evidence.data
    image_url = _normalized_remote_image_url(image_url)
    # Fall back to the image itself when the result has no page URL.
    result_url = str(origin.get("result_url", origin.get("url", "")) or image_url)
    image_id = _stable_id(
        "searchimg",
        submission_id,
        str(source_evidence.source),
        match_type,
        image_url,
        str(origin.get("query", "")),
    )
    stored = self._store_candidate_image(image_id, image_url, referer_url=result_url)
    if not stored:
        return None

    score = FingerprintService().similarity(
        submission_fingerprint,
        str(stored["perceptualFingerprint"]),
    )
    if score < self.provider_runtime.search_result_similarity_threshold:
        return None

    reason = f"Search result image similarity {score:.2f}"
    details = {
        "submission_id": submission_id,
        "provider": origin.get("provider", ""),
        "query": origin.get("query", ""),
        "query_signature": origin.get("query_signature", ""),
        "query_strategy": origin.get("query_strategy", ""),
        "query_source": origin.get("query_source", ""),
        "url": result_url,
        "result_url": result_url,
        # The locally stored copy is what gets displayed.
        "image_url": stored["asset"],
        "thumbnail_url": stored["asset"],
        "remote_image_url": image_url,
        "source_page_url": result_url,
        "image_candidate_source": candidate_source,
        "page_title": origin.get("page_title", origin.get("title", "")),
        "match": match_type,
        "similarity": score,
        "source_evidence_ids": [_evidence_id(submission_id, source_evidence)],
        "contributed": True,
    }
    details.update(extra_data or {})
    return Evidence(
        source=EvidenceSource.FINGERPRINT,
        reason=reason,
        confidence=score,
        data=details,
    )
|
||||||
|
|
||||||
|
def _search_result_page_image_candidates(self, source_evidence: Evidence) -> list[tuple[str, str]]:
    """Collect ``(image_url, candidate_source)`` pairs from a result page.

    Nothing is fetched when the result has no HTTP page URL, the page limit
    (``search_result_page_image_limit``, default 3) is not positive, or the
    URL already looks like an image. When the fetched content is itself a
    comparable image, the page URL is returned as the only candidate;
    otherwise images found in the page, topped up from its stylesheets, are
    returned, capped at the limit. Fetch failures yield an empty list.
    """
    info = source_evidence.data
    page_url = str(info.get("result_url", info.get("url", "")))
    limit = getattr(self.provider_runtime, "search_result_page_image_limit", 3)
    if (
        not page_url
        or limit <= 0
        or not _is_http_url(page_url)
        or _url_looks_like_image(page_url)
    ):
        return []
    try:
        content = self.page_fetcher(page_url)
    except Exception:
        return []
    if _content_has_comparable_image_fingerprint(content):
        return [(page_url, "result_page_direct_image")]

    found = _extract_page_image_urls(content, page_url, limit)
    shortfall = limit - len(found)
    if shortfall > 0:
        found += self._search_result_stylesheet_image_urls(content, page_url, shortfall)
    pairs = []
    for image_url in _unique_texts(found)[:limit]:
        pairs.append((image_url, "html_page_image"))
    return pairs
|
||||||
|
|
||||||
|
def _search_result_stylesheet_image_urls(
    self,
    page_content: bytes,
    page_url: str,
    limit: int,
) -> list[str]:
    """Collect up to ``limit`` image URLs from a page's stylesheets.

    Stylesheets that cannot be fetched are skipped. Collection stops as soon
    as ``limit`` URLs have been gathered; a non-positive limit yields ``[]``.
    """
    if limit <= 0:
        return []
    collected: list[str] = []
    for sheet_url in _extract_page_stylesheet_urls(page_content, page_url, limit):
        try:
            sheet_content = self.stylesheet_fetcher(sheet_url)
        except Exception:
            continue
        still_needed = limit - len(collected)
        for css_image_url in _extract_css_image_urls(sheet_content, sheet_url, still_needed):
            collected.append(css_image_url)
            if len(collected) >= limit:
                return collected
    return collected
|
||||||
|
|
||||||
|
def _search_result_page_image_urls(self, source_evidence: Evidence) -> list[str]:
    """Return only the URLs of the result-page image candidates."""
    urls: list[str] = []
    for image_url, _origin in self._search_result_page_image_candidates(source_evidence):
        urls.append(image_url)
    return urls
|
||||||
|
|
||||||
|
def _increment_knowledge_contribution_counts(
    self,
    submission_id: str,
    evidence: list[Evidence],
) -> None:
    """Record that a submission matched watchlist knowledge entries.

    For each distinct watchlist entry referenced by fingerprint evidence, the
    entry's contribution counter and last-match fields are updated once per
    submission. Entries that no longer exist, are no longer on the watchlist,
    originate from this very submission, or already list it are skipped.
    """
    referenced_ids = []
    for finding in evidence:
        details = finding.data
        if (
            finding.source == EvidenceSource.FINGERPRINT
            and details.get("knowledge_entry_status") == "watchlist"
            and details.get("knowledge_entry_id")
        ):
            referenced_ids.append(str(details.get("knowledge_entry_id", "")))

    for entry_id in _unique_texts(referenced_ids):
        try:
            entry = self._get("knowledge_entries", entry_id)
        except KeyError:
            continue
        still_watchlisted = entry.get("entryStatus") == "watchlist"
        is_own_source = str(entry.get("sourceSubmissionId", "")) == submission_id
        if not still_watchlisted or is_own_source:
            continue
        seen_submissions = _text_list(entry.get("matchedSubmissionIds"))
        if submission_id in seen_submissions:
            continue
        seen_submissions.append(submission_id)
        entry["matchedSubmissionIds"] = seen_submissions
        entry["contributionCount"] = int(entry.get("contributionCount", 0) or 0) + 1
        entry["lastMatchedSubmissionId"] = submission_id
        entry["lastMatchedAt"] = _now_label()
        self._put("knowledge_entries", entry_id, entry)
|
||||||
|
|
||||||
|
def _store_manual_knowledge_image(
    self,
    entry_id: str,
    image_payload: Any,
) -> dict[str, Any] | None:
    """Decode and store a manually uploaded knowledge reference image.

    Args:
        entry_id: Knowledge entry the image belongs to; used as the file
            name prefix.
        image_payload: Mapping with base64 ``data`` (optionally a ``data:``
            URL) plus optional ``filename`` and ``content_type``. A falsy
            payload means "no image".

    Returns:
        ``None`` for a falsy payload, otherwise a record with the public
        ``asset`` path, the ``perceptualFingerprint`` and image ``facts``.

    Raises:
        ValueError: If the payload is not a mapping, carries no data, is not
            valid base64, decodes to nothing, or resolves outside the
            knowledge image directory.
    """
    if not image_payload:
        return None
    if not isinstance(image_payload, dict):
        raise ValueError("knowledge image must be an object")

    # ``or ""`` keeps an explicit null from becoming the literal text "None",
    # which happens to be valid base64 and would be stored as a bogus image.
    data = str(image_payload.get("data") or "")
    if not data:
        raise ValueError("knowledge image data required")
    header, separator, encoded = data.partition(",")
    if separator and header.startswith("data:"):
        data = encoded
    try:
        content = base64.b64decode(data, validate=True)
    except ValueError as exc:
        # binascii.Error is a ValueError subclass, so this covers bad
        # characters, bad padding and non-ASCII input alike.
        raise ValueError("knowledge image data must be base64") from exc
    if not content:
        raise ValueError("knowledge image is empty")

    filename = str(image_payload.get("filename") or "reference").strip() or "reference"
    suffix = _safe_image_suffix(filename, str(image_payload.get("content_type") or ""))
    safe_stem = _safe_filename(Path(filename).stem) or "reference"
    target_name = f"{entry_id}-{safe_stem}{suffix}"
    self.knowledge_image_dir.mkdir(parents=True, exist_ok=True)
    root = self.knowledge_image_dir.resolve()
    target = (root / target_name).resolve()
    if target != root and root not in target.parents:
        raise ValueError("knowledge image path points outside image store")

    # Inspect the image before writing it so a failure here does not leave an
    # orphan file behind (same order as the candidate image storage).
    width, height = _image_size_from_bytes(content)
    fingerprints = FingerprintService().fingerprints_for(content)
    target.write_bytes(content)

    return {
        "asset": f"{self.knowledge_public_prefix}/{target_name}",
        "perceptualFingerprint": fingerprints.perceptual,
        "facts": {
            "filename": filename,
            "format": suffix.lstrip(".").upper(),
            "size": f"{width} x {height}",
            "fingerprints": 1,
        },
    }
|
||||||
|
|
||||||
|
def _collection_candidates_from_evidence(
    self,
    query: str,
    evidence: list[Evidence],
    provider: str,
) -> list[dict[str, Any]]:
    """Turn search evidence into storable collection candidate payloads.

    Only Naver search and web detection evidence is considered. For each
    evidence item the image sources are tried as a fallback chain, and a
    later source is consulted only when the earlier ones produced nothing:
    the result's own ``image_url``, provider-supplied ``page_image_urls``,
    direct image URLs derived from the result, and finally images discovered
    on the result page (at most one of those is kept).
    """
    candidates: list[dict[str, Any]] = []
    for item in evidence:
        if item.source not in {EvidenceSource.NAVER_SEARCH, EvidenceSource.WEB_DETECTION}:
            continue
        if item.data.get("image_url"):
            candidate = self._candidate_payload_from_evidence(
                query,
                item,
                provider,
                source_candidate_type="search_result_image",
            )
            if candidate is not None:
                candidates.append(candidate)
                continue
        # Remember the current length to detect whether a fallback source
        # added anything for this evidence item.
        candidate_count = len(candidates)
        for image_url in _unique_texts(item.data.get("page_image_urls", [])):
            candidate = self._candidate_payload_from_evidence(
                query,
                item,
                provider,
                image_url=image_url,
                thumbnail_url=image_url,
                source_candidate_type="provider_page_image",
            )
            if candidate is not None:
                candidates.append(candidate)
        if len(candidates) > candidate_count:
            continue
        for image_url in _search_result_direct_image_urls(item):
            candidate = self._candidate_payload_from_evidence(
                query,
                item,
                provider,
                image_url=image_url,
                thumbnail_url=image_url,
                source_candidate_type="result_page_direct_image",
            )
            if candidate is not None:
                candidates.append(candidate)
        if len(candidates) > candidate_count:
            continue
        # Last resort: images discovered on the result page itself.
        for image_url, source_candidate_type in self._search_result_page_image_candidates(item):
            candidate = self._candidate_payload_from_evidence(
                query,
                item,
                provider,
                image_url=image_url,
                thumbnail_url=image_url,
                source_candidate_type=source_candidate_type,
            )
            if candidate is not None:
                candidates.append(candidate)
                # Keep only the first page image that could be stored.
                break
    return candidates
|
||||||
|
|
||||||
|
def _candidate_payload_from_evidence(
    self,
    query: str,
    evidence: Evidence,
    provider: str = "naver",
    image_url: str | None = None,
    thumbnail_url: str | None = None,
    source_candidate_type: str = "search_result_image",
) -> dict[str, Any] | None:
    """Build the stored payload for one collection candidate image.

    Explicit ``image_url`` / ``thumbnail_url`` arguments override the values
    carried by the evidence. The image URL is tried first and the thumbnail
    second; the first one that can be stored locally wins. Returns ``None``
    when neither can be stored.
    """
    fields = evidence.data
    image_url = _normalized_remote_image_url(
        str(fields.get("image_url", "") if image_url is None else image_url)
    )
    thumbnail_url = _normalized_remote_image_url(
        str(fields.get("thumbnail_url", "") if thumbnail_url is None else thumbnail_url)
    )
    result_url = str(fields.get("result_url", ""))
    candidate_id = _stable_id(
        "cand", provider, source_candidate_type, query, image_url, thumbnail_url, result_url
    )

    for attempt_url in _unique_texts([image_url, thumbnail_url]):
        stored = self._store_candidate_image(
            candidate_id,
            attempt_url,
            referer_url=result_url,
        )
        if stored is not None:
            break
    else:
        # No URL at all, or none of them could be stored.
        return None

    display_image_url = attempt_url or image_url
    return {
        "id": candidate_id,
        "provider": provider,
        "query": query,
        "title": _strip_html(str(fields.get("title", ""))),
        "status": "candidate",
        "rank": fields.get("rank", ""),
        "imageUrl": display_image_url,
        "thumbnailUrl": thumbnail_url,
        "resultUrl": result_url,
        "sourceUrl": result_url or display_image_url,
        "sourceCandidateType": source_candidate_type,
        "imageAsset": stored["asset"],
        "sampleFingerprints": [stored["perceptualFingerprint"]],
        "imageFacts": stored["facts"],
        "collectedAt": _now_label(),
        "collectedEpoch": int(datetime.now().timestamp()),
        "promotedKnowledgeId": "",
    }
|
||||||
|
|
||||||
|
def _store_candidate_image(
    self,
    candidate_id: str,
    url: str,
    referer_url: str = "",
) -> dict[str, Any] | None:
    """Download a candidate image into the collection directory.

    A previously stored file with the same name is reused when it still
    yields a valid image record; otherwise the image is fetched. The file is
    only written after the fetched content produced a valid record. Returns
    ``None`` for an empty URL, a failed fetch, or unusable content.

    Raises:
        ValueError: If the target path resolves outside the image directory.
    """
    if not url:
        return None
    suffix = _image_suffix_from_url(url)
    target_name = f"{candidate_id}{suffix}"
    root = self.collection_image_dir.resolve()
    target = (root / target_name).resolve()
    stays_inside_root = target == root or root in target.parents
    if not stays_inside_root:
        raise ValueError("candidate image path points outside image store")

    if target.exists() and target.is_file():
        try:
            cached = self._candidate_image_record_from_content(
                target_name,
                url,
                suffix,
                target.read_bytes(),
            )
        except Exception:
            # An unreadable or broken cached file falls through to a refetch.
            pass
        else:
            if cached is not None:
                return cached

    try:
        content = self._fetch_candidate_image_content(url, referer_url)
    except Exception:
        return None
    fresh = self._candidate_image_record_from_content(target_name, url, suffix, content)
    if fresh is None:
        return None
    self.collection_image_dir.mkdir(parents=True, exist_ok=True)
    target.write_bytes(content)
    return fresh
|
||||||
|
|
||||||
|
def _candidate_image_record_from_content(
    self,
    target_name: str,
    url: str,
    suffix: str,
    content: bytes,
) -> dict[str, Any] | None:
    """Describe candidate image bytes as an asset record.

    Returns ``None`` for empty content or when no perceptual fingerprint is
    available; otherwise the public asset path, the perceptual fingerprint
    and basic facts (source URL, format, pixel size).
    """
    if not content:
        return None
    width, height = _image_size_from_bytes(content)
    perceptual = FingerprintService().fingerprints_for(content).perceptual
    if perceptual.startswith("phash:unavailable:"):
        return None
    facts = {
        "source": url,
        "format": suffix.lstrip(".").upper(),
        "size": f"{width} x {height}",
        "fingerprints": 1,
    }
    return {
        "asset": f"{self.collection_public_prefix}/{target_name}",
        "perceptualFingerprint": perceptual,
        "facts": facts,
    }
|
||||||
|
|
||||||
|
def _fetch_candidate_image_content(self, url: str, referer_url: str = "") -> bytes:
    """Return the raw bytes behind a candidate image URL.

    An injected ``_custom_candidate_image_fetcher`` takes precedence and is
    called with the URL only; otherwise ``_fetch_url_bytes`` performs the
    download and receives the referer.
    """
    injected_fetcher = self._custom_candidate_image_fetcher
    if injected_fetcher is None:
        return _fetch_url_bytes(url, referer_url=referer_url)
    return injected_fetcher(url)
|
||||||
|
|
||||||
|
def _rescore_submission(self, submission_id: str) -> None:
    """Recompute and persist the risk fields of a single submission.

    The stored UI evidence rows are converted back to domain evidence, scored
    with a fresh ``RiskScorer``, and the resulting score, band and reasons are
    written back to the ``submissions`` record.
    """
    record = self._get("submissions", submission_id)
    domain_items = []
    for ui_row in self._evidence_for_submission(submission_id):
        domain_items.append(_domain_evidence_from_ui(ui_row))
    outcome = RiskScorer().score(domain_items)
    record["riskScore"] = outcome.score
    record["riskBand"] = outcome.band
    # Placeholder reason used when the scorer returned none.
    record["reasons"] = outcome.reasons or ["분석 근거 없음"]
    self._put("submissions", submission_id, record)
|
||||||
|
|
||||||
|
def _rescore_all_submissions(self, queue_id: str | None = None) -> None:
    """Rescore every stored submission, optionally limited to one queue."""
    submission_ids = (
        str(row["id"]) for row in self._all("submissions", queue_id=queue_id)
    )
    for submission_id in submission_ids:
        self._rescore_submission(submission_id)
|
||||||
|
|
||||||
Loading…
Reference in a new issue