fix: block SSRF to internal addresses in remote fetchers

Resolve each candidate image/page/stylesheet URL and refuse loopback,
RFC1918, link-local (cloud-metadata), reserved, multicast, and unspecified
targets before fetching; re-validate on every redirect hop via a custom
opener. URLs originate from external search-result content, so this closes
the hole that let the operator server be steered into fetching internal
services.
This commit is contained in:
유창욱 2026-06-20 18:22:10 +09:00
parent 8958dd1b83
commit 62e2d183f8
2 changed files with 70 additions and 5 deletions

View file

@ -3,11 +3,13 @@ from __future__ import annotations
import base64 import base64
import hashlib import hashlib
import html import html
import ipaddress
import json import json
import mimetypes import mimetypes
import os import os
import re import re
import shutil import shutil
import socket
import sqlite3 import sqlite3
from html.parser import HTMLParser from html.parser import HTMLParser
from contextlib import contextmanager from contextlib import contextmanager
@ -17,7 +19,7 @@ from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Any, Callable from typing import Any, Callable
from urllib.parse import parse_qsl, unquote, urljoin, urlparse from urllib.parse import parse_qsl, unquote, urljoin, urlparse
from urllib.request import Request, urlopen from urllib.request import HTTPRedirectHandler, Request, build_opener
from rights_filter.analysis.face_person_detection import HeuristicFacePersonDetector from rights_filter.analysis.face_person_detection import HeuristicFacePersonDetector
from rights_filter.analysis.fingerprints import FingerprintService from rights_filter.analysis.fingerprints import FingerprintService
@ -3823,14 +3825,56 @@ def _fetch_stylesheet_url_bytes(url: str, timeout: int = 10, limit: int = CANDID
return _fetch_url_bytes_with_headers(url, STYLESHEET_FETCH_HEADERS, timeout, limit) return _fetch_url_bytes_with_headers(url, STYLESHEET_FETCH_HEADERS, timeout, limit)
def _assert_public_http_url(url: str) -> None:
# SSRF guard: candidate image / page / stylesheet URLs come from external
# search-result content (attacker-influenceable), so refuse to fetch
# anything that resolves to a non-public address (loopback, RFC1918,
# link-local cloud-metadata 169.254.169.254, etc.).
parsed = urlparse(url)
if parsed.scheme not in {"http", "https"} or not parsed.hostname:
raise ValueError("only public http(s) URLs may be fetched")
port = parsed.port or (443 if parsed.scheme == "https" else 80)
try:
infos = socket.getaddrinfo(parsed.hostname, port, proto=socket.IPPROTO_TCP)
except OSError as exc:
raise ValueError("could not resolve fetch host") from exc
for *_, sockaddr in infos:
ip = ipaddress.ip_address(sockaddr[0])
if (
ip.is_private
or ip.is_loopback
or ip.is_link_local
or ip.is_reserved
or ip.is_multicast
or ip.is_unspecified
):
raise ValueError("refusing to fetch internal address")
class _ValidatingRedirectHandler(HTTPRedirectHandler):
    """Redirect handler that applies the SSRF guard to each redirect target.

    A URL that passed the guard could otherwise answer with a redirect to an
    internal address, so the target of every hop is checked before the
    standard redirect handling builds the follow-up request.
    """

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        # Raises ValueError (aborting the fetch) when the target is not a
        # public http(s) URL; otherwise defer to the stock behaviour.
        _assert_public_http_url(newurl)
        follow_up = super().redirect_request(req, fp, code, msg, headers, newurl)
        return follow_up
# Module-level opener, built once at import time, whose redirect handling goes
# through _ValidatingRedirectHandler instead of the stock redirect handler.
_SSRF_SAFE_OPENER = build_opener(_ValidatingRedirectHandler())
def _open_url(request: Request, timeout: int):
    """Open *request* with the module's redirect-validating opener.

    Kept as a separate function so there is a single place where outbound
    requests are opened.
    """
    opener = _SSRF_SAFE_OPENER
    return opener.open(request, timeout=timeout)
def _fetch_url_bytes_with_headers( def _fetch_url_bytes_with_headers(
url: str, url: str,
headers: dict[str, str], headers: dict[str, str],
timeout: int, timeout: int,
limit: int, limit: int,
) -> bytes: ) -> bytes:
_assert_public_http_url(url)
request = Request(url, headers=headers) request = Request(url, headers=headers)
with urlopen(request, timeout=timeout) as response: with _open_url(request, timeout) as response:
data = response.read(limit + 1) data = response.read(limit + 1)
if len(data) > limit: if len(data) > limit:
raise ValueError("candidate image exceeds size limit") raise ValueError("candidate image exceeds size limit")

View file

@ -322,7 +322,8 @@ def test_fetch_url_bytes_uses_browser_image_headers_and_room_for_real_photos(mon
captured["timeout"] = timeout captured["timeout"] = timeout
return FakeResponse() return FakeResponse()
monkeypatch.setattr(sqlite_store_module, "urlopen", fake_urlopen) monkeypatch.setattr(sqlite_store_module, "_open_url", fake_urlopen)
monkeypatch.setattr(sqlite_store_module, "_assert_public_http_url", lambda url: None)
content = sqlite_store_module._fetch_url_bytes("https://cdn.example.test/profile.webp") content = sqlite_store_module._fetch_url_bytes("https://cdn.example.test/profile.webp")
@ -3632,7 +3633,8 @@ def test_sqlite_store_uses_page_and_image_specific_headers_for_default_fetcher(
return FakeResponse(submitted_image) return FakeResponse(submitted_image)
raise AssertionError(f"unexpected URL fetched: {request.full_url}") raise AssertionError(f"unexpected URL fetched: {request.full_url}")
monkeypatch.setattr(sqlite_store_module, "urlopen", fake_urlopen) monkeypatch.setattr(sqlite_store_module, "_open_url", fake_urlopen)
monkeypatch.setattr(sqlite_store_module, "_assert_public_http_url", lambda url: None)
store = CopyrighterStore(tmp_path / "copyrighter.sqlite3", provider_runtime=runtime) store = CopyrighterStore(tmp_path / "copyrighter.sqlite3", provider_runtime=runtime)
store.initialize() store.initialize()
@ -3720,7 +3722,8 @@ def test_sqlite_store_uses_source_page_referer_for_default_page_image_fetch(
return FakeResponse(submitted_image) return FakeResponse(submitted_image)
raise AssertionError(f"unexpected URL fetched: {request.full_url}") raise AssertionError(f"unexpected URL fetched: {request.full_url}")
monkeypatch.setattr(sqlite_store_module, "urlopen", fake_urlopen) monkeypatch.setattr(sqlite_store_module, "_open_url", fake_urlopen)
monkeypatch.setattr(sqlite_store_module, "_assert_public_http_url", lambda url: None)
store = CopyrighterStore(tmp_path / "copyrighter.sqlite3", provider_runtime=runtime) store = CopyrighterStore(tmp_path / "copyrighter.sqlite3", provider_runtime=runtime)
store.initialize() store.initialize()
@ -8029,3 +8032,21 @@ def test_record_decision_rolls_back_when_audit_fails(tmp_path: Path, monkeypatch
assert store._get("submissions", "SUB-1")["decisionStatus"] == "unreviewed" assert store._get("submissions", "SUB-1")["decisionStatus"] == "unreviewed"
with pytest.raises(KeyError): with pytest.raises(KeyError):
store._get("knowledge_entries", sqlite_store_module._stable_id("kb-watchlist", "SUB-1")) store._get("knowledge_entries", sqlite_store_module._stable_id("kb-watchlist", "SUB-1"))
@pytest.mark.parametrize(
    "url",
    [
        "http://127.0.0.1/x",
        "http://169.254.169.254/latest/meta-data",
        "http://10.0.0.1/a",
        "http://192.168.1.1/a",
        "http://[::1]/a",
        "http://localhost/a",
        "ftp://example.test/a",
    ],
)
def test_fetch_rejects_internal_or_non_http_hosts(url):
    """Internal-address and non-http(s) URLs are refused with ValueError."""
    # The SSRF guard is the first thing the fetcher runs, so each of these
    # raises before any request would be opened.
    fetch = sqlite_store_module._fetch_url_bytes
    with pytest.raises(ValueError):
        fetch(url)