diff --git a/src/rights_filter/server/sqlite_store.py b/src/rights_filter/server/sqlite_store.py
index da33143..c18123b 100644
--- a/src/rights_filter/server/sqlite_store.py
+++ b/src/rights_filter/server/sqlite_store.py
@@ -3,11 +3,13 @@ from __future__ import annotations
 import base64
 import hashlib
 import html
+import ipaddress
 import json
 import mimetypes
 import os
 import re
 import shutil
+import socket
 import sqlite3
 from html.parser import HTMLParser
 from contextlib import contextmanager
@@ -17,7 +19,7 @@ from io import BytesIO
 from pathlib import Path
 from typing import Any, Callable
 from urllib.parse import parse_qsl, unquote, urljoin, urlparse
-from urllib.request import Request, urlopen
+from urllib.request import HTTPRedirectHandler, Request, build_opener
 
 from rights_filter.analysis.face_person_detection import HeuristicFacePersonDetector
 from rights_filter.analysis.fingerprints import FingerprintService
@@ -3823,14 +3825,70 @@ def _fetch_stylesheet_url_bytes(url: str, timeout: int = 10, limit: int = CANDID
     return _fetch_url_bytes_with_headers(url, STYLESHEET_FETCH_HEADERS, timeout, limit)
 
 
+def _assert_public_http_url(url: str) -> None:
+    # SSRF guard: candidate image / page / stylesheet URLs come from external
+    # search-result content (attacker-influenceable), so refuse to fetch
+    # anything that resolves to a non-public address (loopback, RFC1918,
+    # link-local cloud-metadata 169.254.169.254, shared address space
+    # 100.64.0.0/10, etc.).
+    #
+    # Raises ValueError when the scheme is not http(s), the host is missing or
+    # cannot be resolved, or any resolved address is not publicly routable.
+    #
+    # NOTE: only the addresses resolved here are checked; the fetch itself is
+    # made by hostname, so this check alone does not stop DNS rebinding.
+    parsed = urlparse(url)
+    if parsed.scheme not in {"http", "https"} or not parsed.hostname:
+        raise ValueError("only public http(s) URLs may be fetched")
+    port = parsed.port or (443 if parsed.scheme == "https" else 80)
+    try:
+        infos = socket.getaddrinfo(parsed.hostname, port, proto=socket.IPPROTO_TCP)
+    except OSError as exc:
+        raise ValueError("could not resolve fetch host") from exc
+    if not infos:
+        # No addresses means nothing was proven public, so fail closed.
+        raise ValueError("could not resolve fetch host")
+    for *_, sockaddr in infos:
+        ip = ipaddress.ip_address(sockaddr[0])
+        if (
+            ip.is_private
+            or ip.is_loopback
+            or ip.is_link_local
+            or ip.is_reserved
+            or ip.is_multicast
+            or ip.is_unspecified
+            # is_private is False for shared address space (100.64.0.0/10),
+            # which is still not globally reachable; is_global catches it.
+            or not ip.is_global
+        ):
+            raise ValueError("refusing to fetch internal address")
+
+
+class _ValidatingRedirectHandler(HTTPRedirectHandler):
+    # Re-run the SSRF guard on every redirect hop so a public URL cannot bounce
+    # to an internal address. A rejected hop raises ValueError from the guard.
+    def redirect_request(self, req, fp, code, msg, headers, newurl):
+        _assert_public_http_url(newurl)
+        return super().redirect_request(req, fp, code, msg, headers, newurl)
+
+
+_SSRF_SAFE_OPENER = build_opener(_ValidatingRedirectHandler())
+
+
+def _open_url(request: Request, timeout: int):
+    # Single seam for outbound fetches; tests replace it to avoid the network.
+    return _SSRF_SAFE_OPENER.open(request, timeout=timeout)
+
+
 def _fetch_url_bytes_with_headers(
     url: str,
     headers: dict[str, str],
     timeout: int,
     limit: int,
 ) -> bytes:
+    _assert_public_http_url(url)
     request = Request(url, headers=headers)
-    with urlopen(request, timeout=timeout) as response:
+    with _open_url(request, timeout) as response:
         data = response.read(limit + 1)
         if len(data) > limit:
             raise ValueError("candidate image exceeds size limit")
diff --git a/tests/rights_filter/server/test_sqlite_store.py b/tests/rights_filter/server/test_sqlite_store.py
index f5e016c..a00f062 100644
--- a/tests/rights_filter/server/test_sqlite_store.py
+++ b/tests/rights_filter/server/test_sqlite_store.py
@@ -322,7 +322,8 @@ def test_fetch_url_bytes_uses_browser_image_headers_and_room_for_real_photos(mon
         captured["timeout"] = timeout
         return FakeResponse()
 
-    monkeypatch.setattr(sqlite_store_module, "urlopen", fake_urlopen)
+    monkeypatch.setattr(sqlite_store_module, "_open_url", fake_urlopen)
+    monkeypatch.setattr(sqlite_store_module, "_assert_public_http_url", lambda url: None)
 
     content = sqlite_store_module._fetch_url_bytes("https://cdn.example.test/profile.webp")
 
@@ -3632,7 +3633,8 @@ def test_sqlite_store_uses_page_and_image_specific_headers_for_default_fetcher(
             return FakeResponse(submitted_image)
         raise AssertionError(f"unexpected URL fetched: {request.full_url}")
 
-    monkeypatch.setattr(sqlite_store_module, "urlopen", fake_urlopen)
+    monkeypatch.setattr(sqlite_store_module, "_open_url", fake_urlopen)
+    monkeypatch.setattr(sqlite_store_module, "_assert_public_http_url", lambda url: None)
 
     store = CopyrighterStore(tmp_path / "copyrighter.sqlite3", provider_runtime=runtime)
     store.initialize()
@@ -3720,7 +3722,8 @@ def test_sqlite_store_uses_source_page_referer_for_default_page_image_fetch(
             return FakeResponse(submitted_image)
         raise AssertionError(f"unexpected URL fetched: {request.full_url}")
 
-    monkeypatch.setattr(sqlite_store_module, "urlopen", fake_urlopen)
+    monkeypatch.setattr(sqlite_store_module, "_open_url", fake_urlopen)
+    monkeypatch.setattr(sqlite_store_module, "_assert_public_http_url", lambda url: None)
 
     store = CopyrighterStore(tmp_path / "copyrighter.sqlite3", provider_runtime=runtime)
     store.initialize()
@@ -8029,3 +8032,22 @@ def test_record_decision_rolls_back_when_audit_fails(tmp_path: Path, monkeypatch
     assert store._get("submissions", "SUB-1")["decisionStatus"] == "unreviewed"
     with pytest.raises(KeyError):
         store._get("knowledge_entries", sqlite_store_module._stable_id("kb-watchlist", "SUB-1"))
+
+
+@pytest.mark.parametrize(
+    "url",
+    [
+        "http://127.0.0.1/x",
+        "http://169.254.169.254/latest/meta-data",
+        "http://10.0.0.1/a",
+        "http://192.168.1.1/a",
+        "http://100.100.100.200/latest/meta-data",
+        "http://[::1]/a",
+        "http://localhost/a",
+        "ftp://example.test/a",
+    ],
+)
+def test_fetch_rejects_internal_or_non_http_hosts(url):
+    # SSRF guard runs before any HTTP request is made, so these raise without a fetch.
+    with pytest.raises(ValueError):
+        sqlite_store_module._fetch_url_bytes(url)