fix: block SSRF to internal addresses in remote fetchers
Resolve each candidate image/page/stylesheet URL and refuse loopback, RFC1918, link-local (cloud-metadata), reserved, multicast, and unspecified targets before fetching; re-validate on every redirect hop via a custom opener. URLs originate from external search-result content, so this prevents the operator's server from being tricked into fetching internal services.
This commit is contained in:
parent
8958dd1b83
commit
62e2d183f8
2 changed files with 70 additions and 5 deletions
|
|
@ -3,11 +3,13 @@ from __future__ import annotations
|
|||
import base64
|
||||
import hashlib
|
||||
import html
|
||||
import ipaddress
|
||||
import json
|
||||
import mimetypes
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import socket
|
||||
import sqlite3
|
||||
from html.parser import HTMLParser
|
||||
from contextlib import contextmanager
|
||||
|
|
@ -17,7 +19,7 @@ from io import BytesIO
|
|||
from pathlib import Path
|
||||
from typing import Any, Callable
|
||||
from urllib.parse import parse_qsl, unquote, urljoin, urlparse
|
||||
from urllib.request import Request, urlopen
|
||||
from urllib.request import HTTPRedirectHandler, Request, build_opener
|
||||
|
||||
from rights_filter.analysis.face_person_detection import HeuristicFacePersonDetector
|
||||
from rights_filter.analysis.fingerprints import FingerprintService
|
||||
|
|
@ -3823,14 +3825,56 @@ def _fetch_stylesheet_url_bytes(url: str, timeout: int = 10, limit: int = CANDID
|
|||
return _fetch_url_bytes_with_headers(url, STYLESHEET_FETCH_HEADERS, timeout, limit)
|
||||
|
||||
|
||||
def _assert_public_http_url(url: str) -> None:
|
||||
# SSRF guard: candidate image / page / stylesheet URLs come from external
|
||||
# search-result content (attacker-influenceable), so refuse to fetch
|
||||
# anything that resolves to a non-public address (loopback, RFC1918,
|
||||
# link-local cloud-metadata 169.254.169.254, etc.).
|
||||
parsed = urlparse(url)
|
||||
if parsed.scheme not in {"http", "https"} or not parsed.hostname:
|
||||
raise ValueError("only public http(s) URLs may be fetched")
|
||||
port = parsed.port or (443 if parsed.scheme == "https" else 80)
|
||||
try:
|
||||
infos = socket.getaddrinfo(parsed.hostname, port, proto=socket.IPPROTO_TCP)
|
||||
except OSError as exc:
|
||||
raise ValueError("could not resolve fetch host") from exc
|
||||
for *_, sockaddr in infos:
|
||||
ip = ipaddress.ip_address(sockaddr[0])
|
||||
if (
|
||||
ip.is_private
|
||||
or ip.is_loopback
|
||||
or ip.is_link_local
|
||||
or ip.is_reserved
|
||||
or ip.is_multicast
|
||||
or ip.is_unspecified
|
||||
):
|
||||
raise ValueError("refusing to fetch internal address")
|
||||
|
||||
|
||||
class _ValidatingRedirectHandler(HTTPRedirectHandler):
    """Redirect handler that applies the SSRF guard to every redirect target.

    Without this, a public URL could answer with a redirect that bounces the
    fetch to an internal address after the initial URL was already validated.
    """

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Validate *newurl*, then defer to the stock redirect handling.

        Raises:
            ValueError: Propagated from ``_assert_public_http_url`` when the
                redirect target is not an acceptable public http(s) URL.
        """
        # Guard first so a rejected hop never reaches the base implementation.
        _assert_public_http_url(newurl)
        follow_up = super().redirect_request(req, fp, code, msg, headers, newurl)
        return follow_up
|
||||
|
||||
|
||||
# Module-wide opener whose redirect handling goes through
# _ValidatingRedirectHandler; constructed once when the module is imported.
_SSRF_SAFE_OPENER = build_opener(_ValidatingRedirectHandler())


def _open_url(request: Request, timeout: int):
    """Open *request* via the shared redirect-validating opener.

    This wrapper only opens the request; it does not validate the initial URL
    itself. Returns whatever the opener's ``open`` call returns.
    """
    response = _SSRF_SAFE_OPENER.open(request, timeout=timeout)
    return response
|
||||
|
||||
|
||||
def _fetch_url_bytes_with_headers(
|
||||
url: str,
|
||||
headers: dict[str, str],
|
||||
timeout: int,
|
||||
limit: int,
|
||||
) -> bytes:
|
||||
_assert_public_http_url(url)
|
||||
request = Request(url, headers=headers)
|
||||
with urlopen(request, timeout=timeout) as response:
|
||||
with _open_url(request, timeout) as response:
|
||||
data = response.read(limit + 1)
|
||||
if len(data) > limit:
|
||||
raise ValueError("candidate image exceeds size limit")
|
||||
|
|
|
|||
|
|
@ -322,7 +322,8 @@ def test_fetch_url_bytes_uses_browser_image_headers_and_room_for_real_photos(mon
|
|||
captured["timeout"] = timeout
|
||||
return FakeResponse()
|
||||
|
||||
monkeypatch.setattr(sqlite_store_module, "urlopen", fake_urlopen)
|
||||
monkeypatch.setattr(sqlite_store_module, "_open_url", fake_urlopen)
|
||||
monkeypatch.setattr(sqlite_store_module, "_assert_public_http_url", lambda url: None)
|
||||
|
||||
content = sqlite_store_module._fetch_url_bytes("https://cdn.example.test/profile.webp")
|
||||
|
||||
|
|
@ -3632,7 +3633,8 @@ def test_sqlite_store_uses_page_and_image_specific_headers_for_default_fetcher(
|
|||
return FakeResponse(submitted_image)
|
||||
raise AssertionError(f"unexpected URL fetched: {request.full_url}")
|
||||
|
||||
monkeypatch.setattr(sqlite_store_module, "urlopen", fake_urlopen)
|
||||
monkeypatch.setattr(sqlite_store_module, "_open_url", fake_urlopen)
|
||||
monkeypatch.setattr(sqlite_store_module, "_assert_public_http_url", lambda url: None)
|
||||
store = CopyrighterStore(tmp_path / "copyrighter.sqlite3", provider_runtime=runtime)
|
||||
store.initialize()
|
||||
|
||||
|
|
@ -3720,7 +3722,8 @@ def test_sqlite_store_uses_source_page_referer_for_default_page_image_fetch(
|
|||
return FakeResponse(submitted_image)
|
||||
raise AssertionError(f"unexpected URL fetched: {request.full_url}")
|
||||
|
||||
monkeypatch.setattr(sqlite_store_module, "urlopen", fake_urlopen)
|
||||
monkeypatch.setattr(sqlite_store_module, "_open_url", fake_urlopen)
|
||||
monkeypatch.setattr(sqlite_store_module, "_assert_public_http_url", lambda url: None)
|
||||
store = CopyrighterStore(tmp_path / "copyrighter.sqlite3", provider_runtime=runtime)
|
||||
store.initialize()
|
||||
|
||||
|
|
@ -8029,3 +8032,21 @@ def test_record_decision_rolls_back_when_audit_fails(tmp_path: Path, monkeypatch
|
|||
assert store._get("submissions", "SUB-1")["decisionStatus"] == "unreviewed"
|
||||
with pytest.raises(KeyError):
|
||||
store._get("knowledge_entries", sqlite_store_module._stable_id("kb-watchlist", "SUB-1"))
|
||||
|
||||
|
||||
# URLs the fetcher must refuse: loopback, cloud-metadata link-local, RFC1918
# ranges, IPv6 loopback, the localhost name, and a non-http(s) scheme.
_REJECTED_FETCH_URLS = (
    "http://127.0.0.1/x",
    "http://169.254.169.254/latest/meta-data",
    "http://10.0.0.1/a",
    "http://192.168.1.1/a",
    "http://[::1]/a",
    "http://localhost/a",
    "ftp://example.test/a",
)


@pytest.mark.parametrize("url", _REJECTED_FETCH_URLS)
def test_fetch_rejects_internal_or_non_http_hosts(url):
    """Each listed URL must make ``_fetch_url_bytes`` raise ``ValueError``.

    The SSRF guard is expected to run before any network call, so these
    should raise without performing a fetch.
    """
    with pytest.raises(ValueError):
        sqlite_store_module._fetch_url_bytes(url)
|
||||
|
|
|
|||
Loading…
Reference in a new issue