fix: block SSRF to internal addresses in remote fetchers
Resolve each candidate image/page/stylesheet URL and refuse loopback, RFC1918, link-local (cloud-metadata), reserved, multicast, and unspecified targets before fetching; re-validate on every redirect hop via a custom opener. URLs originate from external search-result content, so this prevents the operator server from being coerced into fetching internal services.
This commit is contained in:
parent
8958dd1b83
commit
62e2d183f8
2 changed files with 70 additions and 5 deletions
|
|
@ -3,11 +3,13 @@ from __future__ import annotations
|
||||||
import base64
|
import base64
|
||||||
import hashlib
|
import hashlib
|
||||||
import html
|
import html
|
||||||
|
import ipaddress
|
||||||
import json
|
import json
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
|
import socket
|
||||||
import sqlite3
|
import sqlite3
|
||||||
from html.parser import HTMLParser
|
from html.parser import HTMLParser
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
|
|
@ -17,7 +19,7 @@ from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Callable
|
from typing import Any, Callable
|
||||||
from urllib.parse import parse_qsl, unquote, urljoin, urlparse
|
from urllib.parse import parse_qsl, unquote, urljoin, urlparse
|
||||||
from urllib.request import Request, urlopen
|
from urllib.request import HTTPRedirectHandler, Request, build_opener
|
||||||
|
|
||||||
from rights_filter.analysis.face_person_detection import HeuristicFacePersonDetector
|
from rights_filter.analysis.face_person_detection import HeuristicFacePersonDetector
|
||||||
from rights_filter.analysis.fingerprints import FingerprintService
|
from rights_filter.analysis.fingerprints import FingerprintService
|
||||||
|
|
@ -3823,14 +3825,56 @@ def _fetch_stylesheet_url_bytes(url: str, timeout: int = 10, limit: int = CANDID
|
||||||
return _fetch_url_bytes_with_headers(url, STYLESHEET_FETCH_HEADERS, timeout, limit)
|
return _fetch_url_bytes_with_headers(url, STYLESHEET_FETCH_HEADERS, timeout, limit)
|
||||||
|
|
||||||
|
|
||||||
|
def _assert_public_http_url(url: str) -> None:
|
||||||
|
# SSRF guard: candidate image / page / stylesheet URLs come from external
|
||||||
|
# search-result content (attacker-influenceable), so refuse to fetch
|
||||||
|
# anything that resolves to a non-public address (loopback, RFC1918,
|
||||||
|
# link-local cloud-metadata 169.254.169.254, etc.).
|
||||||
|
parsed = urlparse(url)
|
||||||
|
if parsed.scheme not in {"http", "https"} or not parsed.hostname:
|
||||||
|
raise ValueError("only public http(s) URLs may be fetched")
|
||||||
|
port = parsed.port or (443 if parsed.scheme == "https" else 80)
|
||||||
|
try:
|
||||||
|
infos = socket.getaddrinfo(parsed.hostname, port, proto=socket.IPPROTO_TCP)
|
||||||
|
except OSError as exc:
|
||||||
|
raise ValueError("could not resolve fetch host") from exc
|
||||||
|
for *_, sockaddr in infos:
|
||||||
|
ip = ipaddress.ip_address(sockaddr[0])
|
||||||
|
if (
|
||||||
|
ip.is_private
|
||||||
|
or ip.is_loopback
|
||||||
|
or ip.is_link_local
|
||||||
|
or ip.is_reserved
|
||||||
|
or ip.is_multicast
|
||||||
|
or ip.is_unspecified
|
||||||
|
):
|
||||||
|
raise ValueError("refusing to fetch internal address")
|
||||||
|
|
||||||
|
|
||||||
|
class _ValidatingRedirectHandler(HTTPRedirectHandler):
    """Redirect handler that applies the SSRF guard to every redirect target.

    Without this, a public URL could bounce the fetcher to an internal
    address through a redirect response.
    """

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        # Validate the hop first; a ValueError here aborts the redirect before
        # the stock handler builds the follow-up request.
        _assert_public_http_url(newurl)
        follow_up = super().redirect_request(req, fp, code, msg, headers, newurl)
        return follow_up
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level opener built with the validating redirect handler, so redirects
# followed through it are re-checked by that handler on each hop.
_SSRF_SAFE_OPENER = build_opener(_ValidatingRedirectHandler())
|
||||||
|
|
||||||
|
|
||||||
|
def _open_url(request: Request, timeout: int):
    """Open *request* via the module's SSRF-safe opener and return the response.

    Kept as a separate function so a single name can be swapped out to stub
    the network call.
    """
    opener = _SSRF_SAFE_OPENER
    return opener.open(request, timeout=timeout)
|
||||||
|
|
||||||
|
|
||||||
def _fetch_url_bytes_with_headers(
|
def _fetch_url_bytes_with_headers(
|
||||||
url: str,
|
url: str,
|
||||||
headers: dict[str, str],
|
headers: dict[str, str],
|
||||||
timeout: int,
|
timeout: int,
|
||||||
limit: int,
|
limit: int,
|
||||||
) -> bytes:
|
) -> bytes:
|
||||||
|
_assert_public_http_url(url)
|
||||||
request = Request(url, headers=headers)
|
request = Request(url, headers=headers)
|
||||||
with urlopen(request, timeout=timeout) as response:
|
with _open_url(request, timeout) as response:
|
||||||
data = response.read(limit + 1)
|
data = response.read(limit + 1)
|
||||||
if len(data) > limit:
|
if len(data) > limit:
|
||||||
raise ValueError("candidate image exceeds size limit")
|
raise ValueError("candidate image exceeds size limit")
|
||||||
|
|
|
||||||
|
|
@ -322,7 +322,8 @@ def test_fetch_url_bytes_uses_browser_image_headers_and_room_for_real_photos(mon
|
||||||
captured["timeout"] = timeout
|
captured["timeout"] = timeout
|
||||||
return FakeResponse()
|
return FakeResponse()
|
||||||
|
|
||||||
monkeypatch.setattr(sqlite_store_module, "urlopen", fake_urlopen)
|
monkeypatch.setattr(sqlite_store_module, "_open_url", fake_urlopen)
|
||||||
|
monkeypatch.setattr(sqlite_store_module, "_assert_public_http_url", lambda url: None)
|
||||||
|
|
||||||
content = sqlite_store_module._fetch_url_bytes("https://cdn.example.test/profile.webp")
|
content = sqlite_store_module._fetch_url_bytes("https://cdn.example.test/profile.webp")
|
||||||
|
|
||||||
|
|
@ -3632,7 +3633,8 @@ def test_sqlite_store_uses_page_and_image_specific_headers_for_default_fetcher(
|
||||||
return FakeResponse(submitted_image)
|
return FakeResponse(submitted_image)
|
||||||
raise AssertionError(f"unexpected URL fetched: {request.full_url}")
|
raise AssertionError(f"unexpected URL fetched: {request.full_url}")
|
||||||
|
|
||||||
monkeypatch.setattr(sqlite_store_module, "urlopen", fake_urlopen)
|
monkeypatch.setattr(sqlite_store_module, "_open_url", fake_urlopen)
|
||||||
|
monkeypatch.setattr(sqlite_store_module, "_assert_public_http_url", lambda url: None)
|
||||||
store = CopyrighterStore(tmp_path / "copyrighter.sqlite3", provider_runtime=runtime)
|
store = CopyrighterStore(tmp_path / "copyrighter.sqlite3", provider_runtime=runtime)
|
||||||
store.initialize()
|
store.initialize()
|
||||||
|
|
||||||
|
|
@ -3720,7 +3722,8 @@ def test_sqlite_store_uses_source_page_referer_for_default_page_image_fetch(
|
||||||
return FakeResponse(submitted_image)
|
return FakeResponse(submitted_image)
|
||||||
raise AssertionError(f"unexpected URL fetched: {request.full_url}")
|
raise AssertionError(f"unexpected URL fetched: {request.full_url}")
|
||||||
|
|
||||||
monkeypatch.setattr(sqlite_store_module, "urlopen", fake_urlopen)
|
monkeypatch.setattr(sqlite_store_module, "_open_url", fake_urlopen)
|
||||||
|
monkeypatch.setattr(sqlite_store_module, "_assert_public_http_url", lambda url: None)
|
||||||
store = CopyrighterStore(tmp_path / "copyrighter.sqlite3", provider_runtime=runtime)
|
store = CopyrighterStore(tmp_path / "copyrighter.sqlite3", provider_runtime=runtime)
|
||||||
store.initialize()
|
store.initialize()
|
||||||
|
|
||||||
|
|
@ -8029,3 +8032,21 @@ def test_record_decision_rolls_back_when_audit_fails(tmp_path: Path, monkeypatch
|
||||||
assert store._get("submissions", "SUB-1")["decisionStatus"] == "unreviewed"
|
assert store._get("submissions", "SUB-1")["decisionStatus"] == "unreviewed"
|
||||||
with pytest.raises(KeyError):
|
with pytest.raises(KeyError):
|
||||||
store._get("knowledge_entries", sqlite_store_module._stable_id("kb-watchlist", "SUB-1"))
|
store._get("knowledge_entries", sqlite_store_module._stable_id("kb-watchlist", "SUB-1"))
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "url",
    (
        "http://127.0.0.1/x",
        "http://169.254.169.254/latest/meta-data",
        "http://10.0.0.1/a",
        "http://192.168.1.1/a",
        "http://[::1]/a",
        "http://localhost/a",
        "ftp://example.test/a",
    ),
)
def test_fetch_rejects_internal_or_non_http_hosts(url):
    """Internal-address and non-http(s) URLs must be refused with ValueError."""
    # The SSRF guard runs ahead of the fetch, so each of these is expected to
    # raise without any response being read.
    fetch = sqlite_store_module._fetch_url_bytes
    with pytest.raises(ValueError):
        fetch(url)
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue