test(e2e): add SHA256-verified dataset downloader + EuRoC registry entry

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Yuzviak
2026-04-16 21:51:06 +03:00
parent 95accb8f7a
commit 669d8e5653
3 changed files with 152 additions and 0 deletions
+28
View File
@@ -0,0 +1,28 @@
"""CLI: python scripts/download_dataset.py euroc_mh01"""
import sys
from pathlib import Path
from gps_denied.testing.download import DATASET_REGISTRY, download_dataset
def main(argv: list[str]) -> int:
    """Download the dataset named on the command line.

    `argv` is the full argument vector (program name first, dataset name
    second). Returns a process exit status: 2 for a usage error or an
    unknown dataset name, 0 once the download call has returned.
    Exceptions raised by `download_dataset` are not caught here.
    """
    # Exactly one positional argument is accepted.
    if len(argv) != 2:
        print(f"usage: {argv[0]} <dataset_name>", file=sys.stderr)
        print(f"available: {', '.join(DATASET_REGISTRY)}", file=sys.stderr)
        return 2

    _, name = argv
    spec = DATASET_REGISTRY.get(name)
    if spec is None:
        print(f"unknown dataset: {name}", file=sys.stderr)
        return 2

    # The archive keeps the file name it has in the URL and is stored under
    # datasets/<target_subdir>/ relative to the current working directory.
    filename = Path(spec.url).name
    dest = Path("datasets") / spec.target_subdir / filename

    print(dest)
    download_dataset(spec, dest)
    print("OK")
    return 0
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv))
+68
View File
@@ -0,0 +1,68 @@
"""Dataset downloader — registry + SHA256-verified fetch."""
from __future__ import annotations
import hashlib
import urllib.request
from dataclasses import dataclass
from pathlib import Path
@dataclass(frozen=True)
class DatasetSpec:
    """Immutable description of one downloadable dataset file.

    Instances are frozen, so an entry cannot be modified after it has been
    created.
    """

    # URL the file is fetched from.
    url: str

    # Expected SHA256 digest of the downloaded file, as a hex string.
    sha256: str

    # Sub-directory name associated with this dataset; the caller chooses
    # the root directory it is resolved against.
    target_subdir: str

    # NOTE(review): presumably tells callers to extract the archive after
    # download; no code visible here reads this flag — confirm before
    # relying on it.
    unpack: bool = True
# NOTE ON SHA256 VALUES BELOW:
# The EuRoC MH_01 hash is a placeholder of 64 zeros. Before the first real
# download the engineer MUST:
# 1. Manually fetch the zip:
# curl -L <url from spec> -o /tmp/MH_01_easy.zip
# 2. Compute its hash:
# sha256sum /tmp/MH_01_easy.zip
# 3. Replace the placeholder here with the real hex string.
# 4. Commit the updated hash alongside the first real test run.
# Leaving a length-valid (64-char) placeholder keeps the registry well-formed
# and lets the download function refuse to keep any file that doesn't match,
# so the placeholder state fails loudly rather than silently accepting.
# Maps the short dataset name accepted on the command line to its spec.
DATASET_REGISTRY: dict[str, DatasetSpec] = {
    # EuRoC MAV dataset, machine hall sequence MH_01_easy.
    "euroc_mh01": DatasetSpec(
        url=(
            "http://robotics.ethz.ch/~asl-datasets"
            "/ijrr_euroc_mav_dataset/machine_hall"
            "/MH_01_easy/MH_01_easy.zip"
        ),
        sha256="0" * 64,  # placeholder — see note above
        target_subdir="euroc/MH_01",
    ),
}
def verify_sha256(path: Path, expected_hex: str) -> bool:
    """Return True iff the SHA256 of `path` matches `expected_hex`.

    The file is read in 1 MiB chunks so a large archive is never held in
    memory as a whole.

    `expected_hex` is compared case-insensitively and with surrounding
    whitespace ignored: `hexdigest()` always yields lowercase, while some
    tools print digests in uppercase or with a trailing newline, and such
    a value must not be reported as a mismatch.

    Raises OSError if `path` cannot be opened or read.
    """
    h = hashlib.sha256()
    with path.open("rb") as fh:
        # iter() with a sentinel stops at the first empty read (EOF).
        for chunk in iter(lambda: fh.read(1024 * 1024), b""):
            h.update(chunk)
    return h.hexdigest() == expected_hex.strip().lower()
def download_dataset(spec: DatasetSpec, dest: Path) -> Path:
    """Download `spec.url` to `dest` if not already present and valid.

    If `dest` already exists and its SHA256 matches `spec.sha256`, return
    immediately without touching the network. Otherwise download to a
    sibling `<dest>.part` file, verify it, and only then move it onto
    `dest`, so an unverified file is never visible at the final path.

    Returns `dest` on success.

    Raises RuntimeError on hash mismatch; in that case neither the
    downloaded file nor a stale, non-matching `dest` is left on disk.
    Errors raised by the download itself propagate unchanged, and the
    partial file is removed.
    """
    if dest.exists() and verify_sha256(dest, spec.sha256):
        return dest

    dest.parent.mkdir(parents=True, exist_ok=True)
    tmp = dest.with_suffix(dest.suffix + ".part")
    try:
        urllib.request.urlretrieve(spec.url, tmp)
        # Verify before publishing: the final path must only ever hold a
        # file whose hash has been checked.
        if not verify_sha256(tmp, spec.sha256):
            # A pre-existing `dest` already failed verification above;
            # drop it so a mismatch leaves nothing behind at `dest`.
            dest.unlink(missing_ok=True)
            raise RuntimeError(f"SHA256 mismatch after download of {spec.url}")
        # replace() overwrites an existing target on every platform,
        # whereas rename() fails on Windows if `dest` exists.
        tmp.replace(dest)
    finally:
        # No-op after a successful replace(); otherwise removes the
        # partial or rejected download.
        tmp.unlink(missing_ok=True)
    return dest
+56
View File
@@ -0,0 +1,56 @@
"""Dataset downloader — URL registry + SHA256 verification."""
import hashlib
from pathlib import Path
import pytest
from gps_denied.testing.download import (
DATASET_REGISTRY,
DatasetSpec,
verify_sha256,
download_dataset,
)
def test_registry_has_euroc():
    """The registry exposes a well-formed `euroc_mh01` entry."""
    assert "euroc_mh01" in DATASET_REGISTRY
    entry = DATASET_REGISTRY["euroc_mh01"]
    assert isinstance(entry, DatasetSpec)
    assert entry.url.startswith("http")
    # Only the length is checked, so a 64-character placeholder also passes.
    assert len(entry.sha256) == 64
def test_verify_sha256_matches(tmp_path: Path):
    """`verify_sha256` returns True when given the file's real digest."""
    payload = b"hello world"
    target = tmp_path / "x.bin"
    target.write_bytes(payload)
    digest = hashlib.sha256(payload).hexdigest()
    assert verify_sha256(target, digest) is True
def test_verify_sha256_mismatch(tmp_path: Path):
    """`verify_sha256` returns False for a digest that does not match."""
    target = tmp_path / "x.bin"
    target.write_bytes(b"hello world")
    wrong_digest = "0" * 64
    assert verify_sha256(target, wrong_digest) is False
def test_download_skip_if_present(tmp_path: Path, monkeypatch):
    """A cached file with the expected hash short-circuits the download."""
    payload = b"cached"
    cached = tmp_path / "cached.zip"
    cached.write_bytes(payload)
    spec = DatasetSpec(
        url="http://example.invalid/cached.zip",
        sha256=hashlib.sha256(payload).hexdigest(),
        target_subdir="cached",
    )

    # Any attempt to reach the network is recorded and fails the test.
    attempts = []

    def fail_if_called(*args, **kwargs):
        attempts.append(args)
        raise AssertionError("download should have been skipped")

    monkeypatch.setattr("urllib.request.urlretrieve", fail_if_called)

    assert download_dataset(spec, cached) == cached
    assert not attempts