test(e2e): add SHA256-verified dataset downloader + EuRoC registry entry

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Yuzviak
2026-04-16 21:51:06 +03:00
parent 95accb8f7a
commit 669d8e5653
3 changed files with 152 additions and 0 deletions
+68
View File
@@ -0,0 +1,68 @@
"""Dataset downloader — registry + SHA256-verified fetch."""
from __future__ import annotations
import hashlib
import urllib.request
from dataclasses import dataclass
from pathlib import Path
@dataclass(frozen=True)
class DatasetSpec:
    """Immutable description of one downloadable dataset archive."""

    # Address the archive is fetched from.
    url: str
    # Expected SHA256 hex digest of the downloaded file.
    sha256: str
    # Presumably the directory (relative to a dataset root) the data lives
    # in; not consumed by the code in this module's visible part — confirm
    # against callers.
    target_subdir: str
    # Presumably signals that the archive should be extracted after
    # download; nothing visible here acts on it — confirm against callers.
    unpack: bool = True
# NOTE ON SHA256 VALUES BELOW:
# The EuRoC MH_01 hash is a placeholder of 64 zeros. Before the first real
# download the engineer MUST:
# 1. Manually fetch the zip:
# curl -L <url from spec> -o /tmp/MH_01_easy.zip
# 2. Compute its hash:
# sha256sum /tmp/MH_01_easy.zip
# 3. Replace the placeholder here with the real hex string.
# 4. Commit the updated hash alongside the first real test run.
# Leaving a length-valid (64-char) placeholder keeps the registry well-formed
# and lets the download function refuse to keep any file that doesn't match,
# so the placeholder state fails loudly rather than silently accepting.
# Known datasets, keyed by a short identifier.
DATASET_REGISTRY: dict[str, DatasetSpec] = {
    # EuRoC MAV dataset, machine hall sequence MH_01_easy.
    "euroc_mh01": DatasetSpec(
        url=(
            "http://robotics.ethz.ch/~asl-datasets"
            "/ijrr_euroc_mav_dataset/machine_hall"
            "/MH_01_easy/MH_01_easy.zip"
        ),
        sha256="0" * 64,  # placeholder — see note above
        target_subdir="euroc/MH_01",
    ),
}
def verify_sha256(path: Path, expected_hex: str) -> bool:
    """Return True iff the SHA256 digest of `path` matches `expected_hex`.

    The file is hashed in fixed-size chunks, so it is never loaded into
    memory as a whole. `expected_hex` is compared case-insensitively and
    with surrounding whitespace ignored, so a digest published in upper
    case or pasted with a trailing newline still matches.

    Raises whatever `path.open` raises (e.g. FileNotFoundError) when the
    file cannot be read.
    """
    digest = hashlib.sha256()
    with path.open("rb") as fh:
        # Read 1 MiB at a time until read() returns the empty sentinel.
        for chunk in iter(lambda: fh.read(1024 * 1024), b""):
            digest.update(chunk)
    # hexdigest() is always lower-case; normalise the expected value to match.
    return digest.hexdigest() == expected_hex.strip().lower()
def download_dataset(spec: DatasetSpec, dest: Path) -> Path:
    """Download `spec.url` to `dest` if not already present and valid.

    If `dest` already exists and its SHA256 matches `spec.sha256`, return
    immediately without touching the network. Otherwise download to a
    sibling `.part` file, verify it, and only then move it onto `dest`, so
    `dest` never holds unverified content.

    Returns `dest` on success.

    Raises RuntimeError on hash mismatch; the bad download is deleted and no
    file is left at `dest`. Errors raised by the download itself propagate
    unchanged, and the partial `.part` file is removed in that case too.
    """
    if dest.exists() and verify_sha256(dest, spec.sha256):
        return dest

    dest.parent.mkdir(parents=True, exist_ok=True)
    tmp = dest.with_suffix(dest.suffix + ".part")
    try:
        urllib.request.urlretrieve(spec.url, tmp)
        if not verify_sha256(tmp, spec.sha256):
            # A file already sitting at `dest` failed verification above;
            # remove it so a mismatch never leaves anything at `dest`.
            if dest.exists():
                dest.unlink()
            raise RuntimeError(f"SHA256 mismatch after download of {spec.url}")
        # replace() overwrites an existing target on every platform, whereas
        # rename() fails on Windows when `dest` already exists.
        tmp.replace(dest)
    finally:
        # No-op after a successful replace(); otherwise drops the partial or
        # mismatching download.
        if tmp.exists():
            tmp.unlink()
    return dest