test(e2e): rename registry entry to euroc_machine_hall with real SHA256

The prior registry entry was speculative: ``euroc_mh01`` pointed at an
old ``robotics.ethz.ch`` URL that is no longer reachable (TCP timeout).
The dataset moved to ETH Research Collection (DOI 10.3929/ethz-b-000690084)
as a single 12.6 GB ``machine_hall.zip`` bundle containing MH_01…MH_05.
There's no stable direct download URL — DSpace gates downloads behind a UI —
so:

- Renamed entry: ``euroc_mh01`` → ``euroc_machine_hall`` (matches the
  actual artifact).
- SHA256 set to the real bundle hash 5ed7d07…
- URL left empty (same pattern as ``vpair_sample``); the CLI now
  exits 3 and prints fetch instructions for empty-URL entries instead
  of crashing on ``urllib.request.urlretrieve("")``.
- Adapter ``DatasetNotAvailableError`` message and conftest skip-reason
  updated to tell engineers how to fetch/unpack manually.
- ``test_registry_has_euroc_machine_hall`` pin test replaces the old
  pin; asserts real hash (not the ``"0"*64`` placeholder).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Yuzviak
2026-04-17 17:48:01 +03:00
committed by Maksym Yuzviak
parent f2f278bc09
commit b57187e1b8
5 changed files with 51 additions and 26 deletions
+22 -1
View File
@@ -1,4 +1,10 @@
"""CLI: python scripts/download_dataset.py euroc_mh01"""
"""CLI: python scripts/download_dataset.py <dataset_name>
Registered datasets: see DATASET_REGISTRY in gps_denied.testing.download.
Entries with an empty URL are manual-download-only (e.g. EuRoC via ETH
Research Collection, VPAIR via a Zenodo form) — this script prints the
instructions and exits instead of guessing a download URL.
"""
import sys
from pathlib import Path
@@ -14,8 +20,23 @@ def main(argv: list[str]) -> int:
name = argv[1]
if name not in DATASET_REGISTRY:
print(f"unknown dataset: {name}", file=sys.stderr)
print(f"available: {', '.join(DATASET_REGISTRY)}", file=sys.stderr)
return 2
spec = DATASET_REGISTRY[name]
if not spec.url:
print(
f"Dataset '{name}' has no automated download URL.",
file=sys.stderr,
)
print(
"Fetch manually, then place the archive so its SHA256 matches:\n"
f" expected sha256: {spec.sha256}\n"
f" expected path subdir under datasets/: {spec.target_subdir}\n"
"See DATASET_REGISTRY comments in src/gps_denied/testing/download.py "
"for per-dataset instructions.",
file=sys.stderr,
)
return 3
root = Path("datasets") / spec.target_subdir
dest = root / Path(spec.url).name
print(f"{dest}")
+3 -1
View File
@@ -33,7 +33,9 @@ class EuRoCAdapter(DatasetAdapter):
raise DatasetNotAvailableError(
f"EuRoC sequence not found at {self._root} "
f"(expected {self._root}/mav0/). "
"Run `python scripts/download_dataset.py euroc_mh01` first."
"Fetch the Machine Hall bundle from ETH Research Collection "
"(DOI 10.3929/ethz-b-000690084), then unpack the inner "
"MH_0N_easy.zip of interest into this directory."
)
@property
+16 -19
View File
@@ -16,26 +16,23 @@ class DatasetSpec:
unpack: bool = True
# NOTE ON SHA256 VALUES BELOW:
# The EuRoC MH_01 hash is a placeholder of 64 zeros. Before the first real
# download the engineer MUST:
# 1. Manually fetch the zip:
# curl -L <url from spec> -o /tmp/MH_01_easy.zip
# 2. Compute its hash:
# sha256sum /tmp/MH_01_easy.zip
# 3. Replace the placeholder here with the real hex string.
# 4. Commit the updated hash alongside the first real test run.
# Leaving a length-valid (64-char) placeholder keeps the registry well-formed
# and lets the download function refuse to keep any file that doesn't match,
# so the placeholder state fails loudly rather than silently accepting.
# REGISTRY NOTES:
# Entries with url="" are manual-download-only. The ETH Research Collection
# (DOI 10.3929/ethz-b-000690084) gates downloads behind a DSpace UI that
# doesn't expose a stable direct URL, so the bundle has to be fetched by
# hand and placed under datasets/<target_subdir>/. SHA256 remains
# authoritative: the download helper refuses anything that doesn't match.
DATASET_REGISTRY: dict[str, DatasetSpec] = {
"euroc_mh01": DatasetSpec(
url=(
"http://robotics.ethz.ch/~asl-datasets/ijrr_euroc_mav_dataset"
"/machine_hall/MH_01_easy/MH_01_easy.zip"
),
sha256="0" * 64, # placeholder — see note above
target_subdir="euroc/MH_01",
"euroc_machine_hall": DatasetSpec(
# 12.6 GB bundle containing MH_01 … MH_05 (each as inner .zip + .bag).
# Download from ETH Research Collection:
# https://doi.org/10.3929/ethz-b-000690084
# After fetching, unpack the inner MH_0N_easy.zip of interest into
# datasets/euroc/MH_0N/ so the adapter finds mav0/.
url="", # manual download — see comment above
sha256="5ed7d07903f8d19b6c8808e2ae8a0872b281f6e34ef5497023b8ac58c3de0f6f",
target_subdir="euroc",
unpack=False, # the bundle itself is not unpacked end-to-end; see README
),
"vpair_sample": DatasetSpec(
url="", # manual download only — see Zenodo link on
+3 -1
View File
@@ -14,7 +14,9 @@ def euroc_mh01_root() -> Path:
if not (root / "mav0").is_dir():
pytest.skip(
f"EuRoC MH_01 not present at {root}. "
"Run `python scripts/download_dataset.py euroc_mh01` to fetch it."
"Fetch the Machine Hall bundle from ETH Research Collection "
"(DOI 10.3929/ethz-b-000690084), unpack the inner MH_01_easy.zip "
f"into {root}/ so that {root}/mav0/ exists."
)
return root
+7 -4
View File
@@ -11,12 +11,15 @@ from gps_denied.testing.download import (
)
def test_registry_has_euroc():
assert "euroc_mh01" in DATASET_REGISTRY
spec = DATASET_REGISTRY["euroc_mh01"]
def test_registry_has_euroc_machine_hall():
assert "euroc_machine_hall" in DATASET_REGISTRY
spec = DATASET_REGISTRY["euroc_machine_hall"]
assert isinstance(spec, DatasetSpec)
assert spec.url.startswith("http")
# URL intentionally empty — ETH Research Collection gates downloads behind
# a DSpace UI without a stable direct URL. Registry records SHA256 only.
assert spec.url == ""
assert len(spec.sha256) == 64
assert spec.sha256 != "0" * 64 # real hash, not the placeholder
def test_registry_has_vpair_sample():