diff --git a/scripts/download_dataset.py b/scripts/download_dataset.py index 1adfff6..7846e86 100644 --- a/scripts/download_dataset.py +++ b/scripts/download_dataset.py @@ -1,4 +1,10 @@ -"""CLI: python scripts/download_dataset.py euroc_mh01""" +"""CLI: python scripts/download_dataset.py DATASET_NAME + +Registered datasets: see DATASET_REGISTRY in gps_denied.testing.download. +Entries with an empty URL are manual-download-only (e.g. EuRoC via ETH +Research Collection, VPAIR via a Zenodo form) — this script prints the +instructions and exits instead of guessing a download URL. +""" import sys from pathlib import Path @@ -14,8 +20,23 @@ def main(argv: list[str]) -> int: name = argv[1] if name not in DATASET_REGISTRY: print(f"unknown dataset: {name}", file=sys.stderr) + print(f"available: {', '.join(DATASET_REGISTRY)}", file=sys.stderr) return 2 spec = DATASET_REGISTRY[name] + if not spec.url: + print( + f"Dataset '{name}' has no automated download URL.", + file=sys.stderr, + ) + print( + "Fetch manually, then place the archive so its SHA256 matches:\n" + f" expected sha256: {spec.sha256}\n" + f" expected path subdir under datasets/: {spec.target_subdir}\n" + "See DATASET_REGISTRY comments in src/gps_denied/testing/download.py " + "for per-dataset instructions.", + file=sys.stderr, + ) + return 3 root = Path("datasets") / spec.target_subdir dest = root / Path(spec.url).name print(f"→ {dest}") diff --git a/src/gps_denied/testing/datasets/euroc.py b/src/gps_denied/testing/datasets/euroc.py index cd294b6..379fe30 100644 --- a/src/gps_denied/testing/datasets/euroc.py +++ b/src/gps_denied/testing/datasets/euroc.py @@ -33,7 +33,9 @@ class EuRoCAdapter(DatasetAdapter): raise DatasetNotAvailableError( f"EuRoC sequence not found at {self._root} " f"(expected {self._root}/mav0/). " - "Run `python scripts/download_dataset.py euroc_mh01` first."
+ "Fetch the Machine Hall bundle from ETH Research Collection " + "(DOI 10.3929/ethz-b-000690084), then unpack the inner " + "MH_0N_easy.zip of interest into this directory." ) @property diff --git a/src/gps_denied/testing/download.py b/src/gps_denied/testing/download.py index 1ad8467..d51e4ff 100644 --- a/src/gps_denied/testing/download.py +++ b/src/gps_denied/testing/download.py @@ -16,26 +16,23 @@ class DatasetSpec: unpack: bool = True -# NOTE ON SHA256 VALUES BELOW: -# The EuRoC MH_01 hash is a placeholder of 64 zeros. Before the first real -# download the engineer MUST: -# 1. Manually fetch the zip: -# curl -L -o /tmp/MH_01_easy.zip -# 2. Compute its hash: -# sha256sum /tmp/MH_01_easy.zip -# 3. Replace the placeholder here with the real hex string. -# 4. Commit the updated hash alongside the first real test run. -# Leaving a length-valid (64-char) placeholder keeps the registry well-formed -# and lets the download function refuse to keep any file that doesn't match, -# so the placeholder state fails loudly rather than silently accepting. +# REGISTRY NOTES: +# Entries with url="" are manual-download-only. The ETH Research Collection +# (DOI 10.3929/ethz-b-000690084) gates downloads behind a DSpace UI that +# doesn't expose a stable direct URL, so the bundle has to be fetched by +# hand and placed beside the project. SHA256 remains authoritative: the +# download helper refuses anything that doesn't match. DATASET_REGISTRY: dict[str, DatasetSpec] = { - "euroc_mh01": DatasetSpec( - url=( - "http://robotics.ethz.ch/~asl-datasets/ijrr_euroc_mav_dataset" - "/machine_hall/MH_01_easy/MH_01_easy.zip" - ), - sha256="0" * 64, # placeholder — see note above - target_subdir="euroc/MH_01", + "euroc_machine_hall": DatasetSpec( + # 12.6 GB bundle containing MH_01 … MH_05 (each as inner .zip + .bag). 
+ # Download from ETH Research Collection: + # https://doi.org/10.3929/ethz-b-000690084 + # After fetching, unpack the inner MH_0N_easy.zip of interest into + # datasets/euroc/MH_0N/ so the adapter finds mav0/. + url="", # manual download — see comment above + sha256="5ed7d07903f8d19b6c8808e2ae8a0872b281f6e34ef5497023b8ac58c3de0f6f", + target_subdir="euroc", + unpack=False, # the bundle itself is not unpacked end-to-end; see README ), "vpair_sample": DatasetSpec( url="", # manual download only — see Zenodo link on diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index 9a680fc..becc6ff 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -14,7 +14,9 @@ def euroc_mh01_root() -> Path: if not (root / "mav0").is_dir(): pytest.skip( f"EuRoC MH_01 not present at {root}. " - "Run `python scripts/download_dataset.py euroc_mh01` to fetch it." + "Fetch the Machine Hall bundle from ETH Research Collection " + "(DOI 10.3929/ethz-b-000690084), unpack the inner MH_01_easy.zip " + f"into {root}/ so that {root}/mav0/ exists." ) return root diff --git a/tests/e2e/test_download.py b/tests/e2e/test_download.py index 10b52d4..3c32d70 100644 --- a/tests/e2e/test_download.py +++ b/tests/e2e/test_download.py @@ -11,12 +11,15 @@ from gps_denied.testing.download import ( ) -def test_registry_has_euroc(): - assert "euroc_mh01" in DATASET_REGISTRY - spec = DATASET_REGISTRY["euroc_mh01"] +def test_registry_has_euroc_machine_hall(): + assert "euroc_machine_hall" in DATASET_REGISTRY + spec = DATASET_REGISTRY["euroc_machine_hall"] assert isinstance(spec, DatasetSpec) - assert spec.url.startswith("http") + # URL intentionally empty — ETH Research Collection gates downloads behind + # a DSpace UI without a stable direct URL. Registry records SHA256 only. + assert spec.url == "" assert len(spec.sha256) == 64 + assert spec.sha256 != "0" * 64 # real hash, not the placeholder def test_registry_has_vpair_sample():