mirror of
https://github.com/azaion/gps-denied-onboard.git
synced 2026-06-21 05:41:13 +00:00
97f5f9793c
AZ-965 ships the NetVLAD .pt checkpoint that clears the AZ-839
empty-c10_provisioning.backbones SKIP gate. Pipeline-integration
scaffold — encoder is real, NetVLAD tail is honestly labelled as
untrained.
Composition:
* Encoder (26 keys, encoder.0..encoder.28): torchvision
vgg16(weights=IMAGENET1K_V1) features [:-2], BSD-3-Clause.
Real ImageNet-pretrained VGG16 conv stack.
* NetVLAD pool + PCA tail (5 keys: pool.conv.{weight,bias},
pool.centroids, pca.{weight,bias}): random-init via
torch.manual_seed(0). NOT trained for visual place recognition.
Total: 149,002,112 params (568.4 MiB fp32, sha256=745c6f29...).
Round-trip verified locally: torch.load(weights_only=True) +
load_state_dict(strict=True) succeed; forward(1,3,480,480) emits
{'vlad_descriptor': (1, 4096) fp32} — matches NetVladStrategy
contract per net_vlad.py:247-251.
Two material discoveries documented in the AZ-965 spec:
1. The NetVLAD-VGG16 architecture already lives in repo at
src/gps_denied_onboard/components/c2_vpr/_net_vlad_architecture.py
— we instantiate it and save a state_dict rather than sourcing one externally.
2. The PyTorch FP16 runtime expects a .pt state_dict (NOT .onnx).
BackboneConfig.onnx_path is a misnomer for NetVLAD: per AZ-321
design + c2_vpr description.md §1, NetVLAD runs on PyTorch FP16
(NOT TRT). compile_engine is a no-op sha256+path wrap;
deserialize_engine does torch.load(weights_only=True) +
load_state_dict(strict=True).
User skipped Option A/B/C/D/E question — judgment call = Option B
(IMAGENET1K_V1 + random tail) per "use judgment, don't block":
* Option A (Nanne translation) was 5-8 SP, above the 5 SP budget.
* Option B is 3 SP, fits the budget, honestly labelled.
* Option C (pure random) was borderline-dishonest per Real Results.
Files:
* scripts/mk_netvlad_checkpoint.py — deterministic generator.
* models/netvlad/netvlad.pt — 568 MiB, via git-lfs (.gitattributes
extended for models/**/*.pt, *.onnx, *.engine).
* configs/operator_replay.yaml — c2_vpr + c10_provisioning blocks
populated; the field literally named onnx_path actually points
at the .pt for NetVLAD per the runtime semantics noted above.
* docker-compose.test.jetson.yml — ./models:/opt/models:ro bind
mount added to e2e-runner.
* _docs/03_ip_attribution/netvlad.md — provenance, licence, how-to-
reproduce, honest scope statement ("NOT a real-retrieval
checkpoint; ESKF divergence under garbage retrievals is the
expected next gate").
* _docs/02_tasks/todo/AZ-965_netvlad_onnx_backbone_provisioning.md
— rewritten to reflect the .pt-not-.onnx + Option B discoveries.
Tier-2 verification follows in a separate commit after the harness
run confirms the empty-backbones SKIP gate clears.
Out of scope (filed as follow-ups):
* Real-retrieval NetVLAD weights (Nanne Pittsburgh-30k translation
or internal team checkpoint) — separate ticket.
* AZ-840 orchestrator PASSing end-to-end (depends on retrieval
quality + ESKF stability).
* AZ-963 60s smoke ESKF divergence (independent chain).
Co-authored-by: Cursor <cursoragent@cursor.com>
161 lines
5.4 KiB
Python
161 lines
5.4 KiB
Python
#!/usr/bin/env python3
|
|
"""AZ-965 — generate a NetVLAD-VGG16 PyTorch state_dict checkpoint.
|
|
|
|
Pipeline-integration checkpoint for the AZ-839 / AZ-840 e2e fixture.
|
|
Composition:
|
|
|
|
* **Encoder**: ``torchvision.models.vgg16(weights="IMAGENET1K_V1")``
|
|
features (BSD-licensed public weights). Layers ``[:-2]`` are loaded
|
|
into the project's ``_NetVladVgg16.encoder`` slot.
|
|
* **NetVLAD pool**: ``pool.conv`` + ``pool.centroids`` are initialised
|
|
deterministically from ``torch.manual_seed(0)`` — UNTRAINED for
|
|
retrieval; the architecture-default constructor's distribution is
|
|
what we ship.
|
|
* **PCA**: ``pca.weight`` + ``pca.bias`` likewise random-init via the
|
|
architecture-default constructor — UNTRAINED.
|
|
|
|
Honest scope:
|
|
|
|
* The encoder produces real ImageNet-pretrained features and is a
|
|
legitimate ImageNet-trained VGG16 backbone.
|
|
* The NetVLAD pool + PCA tail are NOT trained for retrieval. The
|
|
resulting embeddings are essentially random projections of VGG16
|
|
features. The c10 compile + c2 strategy will instantiate and run,
|
|
but retrieval results will be effectively random.
|
|
* This unblocks the AZ-840 orchestrator's empty-backbones SKIP gate
|
|
so the next gate (likely ESKF divergence under garbage retrievals)
|
|
can surface as a separate, named failure for follow-up work.
|
|
|
|
Reproduce: ``python scripts/mk_netvlad_checkpoint.py``.
|
|
|
|
License: torchvision weights are BSD-3-Clause; this script and the
|
|
generated random NetVLAD tail are project-owned. Full provenance in
|
|
``_docs/03_ip_attribution/netvlad.md``.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import hashlib
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import torch
|
|
import torchvision
|
|
|
|
# Repository root: the directory two levels above this script file.
_REPO_ROOT = Path(__file__).resolve().parents[1]

# Put the in-repo ``src`` tree at the front of ``sys.path`` so the project
# package imports below resolve when the script is run directly; skip the
# insert when the entry is already present.
if str(_REPO_ROOT.joinpath("src")) not in sys.path:
    sys.path.insert(0, str(_REPO_ROOT.joinpath("src")))
|
|
|
|
from gps_denied_onboard.components.c2_vpr._net_vlad_architecture import ( # noqa: E402
|
|
DEFAULT_DESCRIPTOR_DIM,
|
|
DEFAULT_ENCODER_DIM,
|
|
DEFAULT_NUM_CLUSTERS,
|
|
make_net_vlad_vgg16,
|
|
)
|
|
|
|
|
|
# Default destination of the generated checkpoint, relative to the repo root.
_DEFAULT_OUTPUT = _REPO_ROOT.joinpath("models", "netvlad", "netvlad.pt")
# Seed passed to ``torch.manual_seed`` before the model is constructed.
_SEED = 0
|
|
|
|
|
|
def _parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse the process arguments.

    Options:
        --output: destination ``.pt`` path (a ``Path``); defaults to
            ``_DEFAULT_OUTPUT``.
        --num-clusters, --encoder-dim, --descriptor-dim: integer sizes,
            each defaulting to the matching ``DEFAULT_*`` constant imported
            from the project's NetVLAD architecture module.

    Returns:
        The populated ``argparse.Namespace``.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--output",
        type=Path,
        default=_DEFAULT_OUTPUT,
        help=f"Output .pt path (default: {_DEFAULT_OUTPUT})",
    )
    # The three size options share the same shape: an int with a default.
    int_options = (
        ("--num-clusters", DEFAULT_NUM_CLUSTERS),
        ("--encoder-dim", DEFAULT_ENCODER_DIM),
        ("--descriptor-dim", DEFAULT_DESCRIPTOR_DIM),
    )
    for flag, fallback in int_options:
        cli.add_argument(flag, type=int, default=fallback)
    return cli.parse_args()
|
|
|
|
|
|
def _load_imagenet_vgg16_features_state(encoder_dim: int) -> dict[str, torch.Tensor]:
    """Return the ImageNet VGG16 state_dict slice for the project's encoder slot.

    The project's ``_NetVladVgg16.encoder`` is
    ``nn.Sequential(*list(vgg.features.children())[:-2])`` — everything in
    ``torchvision.models.vgg16().features`` except the last two layers (the
    trailing ReLU + MaxPool2d). This function loads
    ``vgg16(weights="IMAGENET1K_V1")``, applies the same slicing to its
    ``.features``, and prefixes every state_dict key with ``encoder.``.

    Args:
        encoder_dim: Requested encoder channel count. Only 512 is accepted.

    Returns:
        A dict mapping ``encoder.<key>`` to a detached clone of the
        corresponding torchvision tensor.

    Raises:
        SystemExit: If ``encoder_dim`` is not 512.
    """
    # Validate first: building the pretrained model is the expensive step
    # (torchvision may need to fetch the weights), so reject an unsupported
    # argument before doing any of that work.
    if encoder_dim != 512:
        raise SystemExit(
            f"Only encoder_dim=512 is supported (VGG16 conv5_3 produces "
            f"512 channels); got {encoder_dim}"
        )
    vgg = torchvision.models.vgg16(weights="IMAGENET1K_V1")
    encoder_features = torch.nn.Sequential(*list(vgg.features.children())[:-2])
    # Clone so the returned tensors do not alias the torchvision model's storage.
    return {
        f"encoder.{key}": value.detach().clone()
        for key, value in encoder_features.state_dict().items()
    }
|
|
|
|
|
|
def _file_sha256(path: Path, chunk_size: int = 1024 * 1024) -> str:
    """Return the hex SHA-256 of *path*, reading it in ``chunk_size``-byte pieces."""
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        for chunk in iter(lambda: stream.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def main() -> int:
    """Generate the checkpoint, write it to ``--output`` and print a summary.

    Steps:
        1. Seed torch with ``_SEED`` and build the project model, so the
           non-encoder tensors come from a reproducible random init.
        2. Overwrite every encoder tensor in the model's state_dict with the
           torchvision ImageNet VGG16 tensor of the same key.
        3. Confirm the merged state_dict loads with ``strict=True``, save it,
           and print path, size, SHA-256 and key counts.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: If the torchvision encoder keys and the project's
            ``encoder.*`` keys do not match one-to-one, or if any tensor
            shape differs.
    """
    args = _parse_args()
    # Seed immediately before construction so the random-init tail is
    # reproducible from run to run.
    torch.manual_seed(_SEED)
    model = make_net_vlad_vgg16(
        num_clusters=args.num_clusters,
        encoder_dim=args.encoder_dim,
        descriptor_dim=args.descriptor_dim,
    )
    full_state = model.state_dict()
    imagenet_encoder = _load_imagenet_vgg16_features_state(args.encoder_dim)

    missing = [k for k in imagenet_encoder if k not in full_state]
    if missing:
        raise SystemExit(
            f"Encoder-key mismatch — torchvision VGG16 produced keys not "
            f"present in project arch: {missing[:5]}..."
        )
    # Reverse direction: a project encoder key with no torchvision
    # counterpart would silently keep its random init and be miscounted as
    # part of the pool/pca tail in the summary below.
    uncovered = [
        k
        for k in full_state
        if k.startswith("encoder.") and k not in imagenet_encoder
    ]
    if uncovered:
        raise SystemExit(
            f"Encoder-key mismatch — project arch has encoder keys not "
            f"provided by torchvision VGG16: {uncovered[:5]}..."
        )

    for key, tensor in imagenet_encoder.items():
        target = full_state[key]
        if tensor.shape != target.shape:
            raise SystemExit(
                f"Encoder shape mismatch at {key}: torchvision="
                f"{tuple(tensor.shape)} project={tuple(target.shape)}"
            )
        full_state[key] = tensor

    # Round-trip through the model to prove the merged dict is loadable
    # with strict key matching before it is written to disk.
    model.load_state_dict(full_state, strict=True)
    args.output.parent.mkdir(parents=True, exist_ok=True)
    torch.save(full_state, args.output)

    # Hash in chunks and take the size from the filesystem rather than
    # reading the whole checkpoint into memory.
    size_bytes = args.output.stat().st_size
    sha256 = _file_sha256(args.output)
    print(
        f"[mk_netvlad_checkpoint] wrote {args.output} "
        f"size={size_bytes / (1024 * 1024):.1f} MiB sha256={sha256}"
    )
    print(
        f"  num_clusters={args.num_clusters} encoder_dim={args.encoder_dim} "
        f"descriptor_dim={args.descriptor_dim}"
    )
    print(
        f"  encoder: torchvision VGG16 IMAGENET1K_V1 ({len(imagenet_encoder)} keys)"
    )
    print(
        f"  pool/pca: random-init via torch.manual_seed({_SEED}) "
        f"({len(full_state) - len(imagenet_encoder)} keys)"
    )
    return 0
|
|
|
|
|
|
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|