mirror of
https://github.com/azaion/gps-denied-onboard.git
synced 2026-06-21 11:11:12 +00:00
ba70381346
ci/woodpecker/push/02-build-push Pipeline failed
- Changed paths in documentation and configuration files to reflect the new naming convention for the NetVLAD model, transitioning from `models/netvlad/netvlad.pt` to `models/net_vlad/net_vlad.pt`. - Updated the `.gitignore` to include additional file types and directories related to input data and locally-generated evidence frames. - Removed the old NetVLAD checkpoint file as part of the transition to the new naming scheme. These changes ensure consistency across the project and improve the management of generated files.
161 lines
5.4 KiB
Python
161 lines
5.4 KiB
Python
#!/usr/bin/env python3
|
|
"""AZ-965 — generate a NetVLAD-VGG16 PyTorch state_dict checkpoint.
|
|
|
|
Pipeline-integration checkpoint for the AZ-839 / AZ-840 e2e fixture.
|
|
Composition:
|
|
|
|
* **Encoder**: ``torchvision.models.vgg16(weights="IMAGENET1K_V1")``
|
|
features (BSD-licensed public weights). Layers ``[:-2]`` are loaded
|
|
into the project's ``_NetVladVgg16.encoder`` slot.
|
|
* **NetVLAD pool**: ``pool.conv`` + ``pool.centroids`` are initialised
|
|
deterministically from ``torch.manual_seed(0)`` — UNTRAINED for
|
|
retrieval; the architecture-default constructor's distribution is
|
|
what we ship.
|
|
* **PCA**: ``pca.weight`` + ``pca.bias`` likewise random-init via the
|
|
architecture-default constructor — UNTRAINED.
|
|
|
|
Honest scope:
|
|
|
|
* The encoder produces real ImageNet-pretrained features and is a
|
|
legitimate ImageNet-trained VGG16 backbone.
|
|
* The NetVLAD pool + PCA tail are NOT trained for retrieval. The
|
|
resulting embeddings are essentially random projections of VGG16
|
|
features. The c10 compile + c2 strategy will instantiate and run,
|
|
but retrieval results will be effectively random.
|
|
* This unblocks the AZ-840 orchestrator's empty-backbones SKIP gate
|
|
so the next gate (likely ESKF divergence under garbage retrievals)
|
|
can surface as a separate, named failure for follow-up work.
|
|
|
|
Reproduce: ``python scripts/mk_netvlad_checkpoint.py``.
|
|
|
|
License: torchvision weights are BSD-3-Clause; this script and the
|
|
generated random NetVLAD tail are project-owned. Full provenance in
|
|
``_docs/03_ip_attribution/netvlad.md``.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import hashlib
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import torch
|
|
import torchvision
|
|
|
|
# Repository root: two levels up from this file (the script lives in a
# sub-directory of the checkout).
_REPO_ROOT = Path(__file__).resolve().parent.parent

# Prepend ``<repo>/src`` to the import path, unless it is already listed,
# so the ``gps_denied_onboard`` import below resolves from a source checkout.
if str(_REPO_ROOT.joinpath("src")) not in sys.path:
    sys.path.insert(0, str(_REPO_ROOT.joinpath("src")))
|
|
|
|
from gps_denied_onboard.components.c2_vpr._net_vlad_architecture import ( # noqa: E402
|
|
DEFAULT_DESCRIPTOR_DIM,
|
|
DEFAULT_ENCODER_DIM,
|
|
DEFAULT_NUM_CLUSTERS,
|
|
make_net_vlad_vgg16,
|
|
)
|
|
|
|
|
|
# Default destination of the generated checkpoint inside the repository tree.
_DEFAULT_OUTPUT = _REPO_ROOT.joinpath("models", "net_vlad", "net_vlad.pt")
# Value handed to ``torch.manual_seed`` before the model is constructed, so
# the randomly initialised (untrained) layers are reproducible run to run.
_SEED = 0
|
|
|
|
|
|
def _parse_args() -> argparse.Namespace:
    """Parse the command-line options of the checkpoint generator.

    Returns:
        Namespace with ``output`` (a ``Path``) plus the integer options
        ``num_clusters``, ``encoder_dim`` and ``descriptor_dim``. Every
        option is optional and falls back to its module-level /
        architecture default.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring is pre-formatted (bullets, indentation); the
        # default formatter would re-wrap it into a single paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    # Defaults are rendered with argparse's own ``%(default)s`` placeholder.
    # Help strings are %-formatted by argparse, so interpolating a path into
    # them directly would break ``--help`` if the path contained a ``%``.
    parser.add_argument(
        "--output",
        type=Path,
        default=_DEFAULT_OUTPUT,
        help="Output .pt path (default: %(default)s)",
    )
    parser.add_argument(
        "--num-clusters",
        type=int,
        default=DEFAULT_NUM_CLUSTERS,
        help="num_clusters passed to make_net_vlad_vgg16 (default: %(default)s)",
    )
    parser.add_argument(
        "--encoder-dim",
        type=int,
        default=DEFAULT_ENCODER_DIM,
        help=(
            "encoder_dim passed to make_net_vlad_vgg16; only 512 is "
            "supported (default: %(default)s)"
        ),
    )
    parser.add_argument(
        "--descriptor-dim",
        type=int,
        default=DEFAULT_DESCRIPTOR_DIM,
        help="descriptor_dim passed to make_net_vlad_vgg16 (default: %(default)s)",
    )
    return parser.parse_args()
|
|
|
|
|
|
def _load_imagenet_vgg16_features_state(encoder_dim: int) -> dict[str, torch.Tensor]:
    """Return the state_dict slice for the project's encoder slot.

    The project's ``_NetVladVgg16.encoder`` is
    ``nn.Sequential(*list(vgg.features.children())[:-2])`` — everything in
    ``torchvision.models.vgg16().features`` except the last two layers
    (the trailing ReLU + MaxPool2d). This function loads the full
    ``vgg16(weights="IMAGENET1K_V1")``, takes its ``.features``, applies
    the same slicing, and prefixes every state_dict key with ``encoder.``.

    Args:
        encoder_dim: Encoder channel width requested by the caller. Only
            ``512`` is accepted.

    Returns:
        Mapping from ``encoder.<key>`` to a detached clone of the
        corresponding torchvision tensor.

    Raises:
        SystemExit: If ``encoder_dim`` is not 512.
    """
    # Validate first: constructing the pretrained VGG16 is the expensive
    # step (it may have to fetch the weights), so an unsupported width must
    # be rejected before any of that work starts.
    if encoder_dim != 512:
        raise SystemExit(
            "Only encoder_dim=512 is supported (VGG16 conv5_3 produces "
            f"512 channels); got {encoder_dim}"
        )

    vgg = torchvision.models.vgg16(weights="IMAGENET1K_V1")
    encoder_features = torch.nn.Sequential(*list(vgg.features.children())[:-2])
    # Clone so the returned tensors do not alias the torchvision module.
    return {
        f"encoder.{key}": value.detach().clone()
        for key, value in encoder_features.state_dict().items()
    }
|
|
|
|
|
|
def main() -> int:
    """Build the NetVLAD-VGG16 checkpoint, save it, and print a summary.

    Steps, in order: parse the CLI options, seed torch, construct the
    project model, overwrite its ``encoder.*`` tensors with the ImageNet
    VGG16 weights, save the merged state_dict to ``--output`` and report
    the file size, SHA-256 and key counts on stdout.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: If a torchvision encoder key is absent from the
            project architecture, or a tensor shape differs.
    """
    opts = _parse_args()

    # Seed before construction: every tensor that is not replaced below
    # keeps the values drawn by the architecture's constructor.
    torch.manual_seed(_SEED)
    net = make_net_vlad_vgg16(
        num_clusters=opts.num_clusters,
        encoder_dim=opts.encoder_dim,
        descriptor_dim=opts.descriptor_dim,
    )
    state = net.state_dict()
    pretrained = _load_imagenet_vgg16_features_state(opts.encoder_dim)

    # Every pretrained key must already exist in the project's state_dict.
    unknown = [name for name in pretrained if name not in state]
    if unknown:
        raise SystemExit(
            f"Encoder-key mismatch — torchvision VGG16 produced keys not "
            f"present in project arch: {unknown[:5]}..."
        )

    # Swap in the pretrained tensors, refusing any shape disagreement.
    for name, weights in pretrained.items():
        expected = state[name]
        if weights.shape != expected.shape:
            raise SystemExit(
                f"Encoder shape mismatch at {name}: torchvision="
                f"{tuple(weights.shape)} project={tuple(expected.shape)}"
            )
        state[name] = weights

    # Strict load: the merged dict has to match the architecture's keys
    # exactly before it is written out.
    net.load_state_dict(state, strict=True)

    destination = opts.output
    destination.parent.mkdir(parents=True, exist_ok=True)
    torch.save(state, destination)

    # Re-read the written file so size and digest describe the bytes on disk.
    payload = destination.read_bytes()
    digest = hashlib.sha256(payload).hexdigest()
    size_mib = len(payload) / (1024 * 1024)
    tail_key_count = len(state) - len(pretrained)

    summary = (
        f"[mk_netvlad_checkpoint] wrote {destination} "
        f"size={size_mib:.1f} MiB sha256={digest}",
        f" num_clusters={opts.num_clusters} encoder_dim={opts.encoder_dim} "
        f"descriptor_dim={opts.descriptor_dim}",
        f" encoder: torchvision VGG16 IMAGENET1K_V1 ({len(pretrained)} keys)",
        f" pool/pca: random-init via torch.manual_seed({_SEED}) "
        f"({tail_key_count} keys)",
    )
    for line in summary:
        print(line)
    return 0
|
|
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|