#!/usr/bin/env python3
"""AZ-965 — generate a NetVLAD-VGG16 PyTorch state_dict checkpoint.

Pipeline-integration checkpoint for the AZ-839 / AZ-840 e2e fixture.

Composition:

* **Encoder**: ``torchvision.models.vgg16(weights="IMAGENET1K_V1")``
  features (BSD-licensed public weights). Layers ``[:-2]`` are loaded
  into the project's ``_NetVladVgg16.encoder`` slot.
* **NetVLAD pool**: ``pool.conv`` + ``pool.centroids`` are initialised
  deterministically from ``torch.manual_seed(0)`` — UNTRAINED for
  retrieval; the architecture-default constructor's distribution is
  what we ship.
* **PCA**: ``pca.weight`` + ``pca.bias`` likewise random-init via the
  architecture-default constructor — UNTRAINED.

Honest scope:

* The encoder produces real ImageNet-pretrained features and is a
  legitimate ImageNet-trained VGG16 backbone.
* The NetVLAD pool + PCA tail are NOT trained for retrieval. The
  resulting embeddings are essentially random projections of VGG16
  features. The c10 compile + c2 strategy will instantiate and run,
  but retrieval results will be effectively random.
* This unblocks the AZ-840 orchestrator's empty-backbones SKIP gate so
  the next gate (likely ESKF divergence under garbage retrievals) can
  surface as a separate, named failure for follow-up work.

Reproduce: ``python scripts/mk_netvlad_checkpoint.py``.

License: torchvision weights are BSD-3-Clause; this script and the
generated random NetVLAD tail are project-owned. Full provenance in
``_docs/03_ip_attribution/netvlad.md``.
"""

from __future__ import annotations

import argparse
import hashlib
import sys
from pathlib import Path

import torch
import torchvision

_REPO_ROOT = Path(__file__).resolve().parent.parent
if str(_REPO_ROOT / "src") not in sys.path:
    sys.path.insert(0, str(_REPO_ROOT / "src"))

from gps_denied_onboard.components.c2_vpr._net_vlad_architecture import (  # noqa: E402
    DEFAULT_DESCRIPTOR_DIM,
    DEFAULT_ENCODER_DIM,
    DEFAULT_NUM_CLUSTERS,
    make_net_vlad_vgg16,
)

_DEFAULT_OUTPUT = _REPO_ROOT / "models" / "net_vlad" / "net_vlad.pt"
_SEED = 0

# Channel count of the VGG16 conv5_3 output; the only encoder width the
# torchvision weights can populate.
_SUPPORTED_ENCODER_DIM = 512

# Prefix under which the encoder weights live in the project state_dict.
_ENCODER_PREFIX = "encoder."

# Read size used when hashing the written checkpoint.
_HASH_CHUNK_BYTES = 1024 * 1024


def _parse_args() -> argparse.Namespace:
    """Parse the command-line options of the checkpoint generator.

    Returns:
        Namespace with ``output``, ``num_clusters``, ``encoder_dim`` and
        ``descriptor_dim``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring is pre-formatted (bullets, paragraphs); the
        # default formatter would re-wrap it into one unreadable paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=_DEFAULT_OUTPUT,
        help=f"Output .pt path (default: {_DEFAULT_OUTPUT})",
    )
    parser.add_argument(
        "--num-clusters",
        type=int,
        default=DEFAULT_NUM_CLUSTERS,
    )
    parser.add_argument(
        "--encoder-dim",
        type=int,
        default=DEFAULT_ENCODER_DIM,
    )
    parser.add_argument(
        "--descriptor-dim",
        type=int,
        default=DEFAULT_DESCRIPTOR_DIM,
    )
    return parser.parse_args()


def _require_supported_encoder_dim(encoder_dim: int) -> None:
    """Exit with an error unless *encoder_dim* matches the VGG16 backbone.

    Raises:
        SystemExit: if ``encoder_dim`` is not 512.
    """
    if encoder_dim != _SUPPORTED_ENCODER_DIM:
        raise SystemExit(
            f"Only encoder_dim={_SUPPORTED_ENCODER_DIM} is supported (VGG16 conv5_3 produces "
            f"{_SUPPORTED_ENCODER_DIM} channels); got {encoder_dim}"
        )


def _load_imagenet_vgg16_features_state(encoder_dim: int) -> dict[str, torch.Tensor]:
    """Return state_dict slice for the project's encoder slot.

    The project's ``_NetVladVgg16.encoder`` is
    ``nn.Sequential(*list(vgg.features.children())[:-2])`` — everything in
    ``torchvision.models.vgg16().features`` except the last two layers
    (the trailing ReLU + MaxPool2d). We load the full
    ``vgg16(weights="IMAGENET1K_V1")``, take its ``.features``, pass
    through the same slicing, and prefix the state_dict keys with
    ``encoder.``.

    Args:
        encoder_dim: Requested encoder width; must be 512.

    Returns:
        Mapping of ``encoder.<key>`` to a detached copy of the matching
        torchvision tensor.

    Raises:
        SystemExit: if ``encoder_dim`` is not 512. Checked before the
            weights are instantiated so an unsupported request fails
            without loading (or fetching) them.
    """
    _require_supported_encoder_dim(encoder_dim)

    vgg = torchvision.models.vgg16(weights="IMAGENET1K_V1")
    encoder_features = torch.nn.Sequential(*list(vgg.features.children())[:-2])
    return {
        f"{_ENCODER_PREFIX}{key}": value.detach().clone()
        for key, value in encoder_features.state_dict().items()
    }


def _sha256_of_file(path: Path) -> str:
    """Return the hex SHA-256 of *path*, read in fixed-size chunks."""
    digest = hashlib.sha256()
    with path.open("rb") as handle:
        for chunk in iter(lambda: handle.read(_HASH_CHUNK_BYTES), b""):
            digest.update(chunk)
    return digest.hexdigest()


def main() -> int:
    """Build the checkpoint, write it to disk and print a summary.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: on an unsupported ``--encoder-dim`` or when the
            torchvision encoder keys/shapes do not line up with the
            project architecture.
    """
    args = _parse_args()
    # Fail before building the model or touching the pretrained weights.
    _require_supported_encoder_dim(args.encoder_dim)

    # Seed first so the randomly initialised tensors are reproducible.
    torch.manual_seed(_SEED)
    model = make_net_vlad_vgg16(
        num_clusters=args.num_clusters,
        encoder_dim=args.encoder_dim,
        descriptor_dim=args.descriptor_dim,
    )
    full_state = model.state_dict()

    imagenet_encoder = _load_imagenet_vgg16_features_state(args.encoder_dim)

    missing = [k for k in imagenet_encoder if k not in full_state]
    if missing:
        raise SystemExit(
            f"Encoder-key mismatch — torchvision VGG16 produced keys not "
            f"present in project arch: {missing[:5]}..."
        )

    # Reverse direction: an encoder tensor the project defines but
    # torchvision does not supply would silently keep its random init while
    # the summary below reports an ImageNet encoder.
    unfilled = [
        k
        for k in full_state
        if k.startswith(_ENCODER_PREFIX) and k not in imagenet_encoder
    ]
    if unfilled:
        raise SystemExit(
            f"Encoder-key mismatch — project arch has encoder keys not "
            f"provided by torchvision VGG16: {unfilled[:5]}..."
        )

    for key, tensor in imagenet_encoder.items():
        target = full_state[key]
        if tensor.shape != target.shape:
            raise SystemExit(
                f"Encoder shape mismatch at {key}: torchvision="
                f"{tuple(tensor.shape)} project={tuple(target.shape)}"
            )
        full_state[key] = tensor

    # strict=True confirms the merged dict matches the model's keys exactly.
    model.load_state_dict(full_state, strict=True)

    args.output.parent.mkdir(parents=True, exist_ok=True)
    torch.save(full_state, args.output)

    size_bytes = args.output.stat().st_size
    sha256 = _sha256_of_file(args.output)
    print(
        f"[mk_netvlad_checkpoint] wrote {args.output} "
        f"size={size_bytes / (1024 * 1024):.1f} MiB sha256={sha256}"
    )
    print(
        f" num_clusters={args.num_clusters} encoder_dim={args.encoder_dim} "
        f"descriptor_dim={args.descriptor_dim}"
    )
    print(
        f" encoder: torchvision VGG16 IMAGENET1K_V1 ({len(imagenet_encoder)} keys)"
    )
    print(
        f" pool/pca: random-init via torch.manual_seed({_SEED}) "
        f"({len(full_state) - len(imagenet_encoder)} keys)"
    )
    return 0


if __name__ == "__main__":
    sys.exit(main())