# gps-denied-onboard/scripts/mk_netvlad_checkpoint.py

#!/usr/bin/env python3
"""AZ-965 — generate a NetVLAD-VGG16 PyTorch state_dict checkpoint.

Pipeline-integration checkpoint for the AZ-839 / AZ-840 e2e fixture.
Composition:

* **Encoder**: ``torchvision.models.vgg16(weights="IMAGENET1K_V1")``
  features (BSD-licensed public weights). Layers ``[:-2]`` are loaded
  into the project's ``_NetVladVgg16.encoder`` slot.
* **NetVLAD pool**: ``pool.conv`` + ``pool.centroids`` are initialised
  deterministically from ``torch.manual_seed(0)`` — UNTRAINED for
  retrieval; the architecture-default constructor's distribution is
  what we ship.
* **PCA**: ``pca.weight`` + ``pca.bias`` likewise random-init via the
  architecture-default constructor — UNTRAINED.

Honest scope:

* The encoder produces real ImageNet-pretrained features and is a
  legitimate ImageNet-trained VGG16 backbone.
* The NetVLAD pool + PCA tail are NOT trained for retrieval. The
  resulting embeddings are essentially random projections of VGG16
  features. The c10 compile + c2 strategy will instantiate and run,
  but retrieval results will be effectively random.
* This unblocks the AZ-840 orchestrator's empty-backbones SKIP gate
  so the next gate (likely ESKF divergence under garbage retrievals)
  can surface as a separate, named failure for follow-up work.

Reproduce: ``python scripts/mk_netvlad_checkpoint.py``.

License: torchvision weights are BSD-3-Clause; this script and the
generated random NetVLAD tail are project-owned. Full provenance in
``_docs/03_ip_attribution/netvlad.md``.
"""

from __future__ import annotations

import argparse
import hashlib
import sys
from pathlib import Path

import torch
import torchvision

# Two levels above this file; the script is invoked as
# ``scripts/mk_netvlad_checkpoint.py``, so this is the repository root.
_REPO_ROOT = Path(__file__).resolve().parent.parent
# Prepend ``<repo>/src`` to ``sys.path`` (once) so the project package
# imported below resolves when the script is run directly.
if str(_REPO_ROOT / "src") not in sys.path:
    sys.path.insert(0, str(_REPO_ROOT / "src"))

from gps_denied_onboard.components.c2_vpr._net_vlad_architecture import (  # noqa: E402
    DEFAULT_DESCRIPTOR_DIM,
    DEFAULT_ENCODER_DIM,
    DEFAULT_NUM_CLUSTERS,
    make_net_vlad_vgg16,
)


# Checkpoint path used when ``--output`` is not given on the command line.
_DEFAULT_OUTPUT = _REPO_ROOT / "models" / "netvlad" / "netvlad.pt"
# Passed to ``torch.manual_seed`` in ``main`` before the model is built.
_SEED = 0


def _parse_args() -> argparse.Namespace:
    """Parse the command-line options of the checkpoint generator.

    The module docstring doubles as the ``--help`` description. It is
    rendered with ``RawDescriptionHelpFormatter`` because argparse's
    default formatter re-wraps the description, which would collapse
    the docstring's bullet lists and paragraphs into one text block.

    Defaults are shown through argparse's own ``%(default)s``
    placeholder rather than being interpolated into the help string,
    so a ``%`` character in the default output path cannot break help
    formatting.

    Returns:
        Namespace with ``output`` (``Path``) and the integer options
        ``num_clusters``, ``encoder_dim`` and ``descriptor_dim``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=_DEFAULT_OUTPUT,
        help="Output .pt path (default: %(default)s)",
    )
    parser.add_argument(
        "--num-clusters",
        type=int,
        default=DEFAULT_NUM_CLUSTERS,
        help="num_clusters forwarded to make_net_vlad_vgg16 (default: %(default)s)",
    )
    parser.add_argument(
        "--encoder-dim",
        type=int,
        default=DEFAULT_ENCODER_DIM,
        help=(
            "encoder_dim forwarded to make_net_vlad_vgg16; only 512 is "
            "accepted by this script (default: %(default)s)"
        ),
    )
    parser.add_argument(
        "--descriptor-dim",
        type=int,
        default=DEFAULT_DESCRIPTOR_DIM,
        help="descriptor_dim forwarded to make_net_vlad_vgg16 (default: %(default)s)",
    )
    return parser.parse_args()


def _load_imagenet_vgg16_features_state(encoder_dim: int) -> dict[str, torch.Tensor]:
    """Return the ImageNet VGG16 weights keyed for the project's encoder slot.

    The project's ``_NetVladVgg16.encoder`` is
    ``nn.Sequential(*list(vgg.features.children())[:-2])`` —
    everything in ``torchvision.models.vgg16().features`` except the
    last two layers (the trailing ReLU + MaxPool2d). The same slicing
    is applied here to ``vgg16(weights="IMAGENET1K_V1").features`` and
    every resulting state_dict key is prefixed with ``encoder.``.

    Args:
        encoder_dim: Requested encoder channel count. Must be 512.

    Returns:
        Mapping from ``encoder.<key>`` to a detached copy of the
        corresponding torchvision tensor.

    Raises:
        SystemExit: If ``encoder_dim`` is not 512.
    """
    # Reject unsupported configurations before building the pretrained
    # model, so a bad argument fails immediately instead of after the
    # (potentially slow) weight load.
    if encoder_dim != 512:
        raise SystemExit(
            f"Only encoder_dim=512 is supported (VGG16 conv5_3 produces "
            f"512 channels); got {encoder_dim}"
        )
    vgg = torchvision.models.vgg16(weights="IMAGENET1K_V1")
    encoder_features = torch.nn.Sequential(*list(vgg.features.children())[:-2])
    # Clone so the returned tensors do not share storage with ``vgg``.
    return {
        f"encoder.{key}": value.detach().clone()
        for key, value in encoder_features.state_dict().items()
    }


def _sha256_and_size(path: Path, chunk_size: int = 1024 * 1024) -> tuple[str, int]:
    """Return ``(sha256 hex digest, size in bytes)`` of *path*, read in chunks."""
    digest = hashlib.sha256()
    total = 0
    with path.open("rb") as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            digest.update(chunk)
            total += len(chunk)
    return digest.hexdigest(), total


def main() -> int:
    """Build the checkpoint, write it to ``--output`` and print a summary.

    Steps:

    1. Seed torch with ``_SEED`` and construct the project model via
       ``make_net_vlad_vgg16`` using the command-line dimensions.
    2. Overwrite the ``encoder.*`` entries of its state_dict with the
       torchvision ImageNet VGG16 tensors, after checking that the key
       sets and tensor shapes agree.
    3. Confirm the merged state loads with ``strict=True``, save it,
       and print the path, size, SHA-256 and key counts.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: On an encoder key or shape mismatch between
            torchvision and the project architecture.
    """
    args = _parse_args()
    torch.manual_seed(_SEED)
    model = make_net_vlad_vgg16(
        num_clusters=args.num_clusters,
        encoder_dim=args.encoder_dim,
        descriptor_dim=args.descriptor_dim,
    )
    full_state = model.state_dict()
    imagenet_encoder = _load_imagenet_vgg16_features_state(args.encoder_dim)

    missing = [k for k in imagenet_encoder if k not in full_state]
    if missing:
        raise SystemExit(
            f"Encoder-key mismatch — torchvision VGG16 produced keys not "
            f"present in project arch: {missing[:5]}..."
        )
    # Reverse direction: any ``encoder.*`` key that torchvision does not
    # supply would keep its constructor-time value and be counted as
    # pool/pca in the summary below, so refuse to write such a checkpoint.
    uncovered = [
        k for k in full_state
        if k.startswith("encoder.") and k not in imagenet_encoder
    ]
    if uncovered:
        raise SystemExit(
            f"Encoder-key mismatch — project arch has encoder keys not "
            f"provided by torchvision VGG16: {uncovered[:5]}..."
        )

    for key, tensor in imagenet_encoder.items():
        target = full_state[key]
        if tensor.shape != target.shape:
            raise SystemExit(
                f"Encoder shape mismatch at {key}: torchvision="
                f"{tuple(tensor.shape)} project={tuple(target.shape)}"
            )
        full_state[key] = tensor

    # Strict load is a sanity check that the merged dict is a complete,
    # exact match for the architecture before it is written to disk.
    model.load_state_dict(full_state, strict=True)
    args.output.parent.mkdir(parents=True, exist_ok=True)
    torch.save(full_state, args.output)

    # Hash the file in chunks instead of reading the whole checkpoint
    # back into memory.
    sha256, size_bytes = _sha256_and_size(args.output)
    print(
        f"[mk_netvlad_checkpoint] wrote {args.output}  "
        f"size={size_bytes / (1024 * 1024):.1f} MiB  sha256={sha256}"
    )
    print(
        f"  num_clusters={args.num_clusters} encoder_dim={args.encoder_dim} "
        f"descriptor_dim={args.descriptor_dim}"
    )
    print(
        f"  encoder: torchvision VGG16 IMAGENET1K_V1 ({len(imagenet_encoder)} keys)"
    )
    print(
        f"  pool/pca: random-init via torch.manual_seed({_SEED}) "
        f"({len(full_state) - len(imagenet_encoder)} keys)"
    )
    return 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())