Files
Oleksandr Bezdieniezhnykh ba70381346
ci/woodpecker/push/02-build-push Pipeline failed
Update NetVLAD checkpoint paths and enhance .gitignore
- Changed paths in documentation and configuration files to reflect the new naming convention for the NetVLAD model, transitioning from `models/netvlad/netvlad.pt` to `models/net_vlad/net_vlad.pt`.
- Updated the `.gitignore` to include additional file types and directories related to input data and locally-generated evidence frames.
- Removed the old NetVLAD checkpoint file as part of the transition to the new naming scheme.

These changes ensure consistency across the project and improve the management of generated files.
2026-05-31 19:27:32 +03:00

161 lines
5.4 KiB
Python

#!/usr/bin/env python3
"""AZ-965 — generate a NetVLAD-VGG16 PyTorch state_dict checkpoint.
Pipeline-integration checkpoint for the AZ-839 / AZ-840 e2e fixture.
Composition:
* **Encoder**: ``torchvision.models.vgg16(weights="IMAGENET1K_V1")``
features (BSD-licensed public weights). Layers ``[:-2]`` are loaded
into the project's ``_NetVladVgg16.encoder`` slot.
* **NetVLAD pool**: ``pool.conv`` + ``pool.centroids`` are initialised
deterministically from ``torch.manual_seed(0)`` — UNTRAINED for
retrieval; the architecture-default constructor's distribution is
what we ship.
* **PCA**: ``pca.weight`` + ``pca.bias`` likewise random-init via the
architecture-default constructor — UNTRAINED.
Honest scope:
* The encoder produces real ImageNet-pretrained features and is a
legitimate ImageNet-trained VGG16 backbone.
* The NetVLAD pool + PCA tail are NOT trained for retrieval. The
resulting embeddings are essentially random projections of VGG16
features. The c10 compile + c2 strategy will instantiate and run,
but retrieval results will be effectively random.
* This unblocks the AZ-840 orchestrator's empty-backbones SKIP gate
so the next gate (likely ESKF divergence under garbage retrievals)
can surface as a separate, named failure for follow-up work.
Reproduce: ``python scripts/mk_netvlad_checkpoint.py``.
License: torchvision weights are BSD-3-Clause; this script and the
generated random NetVLAD tail are project-owned. Full provenance in
``_docs/03_ip_attribution/netvlad.md``.
"""
from __future__ import annotations
import argparse
import hashlib
import sys
from pathlib import Path
import torch
import torchvision
# Two directory levels above this file; used as the repository root for
# every path this script derives.
_REPO_ROOT = Path(__file__).resolve().parent.parent

# Put the in-repo ``src`` directory at the front of the import path (once),
# so the project package imported below resolves when the script is run
# directly.
_SRC_DIR = str(_REPO_ROOT / "src")
if _SRC_DIR not in sys.path:
    sys.path.insert(0, _SRC_DIR)
from gps_denied_onboard.components.c2_vpr._net_vlad_architecture import ( # noqa: E402
DEFAULT_DESCRIPTOR_DIM,
DEFAULT_ENCODER_DIM,
DEFAULT_NUM_CLUSTERS,
make_net_vlad_vgg16,
)
# Default checkpoint destination: <repo root>/models/net_vlad/net_vlad.pt
_DEFAULT_OUTPUT = _REPO_ROOT.joinpath("models", "net_vlad", "net_vlad.pt")
# Value handed to ``torch.manual_seed`` in ``main`` before the model is built.
_SEED = 0
def _parse_args() -> argparse.Namespace:
    """Parse the command-line options of the checkpoint generator.

    The module docstring is used as the ``--help`` description and is
    rendered verbatim (``RawDescriptionHelpFormatter``) so its sections and
    bullet lists are not re-wrapped into a single paragraph.

    Returns:
        Namespace with ``output`` (``Path``) and the integer options
        ``num_clusters``, ``encoder_dim`` and ``descriptor_dim``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=_DEFAULT_OUTPUT,
        # Let argparse substitute the default itself: help strings are
        # %-formatted, so interpolating the path via an f-string would break
        # ``--help`` whenever the checkout path contains a ``%``.
        help="Output .pt path (default: %(default)s)",
    )
    parser.add_argument(
        "--num-clusters",
        type=int,
        default=DEFAULT_NUM_CLUSTERS,
        help="num_clusters passed to make_net_vlad_vgg16 (default: %(default)s)",
    )
    parser.add_argument(
        "--encoder-dim",
        type=int,
        default=DEFAULT_ENCODER_DIM,
        help=(
            "encoder_dim passed to make_net_vlad_vgg16; only 512 is accepted "
            "by the VGG16 encoder loader (default: %(default)s)"
        ),
    )
    parser.add_argument(
        "--descriptor-dim",
        type=int,
        default=DEFAULT_DESCRIPTOR_DIM,
        help="descriptor_dim passed to make_net_vlad_vgg16 (default: %(default)s)",
    )
    return parser.parse_args()
def _load_imagenet_vgg16_features_state(encoder_dim: int) -> dict[str, torch.Tensor]:
    """Return the state_dict slice for the project's encoder slot.

    The project's ``_NetVladVgg16.encoder`` is
    ``nn.Sequential(*list(vgg.features.children())[:-2])`` — everything in
    ``torchvision.models.vgg16().features`` except the last two layers (the
    trailing ReLU + MaxPool2d). This loads
    ``vgg16(weights="IMAGENET1K_V1")``, applies the same slicing to its
    ``.features`` and prefixes every state_dict key with ``encoder.``.

    Args:
        encoder_dim: Requested encoder channel count. Only ``512`` is
            accepted.

    Returns:
        Mapping of ``encoder.<key>`` to a detached copy of the matching
        pretrained tensor.

    Raises:
        SystemExit: If ``encoder_dim`` is not 512.
    """
    # Reject unsupported dimensions up front, before the pretrained model is
    # instantiated (the expensive step), so a bad flag fails immediately.
    if encoder_dim != 512:
        raise SystemExit(
            f"Only encoder_dim=512 is supported (VGG16 conv5_3 produces "
            f"512 channels); got {encoder_dim}"
        )

    vgg = torchvision.models.vgg16(weights="IMAGENET1K_V1")
    # Same slicing as the project's encoder so the key indices line up.
    encoder_features = torch.nn.Sequential(*list(vgg.features.children())[:-2])
    # detach().clone() gives tensors independent of the torchvision model.
    return {
        f"encoder.{key}": value.detach().clone()
        for key, value in encoder_features.state_dict().items()
    }
def _sha256_and_size(path: Path, chunk_size: int = 1 << 20) -> tuple[str, int]:
    """Return ``(sha256 hex digest, size in bytes)`` of *path*, read in chunks."""
    digest = hashlib.sha256()
    total = 0
    with path.open("rb") as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            digest.update(chunk)
            total += len(chunk)
    return digest.hexdigest(), total


def main() -> int:
    """Build the checkpoint, write it to ``--output`` and print a summary.

    Steps: seed torch, construct the project model, overwrite its
    ``encoder.*`` entries with the torchvision ImageNet VGG16 tensors,
    verify the merged state loads strictly, save it, then report the file's
    size and SHA-256.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: If the torchvision encoder keys and the project's
            ``encoder.*`` keys do not match one-to-one, or if any tensor
            shape differs.
    """
    args = _parse_args()

    # Seed immediately before constructing the model so the randomly
    # initialised (non-encoder) tensors are reproducible.
    torch.manual_seed(_SEED)
    model = make_net_vlad_vgg16(
        num_clusters=args.num_clusters,
        encoder_dim=args.encoder_dim,
        descriptor_dim=args.descriptor_dim,
    )
    full_state = model.state_dict()
    imagenet_encoder = _load_imagenet_vgg16_features_state(args.encoder_dim)

    missing = [k for k in imagenet_encoder if k not in full_state]
    if missing:
        raise SystemExit(
            f"Encoder-key mismatch — torchvision VGG16 produced keys not "
            f"present in project arch: {missing[:5]}..."
        )

    # Reverse direction: a project encoder key with no torchvision
    # counterpart would silently keep its random initialisation while the
    # summary below reports an ImageNet encoder.
    uncovered = [
        k
        for k in full_state
        if k.startswith("encoder.") and k not in imagenet_encoder
    ]
    if uncovered:
        raise SystemExit(
            f"Encoder-key mismatch — project arch has encoder keys not "
            f"provided by torchvision VGG16: {uncovered[:5]}..."
        )

    for key, tensor in imagenet_encoder.items():
        target = full_state[key]
        if tensor.shape != target.shape:
            raise SystemExit(
                f"Encoder shape mismatch at {key}: torchvision="
                f"{tuple(tensor.shape)} project={tuple(target.shape)}"
            )
        full_state[key] = tensor

    # Strict load confirms the merged state matches the architecture exactly
    # before anything is written to disk.
    model.load_state_dict(full_state, strict=True)

    args.output.parent.mkdir(parents=True, exist_ok=True)
    torch.save(full_state, args.output)

    # Hash the written file in chunks rather than loading it whole.
    sha256, size_bytes = _sha256_and_size(args.output)
    print(
        f"[mk_netvlad_checkpoint] wrote {args.output} "
        f"size={size_bytes / (1024 * 1024):.1f} MiB sha256={sha256}"
    )
    print(
        f" num_clusters={args.num_clusters} encoder_dim={args.encoder_dim} "
        f"descriptor_dim={args.descriptor_dim}"
    )
    print(
        f" encoder: torchvision VGG16 IMAGENET1K_V1 ({len(imagenet_encoder)} keys)"
    )
    print(
        f" pool/pca: random-init via torch.manual_seed({_SEED}) "
        f"({len(full_state) - len(imagenet_encoder)} keys)"
    )
    return 0
# Script entry point: main()'s integer result becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())