gps-denied-onboard/src/gps_denied/core/pipeline.py

"""Image Input Pipeline (Component F05)."""

import asyncio
import os
import re
from datetime import datetime, timezone

import cv2
import numpy as np

from gps_denied.schemas.image import (
    ImageBatch, ImageData, ImageMetadata, ProcessedBatch, ProcessingStatus, ValidationResult
)


class QueueFullError(Exception):
    pass

class ValidationError(Exception):
    pass


class ImageInputPipeline:
    """Manages ingestion, disk storage, and queuing of UAV image batches."""

    def __init__(self, storage_dir: str = "image_storage", max_queue_size: int = 50):
        self.storage_dir = storage_dir
        # flight_id -> asyncio.Queue of ImageBatch
        self._queues: dict[str, asyncio.Queue] = {}
        self.max_queue_size = max_queue_size

        # In-memory tracking (in a real system, sync this with DB)
        self._status: dict[str, dict] = {}
        # Exact sequence → filename mapping (VO-05: no substring collision)
        self._sequence_map: dict[str, dict[int, str]] = {}

    def _get_queue(self, flight_id: str) -> asyncio.Queue:
        if flight_id not in self._queues:
            self._queues[flight_id] = asyncio.Queue(maxsize=self.max_queue_size)
        return self._queues[flight_id]

    def _init_status(self, flight_id: str):
        if flight_id not in self._status:
            self._status[flight_id] = {
                "total_images": 0,
                "processed_images": 0,
                "current_sequence": 1,
            }

    def validate_batch(self, batch: ImageBatch) -> ValidationResult:
        """Validates batch integrity and sequence continuity."""
        errors = []

        num_images = len(batch.images)
        if num_images < 1:
            errors.append("Batch is empty")
        elif num_images > 100:
            errors.append("Batch too large")

        if len(batch.filenames) != num_images:
            errors.append("Mismatch between filenames and images count")

        # Naming convention ADxxxxxx.jpg or similar
        pattern = re.compile(r"^[A-Za-z0-9_-]+\.(jpg|jpeg|png)$", re.IGNORECASE)
        for fn in batch.filenames:
            if not pattern.match(fn):
                errors.append(f"Invalid filename: {fn}")
                break

        if batch.start_sequence > batch.end_sequence:
            errors.append("Start sequence greater than end sequence")

        return ValidationResult(valid=len(errors) == 0, errors=errors)

    def queue_batch(self, flight_id: str, batch: ImageBatch) -> bool:
        """Queues a batch of images for processing."""
        val = self.validate_batch(batch)
        if not val.valid:
            raise ValidationError(f"Batch validation failed: {val.errors}")

        q = self._get_queue(flight_id)
        if q.full():
            raise QueueFullError(f"Queue for flight {flight_id} is full")

        q.put_nowait(batch)

        self._init_status(flight_id)
        self._status[flight_id]["total_images"] += len(batch.images)

        return True

    async def process_next_batch(self, flight_id: str) -> ProcessedBatch | None:
        """Dequeues and processing the next batch."""
        q = self._get_queue(flight_id)
        if q.empty():
            return None

        batch: ImageBatch = await q.get()

        processed_images = []
        for i, raw_bytes in enumerate(batch.images):
            # Decode
            nparr = np.frombuffer(raw_bytes, np.uint8)
            img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)

            if img is None:
                continue  # skip corrupted

            seq = batch.start_sequence + i
            fn = batch.filenames[i]

            h, w = img.shape[:2]
            meta = ImageMetadata(
                sequence=seq,
                filename=fn,
                dimensions=(w, h),
                file_size=len(raw_bytes),
                timestamp=datetime.now(timezone.utc),
            )

            img_data = ImageData(
                flight_id=flight_id,
                sequence=seq,
                filename=fn,
                image=img,
                metadata=meta
            )
            processed_images.append(img_data)
            # VO-05: record exact sequence→filename mapping
            self._sequence_map.setdefault(flight_id, {})[seq] = fn

        # Store to disk
        self.store_images(flight_id, processed_images)

        self._status[flight_id]["processed_images"] += len(processed_images)
        q.task_done()

        return ProcessedBatch(
            images=processed_images,
            batch_id=f"batch_{batch.batch_number}",
            start_sequence=batch.start_sequence,
            end_sequence=batch.end_sequence
        )

    def store_images(self, flight_id: str, images: list[ImageData]) -> bool:
        """Persists images to disk."""
        flight_dir = os.path.join(self.storage_dir, flight_id)
        os.makedirs(flight_dir, exist_ok=True)

        for img in images:
            path = os.path.join(flight_dir, img.filename)
            cv2.imwrite(path, img.image)

        return True

    def get_next_image(self, flight_id: str) -> ImageData | None:
        """Gets the next image in sequence for processing."""
        self._init_status(flight_id)
        seq = self._status[flight_id]["current_sequence"]

        img = self.get_image_by_sequence(flight_id, seq)
        if img:
            self._status[flight_id]["current_sequence"] += 1

        return img

    def get_image_by_sequence(self, flight_id: str, sequence: int) -> ImageData | None:
        """Retrieves a specific image by sequence number (exact match — VO-05)."""
        flight_dir = os.path.join(self.storage_dir, flight_id)
        if not os.path.exists(flight_dir):
            return None

        # Prefer the exact mapping built during process_next_batch
        fn = self._sequence_map.get(flight_id, {}).get(sequence)
        if fn:
            path = os.path.join(flight_dir, fn)
            img = cv2.imread(path)
            if img is not None:
                h, w = img.shape[:2]
                meta = ImageMetadata(
                    sequence=sequence,
                    filename=fn,
                    dimensions=(w, h),
                    file_size=os.path.getsize(path),
                    timestamp=datetime.now(timezone.utc),
                )
                return ImageData(flight_id, sequence, fn, img, meta)

        # Fallback: scan directory for exact filename patterns
        # (handles images stored before this process started)
        for fn in os.listdir(flight_dir):
            base, _ = os.path.splitext(fn)
            # Accept only if the base name ends with exactly the padded sequence number
            if base.endswith(f"{sequence:06d}") or base == str(sequence):
                path = os.path.join(flight_dir, fn)
                img = cv2.imread(path)
                if img is not None:
                    h, w = img.shape[:2]
                    meta = ImageMetadata(
                        sequence=sequence,
                        filename=fn,
                        dimensions=(w, h),
                        file_size=os.path.getsize(path),
                        timestamp=datetime.now(timezone.utc),
                    )
                    return ImageData(flight_id, sequence, fn, img, meta)

        return None

    def get_processing_status(self, flight_id: str) -> ProcessingStatus:
        self._init_status(flight_id)
        s = self._status[flight_id]
        q = self._get_queue(flight_id)

        return ProcessingStatus(
            flight_id=flight_id,
            total_images=s["total_images"],
            processed_images=s["processed_images"],
            current_sequence=s["current_sequence"],
            queued_batches=q.qsize(),
            processing_rate=0.0  # mock
        )