"""Image Input Pipeline (Component F05)."""

import asyncio
import os
import re
from datetime import datetime, timezone

import cv2
import numpy as np

from gps_denied.schemas.image import (
    ImageBatch,
    ImageData,
    ImageMetadata,
    ProcessedBatch,
    ProcessingStatus,
    ValidationResult,
)

# Naming convention ADxxxxxx.jpg or similar. Compiled once at import time
# instead of on every validate_batch() call.
_FILENAME_PATTERN = re.compile(r"^[A-Za-z0-9_-]+\.(jpg|jpeg|png)$", re.IGNORECASE)

# Captures the run of digits that terminates a filename stem ("AD000012" -> "000012").
_TRAILING_DIGITS = re.compile(r"(\d+)$")


class QueueFullError(Exception):
    """Raised by ``queue_batch`` when the flight's queue is already full."""


class ValidationError(Exception):
    """Raised by ``queue_batch`` when a batch fails ``validate_batch``."""


class ImageInputPipeline:
    """Manages ingestion, disk storage, and queuing of UAV image batches."""

    # Largest number of images accepted in a single batch.
    MAX_BATCH_SIZE = 100

    def __init__(self, storage_dir: str = "image_storage", max_queue_size: int = 50):
        """Create a pipeline.

        Args:
            storage_dir: Root directory; each flight gets a sub-directory
                named after its flight id.
            max_queue_size: Capacity of each per-flight batch queue.
        """
        self.storage_dir = storage_dir
        # flight_id -> asyncio.Queue of ImageBatch
        self._queues: dict[str, asyncio.Queue] = {}
        self.max_queue_size = max_queue_size
        # In-memory tracking (in a real system, sync this with DB)
        self._status: dict[str, dict] = {}
        # Exact sequence → filename mapping (VO-05: no substring collision)
        self._sequence_map: dict[str, dict[int, str]] = {}

    def _get_queue(self, flight_id: str) -> asyncio.Queue:
        """Return the queue for *flight_id*, creating it on first use."""
        if flight_id not in self._queues:
            self._queues[flight_id] = asyncio.Queue(maxsize=self.max_queue_size)
        return self._queues[flight_id]

    def _init_status(self, flight_id: str) -> None:
        """Create the status counters for *flight_id* if they do not exist yet."""
        if flight_id not in self._status:
            self._status[flight_id] = {
                "total_images": 0,
                "processed_images": 0,
                "current_sequence": 1,
            }

    def validate_batch(self, batch: ImageBatch) -> ValidationResult:
        """Validate batch size, filename/image count, naming and sequence order.

        Args:
            batch: The batch to check.

        Returns:
            A ``ValidationResult`` whose ``valid`` flag is true only when no
            error was recorded; ``errors`` lists every problem found.
        """
        errors = []

        num_images = len(batch.images)
        if num_images < 1:
            errors.append("Batch is empty")
        elif num_images > self.MAX_BATCH_SIZE:
            errors.append("Batch too large")

        if len(batch.filenames) != num_images:
            errors.append("Mismatch between filenames and images count")

        # fullmatch (rather than match + "$") also rejects a name that ends
        # with a newline, which "$" alone would let through.
        for fn in batch.filenames:
            if not _FILENAME_PATTERN.fullmatch(fn):
                errors.append(f"Invalid filename: {fn}")
                break  # only the first offending name is reported

        if batch.start_sequence > batch.end_sequence:
            errors.append("Start sequence greater than end sequence")

        return ValidationResult(valid=len(errors) == 0, errors=errors)

    def queue_batch(self, flight_id: str, batch: ImageBatch) -> bool:
        """Validate *batch* and append it to the queue of *flight_id*.

        Returns:
            ``True`` once the batch has been queued.

        Raises:
            ValidationError: If ``validate_batch`` reports any error.
            QueueFullError: If the flight's queue is already full.
        """
        val = self.validate_batch(batch)
        if not val.valid:
            raise ValidationError(f"Batch validation failed: {val.errors}")

        q = self._get_queue(flight_id)
        if q.full():
            raise QueueFullError(f"Queue for flight {flight_id} is full")

        q.put_nowait(batch)

        self._init_status(flight_id)
        self._status[flight_id]["total_images"] += len(batch.images)
        return True

    async def process_next_batch(self, flight_id: str) -> ProcessedBatch | None:
        """Dequeue and process the next batch of *flight_id*.

        Each image is decoded with OpenCV; images that cannot be decoded are
        skipped. Decoded images are written to disk and recorded in the exact
        sequence → filename map.

        Returns:
            The processed batch, or ``None`` when the queue is empty.
        """
        q = self._get_queue(flight_id)
        try:
            batch: ImageBatch = q.get_nowait()
        except asyncio.QueueEmpty:
            return None

        try:
            # Guarantee the counters exist even if the queue was filled
            # by something other than queue_batch().
            self._init_status(flight_id)

            processed_images = []
            for i, raw_bytes in enumerate(batch.images):
                # Decode
                nparr = np.frombuffer(raw_bytes, np.uint8)
                img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
                if img is None:
                    continue  # skip corrupted

                # The sequence number is positional, so a skipped image
                # leaves a gap rather than shifting later images.
                seq = batch.start_sequence + i
                fn = batch.filenames[i]

                h, w = img.shape[:2]
                meta = ImageMetadata(
                    sequence=seq,
                    filename=fn,
                    dimensions=(w, h),
                    file_size=len(raw_bytes),
                    timestamp=datetime.now(timezone.utc),
                )
                processed_images.append(
                    ImageData(
                        flight_id=flight_id,
                        sequence=seq,
                        filename=fn,
                        image=img,
                        metadata=meta,
                    )
                )
                # VO-05: record exact sequence→filename mapping
                self._sequence_map.setdefault(flight_id, {})[seq] = fn

            # Store to disk
            self.store_images(flight_id, processed_images)
            self._status[flight_id]["processed_images"] += len(processed_images)
        finally:
            # Always balance the get(), otherwise a failure above would
            # leave Queue.join() waiting forever.
            q.task_done()

        return ProcessedBatch(
            images=processed_images,
            batch_id=f"batch_{batch.batch_number}",
            start_sequence=batch.start_sequence,
            end_sequence=batch.end_sequence,
        )

    def store_images(self, flight_id: str, images: list[ImageData]) -> bool:
        """Persist *images* under ``<storage_dir>/<flight_id>/``.

        Every image is attempted even if an earlier write fails.

        Returns:
            ``True`` if every ``cv2.imwrite`` call reported success (also for
            an empty list), ``False`` if at least one write failed.
        """
        flight_dir = os.path.join(self.storage_dir, flight_id)
        os.makedirs(flight_dir, exist_ok=True)

        all_written = True
        for img in images:
            path = os.path.join(flight_dir, img.filename)
            # imwrite signals failure through its return value, not an exception.
            if not cv2.imwrite(path, img.image):
                all_written = False
        return all_written

    def get_next_image(self, flight_id: str) -> ImageData | None:
        """Return the image at the flight's current sequence and advance it.

        The current sequence is only incremented when an image was found.
        """
        self._init_status(flight_id)
        seq = self._status[flight_id]["current_sequence"]

        img = self.get_image_by_sequence(flight_id, seq)
        if img is not None:
            self._status[flight_id]["current_sequence"] += 1

        return img

    def _load_image(
        self, flight_id: str, flight_dir: str, sequence: int, filename: str
    ) -> ImageData | None:
        """Read *filename* from *flight_dir*; return ``None`` if it cannot be read."""
        path = os.path.join(flight_dir, filename)
        img = cv2.imread(path)
        if img is None:
            return None

        h, w = img.shape[:2]
        meta = ImageMetadata(
            sequence=sequence,
            filename=filename,
            dimensions=(w, h),
            file_size=os.path.getsize(path),
            timestamp=datetime.now(timezone.utc),
        )
        # Keyword arguments, matching how process_next_batch builds ImageData.
        return ImageData(
            flight_id=flight_id,
            sequence=sequence,
            filename=filename,
            image=img,
            metadata=meta,
        )

    @staticmethod
    def _stem_matches_sequence(stem: str, sequence: int) -> bool:
        """Tell whether a filename stem designates exactly *sequence*.

        A stem matches when it is the bare number, or when it ends with the
        six-digit zero-padded number and its whole trailing digit run equals
        *sequence*. The second condition stops ``AD1000001`` from matching
        sequence 1.
        """
        if stem == str(sequence):
            return True
        if not stem.endswith(f"{sequence:06d}"):
            return False
        digits = _TRAILING_DIGITS.search(stem)
        return digits is not None and int(digits.group(1)) == sequence

    def get_image_by_sequence(self, flight_id: str, sequence: int) -> ImageData | None:
        """Retrieves a specific image by sequence number (exact match — VO-05).

        Returns:
            The loaded image with freshly built metadata, or ``None`` when the
            flight directory is missing or no readable file matches.
        """
        flight_dir = os.path.join(self.storage_dir, flight_id)
        if not os.path.isdir(flight_dir):
            return None

        # Prefer the exact mapping built during process_next_batch
        mapped = self._sequence_map.get(flight_id, {}).get(sequence)
        if mapped:
            found = self._load_image(flight_id, flight_dir, sequence, mapped)
            if found is not None:
                return found

        # Fallback: scan directory for exact filename patterns
        # (handles images stored before this process started).
        # Sorted so the result does not depend on directory listing order.
        for fn in sorted(os.listdir(flight_dir)):
            stem, _ = os.path.splitext(fn)
            if not self._stem_matches_sequence(stem, sequence):
                continue
            found = self._load_image(flight_id, flight_dir, sequence, fn)
            if found is not None:
                return found

        return None

    def get_processing_status(self, flight_id: str) -> ProcessingStatus:
        """Return a snapshot of the counters and queue depth for *flight_id*."""
        self._init_status(flight_id)
        s = self._status[flight_id]
        q = self._get_queue(flight_id)

        return ProcessingStatus(
            flight_id=flight_id,
            total_images=s["total_images"],
            processed_images=s["processed_images"],
            current_sequence=s["current_sequence"],
            queued_batches=q.qsize(),
            processing_rate=0.0,  # mock
        )