[AZ-173] [AZ-174] Stream-based detection API and DB-driven AI config

Made-with: Cursor
Oleksandr Bezdieniezhnykh
2026-03-31 06:30:22 +03:00
parent 6547c5903a
commit 6c24d09eab
15 changed files with 562 additions and 105 deletions
-1
@@ -10,7 +10,6 @@ cdef class AIRecognitionConfig:
cdef public int big_image_tile_overlap_percent
cdef public list[str] paths
cdef public int model_batch_size
cdef public double altitude
-4
@@ -7,7 +7,6 @@ cdef class AIRecognitionConfig:
tracking_distance_confidence,
tracking_probability_increase,
tracking_intersection_threshold,
paths,
model_batch_size,
big_image_tile_overlap_percent,
altitude,
@@ -22,7 +21,6 @@ cdef class AIRecognitionConfig:
self.tracking_probability_increase = tracking_probability_increase
self.tracking_intersection_threshold = tracking_intersection_threshold
self.paths = paths
self.model_batch_size = model_batch_size
self.big_image_tile_overlap_percent = big_image_tile_overlap_percent
@@ -37,7 +35,6 @@ cdef class AIRecognitionConfig:
f'intersection_threshold : {self.tracking_intersection_threshold}, '
f'frame_period_recognition : {self.frame_period_recognition}, '
f'big_image_tile_overlap_percent: {self.big_image_tile_overlap_percent}, '
f'paths: {self.paths}, '
f'model_batch_size: {self.model_batch_size}, '
f'altitude: {self.altitude}, '
f'focal_length: {self.focal_length}, '
@@ -55,7 +52,6 @@ cdef class AIRecognitionConfig:
data.get("tracking_probability_increase", 0.0),
data.get("tracking_intersection_threshold", 0.6),
data.get("paths", []),
data.get("model_batch_size", 8),
data.get("big_image_tile_overlap_percent", 20),
+176 -37
@@ -1,7 +1,11 @@
import io
import mimetypes
import threading
from pathlib import Path
import av
import cv2
import numpy as np
cimport constants_inf
from ai_availability_status cimport AIAvailabilityEnum, AIAvailabilityStatus
@@ -13,6 +17,18 @@ from threading import Thread
from engines import EngineClass
def ai_config_from_dict(dict data):
return AIRecognitionConfig.from_dict(data)
def _write_video_bytes_to_path(str path, bytes data, object done_event):
try:
with open(path, 'wb') as f:
f.write(data)
finally:
done_event.set()
cdef class Inference:
cdef LoaderHttpClient loader_client
cdef InferenceEngine engine
@@ -135,6 +151,7 @@ cdef class Inference:
cpdef run_detect(self, dict config_dict, object annotation_callback, object status_callback=None):
cdef list[str] videos = []
cdef list[str] images = []
cdef object media_paths = config_dict.get("paths", [])
cdef AIRecognitionConfig ai_config = AIRecognitionConfig.from_dict(config_dict)
if ai_config is None:
raise Exception('ai recognition config is empty')
@@ -148,7 +165,7 @@ cdef class Inference:
return
self.detection_counts = {}
for p in ai_config.paths:
for p in media_paths:
media_name = Path(<str>p).stem.replace(" ", "")
self.detection_counts[media_name] = 0
if self.is_video(p):
@@ -163,22 +180,147 @@ cdef class Inference:
constants_inf.log(<str>f'run inference on {v}...')
self._process_video(ai_config, v)
cpdef run_detect_image(self, bytes image_bytes, AIRecognitionConfig ai_config, str media_name,
object annotation_callback, object status_callback=None):
cdef list all_frame_data = []
cdef str original_media_name
self._annotation_callback = annotation_callback
self._status_callback = status_callback
self.stop_signal = <bint>False
self.init_ai()
if self.engine is None:
constants_inf.log(<str> "AI engine not available. Conversion may be in progress. Skipping inference.")
return
if not image_bytes:
return
frame = cv2.imdecode(np.frombuffer(image_bytes, dtype=np.uint8), cv2.IMREAD_COLOR)
if frame is None:
constants_inf.logerror(<str>'Failed to decode image bytes')
return
original_media_name = media_name.replace(" ", "")
self.detection_counts = {}
self.detection_counts[original_media_name] = 0
self._tile_detections = {}
self._append_image_frame_entries(ai_config, all_frame_data, frame, original_media_name)
self._finalize_image_inference(ai_config, all_frame_data)
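A hypothetical caller of the new in-memory image entry point; the file name, config, and callback below are illustration only, not part of this diff:
# Sketch: feed raw image bytes straight to inference, no temp file needed.
with open("field.jpg", "rb") as f:
    inf.run_detect_image(f.read(), ai_cfg, "field", on_annotation)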
cpdef run_detect_video(self, bytes video_bytes, AIRecognitionConfig ai_config, str media_name, str save_path,
object annotation_callback, object status_callback=None):
cdef str original_media_name
self._annotation_callback = annotation_callback
self._status_callback = status_callback
self.stop_signal = <bint>False
self.init_ai()
if self.engine is None:
constants_inf.log(<str> "AI engine not available. Conversion may be in progress. Skipping inference.")
return
if not video_bytes:
return
original_media_name = media_name.replace(" ", "")
self.detection_counts = {}
self.detection_counts[original_media_name] = 0
writer_done = threading.Event()
wt = threading.Thread(
target=_write_video_bytes_to_path,
args=(save_path, video_bytes, writer_done),
daemon=True,
)
wt.start()
try:
bio = io.BytesIO(video_bytes)
container = av.open(bio)
try:
self._process_video_pyav(ai_config, original_media_name, container)
finally:
container.close()
finally:
writer_done.wait()
wt.join(timeout=3600)
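Likewise for video: a hypothetical invocation showing the contract of run_detect_video, where the background writer persists the same bytes to save_path while PyAV decodes them from memory (all values below are assumptions):
inf.run_detect_video(
    video_bytes,                      # raw upload, kept entirely in memory
    ai_cfg,                           # AIRecognitionConfig.from_dict(...)
    "flight_42",                      # media name used for detection counts
    "/data/media/flight_42.mp4",      # background thread writes bytes here
    on_annotation,                    # callable(annotation, percent)
)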
cdef _process_video_pyav(self, AIRecognitionConfig ai_config, str original_media_name, object container):
cdef int frame_count = 0
cdef int batch_count = 0
cdef list batch_frames = []
cdef list[long] batch_timestamps = []
cdef int model_h, model_w
cdef int total_frames
cdef int tf
cdef double duration_sec
cdef double fps
self._previous_annotation = <Annotation>None
model_h, model_w = self.engine.get_input_shape()
streams = container.streams.video
if not streams:
constants_inf.logerror(<str>'No video stream in container')
self.send_detection_status()
return
vstream = streams[0]
total_frames = 0
if vstream.frames is not None and int(vstream.frames) > 0:
total_frames = int(vstream.frames)
else:
duration_sec = 0.0
if vstream.duration is not None and vstream.time_base is not None:
duration_sec = float(vstream.duration * vstream.time_base)
fps = 25.0
if vstream.average_rate is not None:
fps = float(vstream.average_rate)
if duration_sec > 0:
total_frames = int(duration_sec * fps)
if total_frames < 1:
total_frames = 1
tf = total_frames
constants_inf.log(<str>f'Video (PyAV): ~{tf} frames est, {vstream.width}x{vstream.height}')
cdef int effective_batch = min(self.engine.max_batch_size, ai_config.model_batch_size)
if effective_batch < 1:
effective_batch = 1
for av_frame in container.decode(vstream):
if self.stop_signal:
break
frame_count += 1
arr = av_frame.to_ndarray(format='bgr24')
if frame_count % ai_config.frame_period_recognition == 0:
ts_ms = 0
if av_frame.time is not None:
ts_ms = int(av_frame.time * 1000)
elif av_frame.pts is not None and vstream.time_base is not None:
ts_ms = int(float(av_frame.pts) * float(vstream.time_base) * 1000)
batch_frames.append(arr)
batch_timestamps.append(<long>ts_ms)
if len(batch_frames) >= effective_batch:
batch_count += 1
tf = total_frames if total_frames > 0 else max(frame_count, 1)
constants_inf.log(<str>f'Video batch {batch_count}: frame {frame_count}/{tf} ({frame_count*100//tf}%)')
self._process_video_batch(ai_config, batch_frames, batch_timestamps, original_media_name, frame_count, tf, model_w)
batch_frames = []
batch_timestamps = []
if batch_frames:
batch_count += 1
tf = total_frames if total_frames > 0 else max(frame_count, 1)
constants_inf.log(<str>f'Video batch {batch_count} (flush): {len(batch_frames)} remaining frames')
self._process_video_batch(ai_config, batch_frames, batch_timestamps, original_media_name, frame_count, tf, model_w)
constants_inf.log(<str>f'Video done: {frame_count} frames read, {batch_count} batches processed')
self.send_detection_status()
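The timestamp fallback above converts pts through the stream time base; a standalone worked example with assumed values:
from fractions import Fraction
pts = 50000
time_base = Fraction(1, 25000)                       # vstream.time_base
ts_ms = int(float(pts) * float(time_base) * 1000)    # -> 2000 ms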
cdef _process_video(self, AIRecognitionConfig ai_config, str video_name):
cdef int frame_count = 0
cdef int batch_count = 0
cdef list batch_frames = []
cdef list[long] batch_timestamps = []
cdef Annotation annotation
cdef int model_h, model_w
cdef str original_media_name
self._previous_annotation = <Annotation>None
model_h, model_w = self.engine.get_input_shape()
original_media_name = Path(<str>video_name).stem.replace(" ", "")
v_input = cv2.VideoCapture(<str>video_name)
if not v_input.isOpened():
constants_inf.logerror(<str>f'Failed to open video: {video_name}')
return
total_frames = int(v_input.get(cv2.CAP_PROP_FRAME_COUNT))
if total_frames < 1:
total_frames = 1
fps = v_input.get(cv2.CAP_PROP_FPS)
width = int(v_input.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(v_input.get(cv2.CAP_PROP_FRAME_HEIGHT))
@@ -201,21 +343,21 @@ cdef class Inference:
if len(batch_frames) >= effective_batch:
batch_count += 1
constants_inf.log(<str>f'Video batch {batch_count}: frame {frame_count}/{total_frames} ({frame_count*100//total_frames}%)')
self._process_video_batch(ai_config, batch_frames, batch_timestamps, video_name, frame_count, total_frames, model_w)
self._process_video_batch(ai_config, batch_frames, batch_timestamps, original_media_name, frame_count, total_frames, model_w)
batch_frames = []
batch_timestamps = []
if batch_frames:
batch_count += 1
constants_inf.log(<str>f'Video batch {batch_count} (flush): {len(batch_frames)} remaining frames')
self._process_video_batch(ai_config, batch_frames, batch_timestamps, video_name, frame_count, total_frames, model_w)
self._process_video_batch(ai_config, batch_frames, batch_timestamps, original_media_name, frame_count, total_frames, model_w)
v_input.release()
constants_inf.log(<str>f'Video done: {frame_count} frames read, {batch_count} batches processed')
self.send_detection_status()
cdef _process_video_batch(self, AIRecognitionConfig ai_config, list batch_frames,
list batch_timestamps, str video_name,
list batch_timestamps, str original_media_name,
int frame_count, int total_frames, int model_w):
cdef Annotation annotation
list_detections = self.engine.process_frames(batch_frames, ai_config)
@@ -225,7 +367,6 @@ cdef class Inference:
for i in range(len(list_detections)):
detections = list_detections[i]
original_media_name = Path(<str>video_name).stem.replace(" ", "")
name = f'{original_media_name}_{constants_inf.format_time(batch_timestamps[i])}'
annotation = Annotation(name, original_media_name, batch_timestamps[i], detections)
@@ -247,56 +388,54 @@ cdef class Inference:
cb = self._annotation_callback
cb(annotation, percent)
cdef _process_images(self, AIRecognitionConfig ai_config, list[str] image_paths):
cdef list all_frame_data = []
cdef _append_image_frame_entries(self, AIRecognitionConfig ai_config, list all_frame_data, frame, str original_media_name):
cdef double ground_sampling_distance
cdef int model_h, model_w
cdef int img_h, img_w
model_h, model_w = self.engine.get_input_shape()
self._tile_detections = {}
for path in image_paths:
frame = cv2.imread(<str>path)
if frame is None:
constants_inf.logerror(<str>f'Failed to read image {path}')
continue
img_h, img_w, _ = frame.shape
original_media_name = Path(<str> path).stem.replace(" ", "")
ground_sampling_distance = ai_config.sensor_width * ai_config.altitude / (ai_config.focal_length * img_w)
constants_inf.log(<str>f'ground sampling distance: {ground_sampling_distance}')
if img_h <= 1.5 * model_h and img_w <= 1.5 * model_w:
all_frame_data.append((frame, original_media_name, f'{original_media_name}_000000', ground_sampling_distance))
else:
tile_size = int(constants_inf.METERS_IN_TILE / ground_sampling_distance)
constants_inf.log(<str> f'calc tile size: {tile_size}')
res = self.split_to_tiles(frame, path, tile_size, ai_config.big_image_tile_overlap_percent)
for tile_frame, omn, tile_name in res:
all_frame_data.append((tile_frame, omn, tile_name, ground_sampling_distance))
img_h, img_w, _ = frame.shape
ground_sampling_distance = ai_config.sensor_width * ai_config.altitude / (ai_config.focal_length * img_w)
constants_inf.log(<str>f'ground sampling distance: {ground_sampling_distance}')
if img_h <= 1.5 * model_h and img_w <= 1.5 * model_w:
all_frame_data.append((frame, original_media_name, f'{original_media_name}_000000', ground_sampling_distance))
else:
tile_size = int(constants_inf.METERS_IN_TILE / ground_sampling_distance)
constants_inf.log(<str> f'calc tile size: {tile_size}')
res = self.split_to_tiles(frame, original_media_name, tile_size, ai_config.big_image_tile_overlap_percent)
for tile_frame, omn, tile_name in res:
all_frame_data.append((tile_frame, omn, tile_name, ground_sampling_distance))
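A worked example of the ground-sampling-distance formula above, using the AIConfigDto defaults from later in this commit (sensor_width=23.5, altitude=400, focal_length=24) and an assumed 4000 px wide image:
gsd = 23.5 * 400 / (24 * 4000)    # ~0.0979 meters per pixel
# If METERS_IN_TILE were 100 (an assumption; the constant is not shown here),
# tile_size = int(100 / gsd) would come out to 1021 px.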
cdef _finalize_image_inference(self, AIRecognitionConfig ai_config, list all_frame_data):
if not all_frame_data:
return
frames = [fd[0] for fd in all_frame_data]
all_dets = self.engine.process_frames(frames, ai_config)
for i in range(len(all_dets)):
frame_entry = all_frame_data[i]
f = frame_entry[0]
original_media_name = frame_entry[1]
name = frame_entry[2]
gsd = frame_entry[3]
annotation = Annotation(name, original_media_name, 0, all_dets[i])
if self.is_valid_image_annotation(annotation, gsd, f.shape):
constants_inf.log(<str> f'Detected {annotation}')
_, image = cv2.imencode('.jpg', f)
annotation.image = image.tobytes()
self.on_annotation(annotation)
self.send_detection_status()
cdef _process_images(self, AIRecognitionConfig ai_config, list[str] image_paths):
cdef list all_frame_data = []
self._tile_detections = {}
for path in image_paths:
frame = cv2.imread(<str>path)
if frame is None:
constants_inf.logerror(<str>f'Failed to read image {path}')
continue
original_media_name = Path(<str> path).stem.replace(" ", "")
self._append_image_frame_entries(ai_config, all_frame_data, frame, original_media_name)
self._finalize_image_inference(ai_config, all_frame_data)
cdef send_detection_status(self):
if self._status_callback is not None:
cb = self._status_callback
@@ -304,14 +443,14 @@ cdef class Inference:
cb(media_name, self.detection_counts[media_name])
self.detection_counts.clear()
cdef split_to_tiles(self, frame, path, tile_size, overlap_percent):
constants_inf.log(<str>f'splitting image {path} to tiles...')
cdef split_to_tiles(self, frame, str media_stem, tile_size, overlap_percent):
constants_inf.log(<str>f'splitting image {media_stem} to tiles...')
img_h, img_w, _ = frame.shape
stride_w = int(tile_size * (1 - overlap_percent / 100))
stride_h = int(tile_size * (1 - overlap_percent / 100))
results = []
original_media_name = Path(<str> path).stem.replace(" ", "")
original_media_name = media_stem
for y in range(0, img_h, stride_h):
for x in range(0, img_w, stride_w):
x_end = min(x + tile_size, img_w)
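For the overlap math in split_to_tiles: with an assumed tile_size of 1024 and the default overlap_percent of 20, the stride works out as below:
stride = int(1024 * (1 - 20 / 100))   # 819 px step, ~205 px overlap per tile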
+2
@@ -6,3 +6,5 @@ cdef class LoaderHttpClient:
cdef str base_url
cdef LoadResult load_big_small_resource(self, str filename, str directory)
cdef LoadResult upload_big_small_resource(self, bytes content, str filename, str directory)
cpdef object fetch_user_ai_settings(self, str user_id, str bearer_token)
cpdef object fetch_media_path(self, str media_id, str bearer_token)
+35
@@ -41,3 +41,38 @@ cdef class LoaderHttpClient:
except Exception as e:
logger.error(f"LoaderHttpClient.upload_big_small_resource failed: {e}")
return LoadResult(str(e))
cpdef object fetch_user_ai_settings(self, str user_id, str bearer_token):
try:
headers = {}
if bearer_token:
headers["Authorization"] = f"Bearer {bearer_token}"
response = requests.get(
f"{self.base_url}/api/users/{user_id}/ai-settings",
headers=headers,
timeout=30,
)
if response.status_code != 200:
return None
return response.json()
except Exception as e:
logger.error(f"LoaderHttpClient.fetch_user_ai_settings failed: {e}")
return None
cpdef object fetch_media_path(self, str media_id, str bearer_token):
try:
headers = {}
if bearer_token:
headers["Authorization"] = f"Bearer {bearer_token}"
response = requests.get(
f"{self.base_url}/api/media/{media_id}",
headers=headers,
timeout=30,
)
if response.status_code != 200:
return None
data = response.json()
return data.get("path")
except Exception as e:
logger.error(f"LoaderHttpClient.fetch_media_path failed: {e}")
return None
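A hypothetical usage of the two new client methods (base URL and ids are placeholders); both return None on any failure rather than raising:
client = LoaderHttpClient("http://annotations:8080")
settings = client.fetch_user_ai_settings("user-123", bearer)   # dict or None
media_path = client.fetch_media_path("media-456", bearer)      # str or None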
+158 -23
@@ -4,10 +4,10 @@ import json
import os
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Optional
from typing import Annotated, Optional
import requests as http_requests
from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Request
from fastapi import Body, FastAPI, UploadFile, File, Form, HTTPException, Request
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
@@ -20,6 +20,7 @@ LOADER_URL = os.environ.get("LOADER_URL", "http://loader:8080")
ANNOTATIONS_URL = os.environ.get("ANNOTATIONS_URL", "http://annotations:8080")
loader_client = LoaderHttpClient(LOADER_URL)
annotations_client = LoaderHttpClient(ANNOTATIONS_URL)
inference = None
_event_queues: list[asyncio.Queue] = []
_active_detections: dict[str, asyncio.Task] = {}
@@ -60,6 +61,29 @@ class TokenManager:
except Exception:
return None
@staticmethod
def decode_user_id(token: str) -> Optional[str]:
try:
payload = token.split(".")[1]
padding = 4 - len(payload) % 4
if padding != 4:
payload += "=" * padding
data = json.loads(base64.urlsafe_b64decode(payload))
uid = (
data.get("sub")
or data.get("userId")
or data.get("user_id")
or data.get("nameid")
or data.get(
"http://schemas.xmlsoap.org/ws/2005/05/identity/claims/nameidentifier"
)
)
if uid is None:
return None
return str(uid)
except Exception:
return None
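A worked example for decode_user_id; the token body below is hand-crafted base64url of {"sub":"42"}, not a real signed JWT:
token = "hdr.eyJzdWIiOiI0MiJ9.sig"
assert TokenManager.decode_user_id(token) == "42"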
def get_inference():
global inference
@@ -105,7 +129,115 @@ class AIConfigDto(BaseModel):
altitude: float = 400
focal_length: float = 24
sensor_width: float = 23.5
paths: list[str] = []
_AI_SETTINGS_FIELD_KEYS = (
(
"frame_period_recognition",
("frame_period_recognition", "framePeriodRecognition", "FramePeriodRecognition"),
),
(
"frame_recognition_seconds",
("frame_recognition_seconds", "frameRecognitionSeconds", "FrameRecognitionSeconds"),
),
(
"probability_threshold",
("probability_threshold", "probabilityThreshold", "ProbabilityThreshold"),
),
(
"tracking_distance_confidence",
(
"tracking_distance_confidence",
"trackingDistanceConfidence",
"TrackingDistanceConfidence",
),
),
(
"tracking_probability_increase",
(
"tracking_probability_increase",
"trackingProbabilityIncrease",
"TrackingProbabilityIncrease",
),
),
(
"tracking_intersection_threshold",
(
"tracking_intersection_threshold",
"trackingIntersectionThreshold",
"TrackingIntersectionThreshold",
),
),
(
"model_batch_size",
("model_batch_size", "modelBatchSize", "ModelBatchSize"),
),
(
"big_image_tile_overlap_percent",
(
"big_image_tile_overlap_percent",
"bigImageTileOverlapPercent",
"BigImageTileOverlapPercent",
),
),
(
"altitude",
("altitude", "Altitude"),
),
(
"focal_length",
("focal_length", "focalLength", "FocalLength"),
),
(
"sensor_width",
("sensor_width", "sensorWidth", "SensorWidth"),
),
)
def _merged_annotation_settings_payload(raw: object) -> dict:
if not raw or not isinstance(raw, dict):
return {}
merged = dict(raw)
inner = raw.get("aiRecognitionSettings")
if isinstance(inner, dict):
merged.update(inner)
cam = raw.get("cameraSettings")
if isinstance(cam, dict):
merged.update(cam)
out = {}
for snake, aliases in _AI_SETTINGS_FIELD_KEYS:
for key in aliases:
if key in merged and merged[key] is not None:
out[snake] = merged[key]
break
return out
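To illustrate the normalization: a camelCase payload of the shape this helper appears to expect (the shape itself is an assumption) collapses to the snake_case keys AIRecognitionConfig.from_dict consumes:
raw = {
    "aiRecognitionSettings": {"probabilityThreshold": 0.7, "modelBatchSize": 4},
    "cameraSettings": {"focalLength": 35.0},
}
_merged_annotation_settings_payload(raw)
# -> {"probability_threshold": 0.7, "model_batch_size": 4, "focal_length": 35.0}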
def _build_media_detect_config_dict(
media_id: str,
token_mgr: Optional[TokenManager],
override: Optional[AIConfigDto],
) -> dict:
cfg: dict = {}
bearer = ""
if token_mgr:
bearer = token_mgr.get_valid_token()
uid = TokenManager.decode_user_id(token_mgr.access_token)
if uid:
raw = annotations_client.fetch_user_ai_settings(uid, bearer)
cfg.update(_merged_annotation_settings_payload(raw))
if override is not None:
for k, v in override.model_dump(exclude_defaults=True).items():
cfg[k] = v
media_path = annotations_client.fetch_media_path(media_id, bearer)
if not media_path:
raise HTTPException(
status_code=503,
detail="Could not resolve media path from annotations service",
)
cfg["paths"] = [media_path]
return cfg
def detection_to_dto(det) -> DetectionDto:
@@ -150,9 +282,11 @@ async def detect_image(
file: UploadFile = File(...),
config: Optional[str] = Form(None),
):
import tempfile
import cv2
import numpy as np
from pathlib import Path
from inference import ai_config_from_dict
image_bytes = await file.read()
if not image_bytes:
@@ -166,21 +300,21 @@ async def detect_image(
if config:
config_dict = json.loads(config)
suffix = os.path.splitext(file.filename or "upload.jpg")[1] or ".jpg"
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
media_name = Path(file.filename or "upload.jpg").stem.replace(" ", "")
loop = asyncio.get_event_loop()
inf = get_inference()
results = []
def on_annotation(annotation, percent):
results.extend(annotation.detections)
ai_cfg = ai_config_from_dict(config_dict)
def run_img():
inf.run_detect_image(image_bytes, ai_cfg, media_name, on_annotation)
try:
tmp.write(image_bytes)
tmp.close()
config_dict["paths"] = [tmp.name]
loop = asyncio.get_event_loop()
inf = get_inference()
results = []
def on_annotation(annotation, percent):
results.extend(annotation.detections)
await loop.run_in_executor(executor, inf.run_detect, config_dict, on_annotation)
await loop.run_in_executor(executor, run_img)
return [detection_to_dto(d) for d in results]
except RuntimeError as e:
if "not available" in str(e):
@@ -188,8 +322,6 @@ async def detect_image(
raise HTTPException(status_code=422, detail=str(e))
except ValueError as e:
raise HTTPException(status_code=400, detail=str(e))
finally:
os.unlink(tmp.name)
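For the reworked handler, a hypothetical client call; the route path and host are assumptions, since only the handler name detect_image is visible in this hunk:
import json, requests
resp = requests.post(
    "http://localhost:8000/detect-image",        # assumed route
    files={"file": ("field.jpg", open("field.jpg", "rb"), "image/jpeg")},
    data={"config": json.dumps({"altitude": 400})},
)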
def _post_annotation_to_service(token_mgr: TokenManager, media_id: str,
@@ -216,7 +348,11 @@ def _post_annotation_to_service(token_mgr: TokenManager, media_id: str,
@app.post("/detect/{media_id}")
async def detect_media(media_id: str, request: Request, config: Optional[AIConfigDto] = None):
async def detect_media(
media_id: str,
request: Request,
config: Annotated[Optional[AIConfigDto], Body()] = None,
):
existing = _active_detections.get(media_id)
if existing is not None and not existing.done():
raise HTTPException(status_code=409, detail="Detection already in progress for this media")
@@ -226,8 +362,7 @@ async def detect_media(media_id: str, request: Request, config: Optional[AIConfi
refresh_token = request.headers.get("x-refresh-token", "")
token_mgr = TokenManager(access_token, refresh_token) if access_token else None
cfg = config or AIConfigDto()
config_dict = cfg.model_dump()
config_dict = _build_media_detect_config_dict(media_id, token_mgr, config)
async def run_detection():
loop = asyncio.get_event_loop()
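Finally, a hypothetical call against the updated /detect/{media_id} route, where the optional JSON body is now parsed as an AIConfigDto override layered over the DB-backed settings (host and token values assumed; the Authorization header name is an assumption, while x-refresh-token appears in this diff):
import requests
resp = requests.post(
    "http://localhost:8000/detect/media-456",
    headers={"Authorization": f"Bearer {access}", "x-refresh-token": refresh},
    json={"model_batch_size": 4},    # optional per-request override
)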