From 47de31307bd291e15e14b9a9a4bb2f6b6e8c6d86 Mon Sep 17 00:00:00 2001
From: Oleksandr Bezdieniezhnykh <zxsanny@gmail.com>
Date: Mon, 17 Nov 2025 07:46:11 +0200
Subject: [PATCH] add MediaFile

---
 Azaion.Inference/inference.pyx | 107 ++++++++++++++++-----------------
 1 file changed, 51 insertions(+), 56 deletions(-)
diff --git a/Azaion.Inference/inference.pyx b/Azaion.Inference/inference.pyx
index 15b9bb9..39fe2fe 100644
--- a/Azaion.Inference/inference.pyx
+++ b/Azaion.Inference/inference.pyx
@@ -258,13 +258,13 @@ cdef class Inference:
 
     cdef _process_video(self, RemoteCommand cmd, AIRecognitionConfig ai_config, str video_name):
         cdef int frame_count = 0
-        cdef list batch_frames = []
-        cdef list[int] batch_timestamps = []
-        cdef Annotation annotation
+        cdef list frame_data = []
         self._previous_annotation = None
+        self._tile_detections = {}
 
-
+        original_media_name = Path(<str>video_name).stem.replace(" ", "")
         v_input = cv2.VideoCapture(<str>video_name)
+        
         while v_input.isOpened() and not self.stop_signal:
             ret, frame = v_input.read()
             if not ret or frame is None:
@@ -272,30 +272,17 @@ cdef class Inference:
 
             frame_count += 1
             if frame_count % ai_config.frame_period_recognition == 0:
-                batch_frames.append(frame)
-                batch_timestamps.append(int(v_input.get(cv2.CAP_PROP_POS_MSEC)))
+                timestamp = int(v_input.get(cv2.CAP_PROP_POS_MSEC))
+                tiles, ground_sampling_distance = self.prepare_frame_or_tiles(frame, original_media_name, timestamp, ai_config)
+                frame_data.extend(tiles)
 
-            if len(batch_frames) == self.engine.get_batch_size():
-                input_blob = self.preprocess(batch_frames)
+            if len(frame_data) >= self.engine.get_batch_size():
+                self.process_frame_data_batches(cmd, ai_config, frame_data, ground_sampling_distance, True)
+                frame_data.clear()
 
-                outputs = self.engine.run(input_blob)
+        if frame_data:
+            self.process_frame_data_batches(cmd, ai_config, frame_data, ground_sampling_distance, True)
 
-                list_detections = self.postprocess(outputs, ai_config)
-                for i in range(len(list_detections)):
-                    detections = list_detections[i]
-
-                    original_media_name = Path(<str>video_name).stem.replace(" ", "")
-                    name = f'{original_media_name}_{constants_inf.format_time(batch_timestamps[i])}'
-                    annotation = Annotation(name, original_media_name, batch_timestamps[i], detections)
-
-                    if self.is_valid_video_annotation(annotation, ai_config):
-                        _, image = cv2.imencode('.jpg', batch_frames[i])
-                        annotation.image = image.tobytes()
-                        self._previous_annotation = annotation
-                        self.on_annotation(cmd, annotation)
-
-                batch_frames.clear()
-                batch_timestamps.clear()
         v_input.release()
         self.send_detection_status(cmd.client_id)
 
@@ -310,29 +297,20 @@ cdef class Inference:
         for path in image_paths:
             frame_data = []
             frame = cv2.imread(<str>path)
-            img_h, img_w, _ = frame.shape
             if frame is None:
                 constants_inf.logerror(<str>f'Failed to read image {path}')
                 continue
             original_media_name = Path(<str> path).stem.replace(" ", "")
 
-            ground_sampling_distance = ai_config.sensor_width * ai_config.altitude / (ai_config.focal_length * img_w)
+            tiles, ground_sampling_distance = self.prepare_frame_or_tiles(frame, original_media_name, 0, ai_config)
             constants_inf.log(<str>f'ground sampling distance: {ground_sampling_distance}')
-
-            if img_h <= 1.5 * self.model_height and img_w <= 1.5 * self.model_width:
-                frame_data.append((frame, original_media_name, f'{original_media_name}_000000'))
-            else:
-                tile_size = int(constants_inf.METERS_IN_TILE / ground_sampling_distance)
-                constants_inf.log(<str> f'calc tile size: {tile_size}')
-                res = self.split_to_tiles(frame, path, tile_size, ai_config.big_image_tile_overlap_percent)
-                frame_data.extend(res)
+            frame_data.extend(tiles)
+            
             if len(frame_data) > self.engine.get_batch_size():
-                for chunk in self.split_list_extend(frame_data, self.engine.get_batch_size()):
-                    self._process_images_inner(cmd, ai_config, chunk, ground_sampling_distance)
+                self.process_frame_data_batches(cmd, ai_config, frame_data, ground_sampling_distance, False)
                 self.send_detection_status(cmd.client_id)
 
-        for chunk in self.split_list_extend(frame_data, self.engine.get_batch_size()):
-            self._process_images_inner(cmd, ai_config, chunk, ground_sampling_distance)
+        self.process_frame_data_batches(cmd, ai_config, frame_data, ground_sampling_distance, False)
         self.send_detection_status(cmd.client_id)
 
     cdef send_detection_status(self, client_id):
@@ -347,47 +325,67 @@ cdef class Inference:
                 pass
         self.detection_counts.clear()
 
-    cdef split_to_tiles(self, frame, path, tile_size, overlap_percent):
-        constants_inf.log(<str>f'splitting image {path} to tiles...')
+    cdef prepare_frame_or_tiles(self, frame, str original_media_name, int timestamp, AIRecognitionConfig ai_config):  # returns (list of (image, media name, annotation name, timestamp) tuples, ground sampling distance)
+        img_h, img_w, _ = frame.shape
+        ground_sampling_distance = ai_config.sensor_width * ai_config.altitude / (ai_config.focal_length * img_w)  # NOTE(review): presumably metres per pixel, since METERS_IN_TILE is divided by it below - confirm config units
+        # Frames within 1.5x of the model input in both dimensions are passed through whole; larger ones are tiled.
+        if img_h <= 1.5 * self.model_height and img_w <= 1.5 * self.model_width:
+            name = f'{original_media_name}_{constants_inf.format_time(timestamp)}'
+            return [(frame, original_media_name, name, timestamp)], ground_sampling_distance
+        else:
+            tile_size = int(constants_inf.METERS_IN_TILE / ground_sampling_distance)  # tile edge length; split_to_tiles uses it as a pixel count
+            tiles = self.split_to_tiles(frame, original_media_name, timestamp, tile_size, ai_config.big_image_tile_overlap_percent)
+            return tiles, ground_sampling_distance
+
+    cdef process_frame_data_batches(self, RemoteCommand cmd, AIRecognitionConfig ai_config, list frame_data, double ground_sampling_distance, bint is_video):  # feeds frame_data to _process_batch one chunk at a time
+        for chunk in self.split_list_extend(frame_data, self.engine.get_batch_size()):  # NOTE(review): split_list_extend is defined outside this patch; presumably it pads the last chunk to a full batch - confirm
+            self._process_batch(cmd, ai_config, chunk, ground_sampling_distance, is_video)
+
+    cdef split_to_tiles(self, frame, str original_media_name, int timestamp, tile_size, overlap_percent):  # slices frame into overlapping tiles of edge tile_size; returns a list of (tile, media name, tile name, timestamp) tuples
         img_h, img_w, _ = frame.shape
         stride_w = int(tile_size * (1 - overlap_percent / 100))
         stride_h = int(tile_size * (1 - overlap_percent / 100))
 
         results = []
-        original_media_name = Path(<str> path).stem.replace(" ", "")
         for y in range(0, img_h, stride_h):
             for x in range(0, img_w, stride_w):
                 x_end = min(x + tile_size, img_w)
                 y_end = min(y + tile_size, img_h)
 
-                # correct x,y for the close-to-border tiles
                 if x_end - x < tile_size:
                     if img_w - (x - stride_w) <= tile_size:
-                        continue  # the previous tile already covered the last gap
+                        continue  # the previous tile already covered the last gap
                     x = img_w - tile_size
                 if y_end - y < tile_size:
                     if img_h - (y - stride_h) <= tile_size:
-                        continue  # the previous tile already covered the last gap
+                        continue  # the previous tile already covered the last gap
                     y = img_h - tile_size
 
                 tile = frame[y:y_end, x:x_end]
-                name = f'{original_media_name}{constants_inf.SPLIT_SUFFIX}{tile_size:04d}_{x:04d}_{y:04d}!_000000'
-                results.append((tile, original_media_name, name))
+                name = f'{original_media_name}{constants_inf.SPLIT_SUFFIX}{tile_size:04d}_{x:04d}_{y:04d}!_{constants_inf.format_time(timestamp)}'  # name encodes the tile size and the tile's top-left x, y, then the formatted timestamp
+                results.append((tile, original_media_name, name, timestamp))  # same 4-tuple layout that _process_batch unpacks
         return results
 
-    cdef _process_images_inner(self, RemoteCommand cmd, AIRecognitionConfig ai_config, list frame_data, double ground_sampling_distance):
-        cdef list frames, original_media_names, names
+    cdef _process_batch(self, RemoteCommand cmd, AIRecognitionConfig ai_config, list frame_data, double ground_sampling_distance, bint is_video):
+        cdef list frames, original_media_names, names, timestamps
         cdef Annotation annotation
         cdef int i
-        frames, original_media_names, names = map(list, zip(*frame_data))
+        
+        frames, original_media_names, names, timestamps = map(list, zip(*frame_data))
 
         input_blob = self.preprocess(frames)
         outputs = self.engine.run(input_blob)
 
         list_detections = self.postprocess(outputs, ai_config)
         for i in range(len(list_detections)):
-            annotation = Annotation(names[i], original_media_names[i], 0, list_detections[i])
-            if self.is_valid_image_annotation(annotation, ground_sampling_distance, frames[i].shape):
+            annotation = Annotation(names[i], original_media_names[i], timestamps[i], list_detections[i])
+            
+            if self.check_valid_sizes(annotation, ground_sampling_distance, frames[i].shape):
+                if is_video:
+                    if not self.is_valid_video_annotation(annotation, ai_config):
+                        continue
+                    self._previous_annotation = annotation
+                
                 constants_inf.log(<str> f'Detected {annotation}')
                 _, image = cv2.imencode('.jpg', frames[i])
                 annotation.image = image.tobytes()
@@ -423,7 +421,7 @@ cdef class Inference:
 
         annotation.detections = unique_detections
 
-    cdef bint is_valid_image_annotation(self, Annotation annotation, double ground_sampling_distance, frame_shape):
+    cdef bint check_valid_sizes(self, Annotation annotation, double ground_sampling_distance, frame_shape):
         if constants_inf.SPLIT_SUFFIX in annotation.name:
             self.remove_tiled_duplicates(annotation)
         img_h, img_w, _ = frame_shape
@@ -442,7 +440,6 @@ cdef class Inference:
             else:
                 constants_inf.log(<str> f'Removed ({m_w} {m_h}) > {max_size}. class: {constants_inf.annotations_dict[det.cls].name}')
 
-        # Replace the old list with the new, filtered one
         annotation.detections = valid_detections
 
         if not annotation.detections:
@@ -450,8 +447,6 @@ cdef class Inference:
         return True
 
     cdef bint is_valid_video_annotation(self, Annotation annotation, AIRecognitionConfig ai_config):
-        if constants_inf.SPLIT_SUFFIX in annotation.name:
-            self.remove_tiled_duplicates(annotation)
         if not annotation.detections:
             return False