From 47de31307bd291e15e14b9a9a4bb2f6b6e8c6d86 Mon Sep 17 00:00:00 2001 From: Oleksandr Bezdieniezhnykh Date: Mon, 17 Nov 2025 07:46:11 +0200 Subject: [PATCH] add MediaFile --- Azaion.Inference/inference.pyx | 107 ++++++++++++++++----------------- 1 file changed, 51 insertions(+), 56 deletions(-) diff --git a/Azaion.Inference/inference.pyx b/Azaion.Inference/inference.pyx index 15b9bb9..39fe2fe 100644 --- a/Azaion.Inference/inference.pyx +++ b/Azaion.Inference/inference.pyx @@ -258,13 +258,13 @@ cdef class Inference: cdef _process_video(self, RemoteCommand cmd, AIRecognitionConfig ai_config, str video_name): cdef int frame_count = 0 - cdef list batch_frames = [] - cdef list[int] batch_timestamps = [] - cdef Annotation annotation + cdef list frame_data = [] self._previous_annotation = None + self._tile_detections = {} - + original_media_name = Path(video_name).stem.replace(" ", "") v_input = cv2.VideoCapture(video_name) + while v_input.isOpened() and not self.stop_signal: ret, frame = v_input.read() if not ret or frame is None: @@ -272,30 +272,17 @@ cdef class Inference: frame_count += 1 if frame_count % ai_config.frame_period_recognition == 0: - batch_frames.append(frame) - batch_timestamps.append(int(v_input.get(cv2.CAP_PROP_POS_MSEC))) + timestamp = int(v_input.get(cv2.CAP_PROP_POS_MSEC)) + tiles, ground_sampling_distance = self.prepare_frame_or_tiles(frame, original_media_name, timestamp, ai_config) + frame_data.extend(tiles) - if len(batch_frames) == self.engine.get_batch_size(): - input_blob = self.preprocess(batch_frames) + if len(frame_data) >= self.engine.get_batch_size(): + self.process_frame_data_batches(cmd, ai_config, frame_data, ground_sampling_distance, True) + frame_data.clear() - outputs = self.engine.run(input_blob) + if frame_data: + self.process_frame_data_batches(cmd, ai_config, frame_data, ground_sampling_distance, True) - list_detections = self.postprocess(outputs, ai_config) - for i in range(len(list_detections)): - detections = list_detections[i] - - original_media_name = Path(video_name).stem.replace(" ", "") - name = f'{original_media_name}_{constants_inf.format_time(batch_timestamps[i])}' - annotation = Annotation(name, original_media_name, batch_timestamps[i], detections) - - if self.is_valid_video_annotation(annotation, ai_config): - _, image = cv2.imencode('.jpg', batch_frames[i]) - annotation.image = image.tobytes() - self._previous_annotation = annotation - self.on_annotation(cmd, annotation) - - batch_frames.clear() - batch_timestamps.clear() v_input.release() self.send_detection_status(cmd.client_id) @@ -310,29 +297,20 @@ cdef class Inference: for path in image_paths: frame_data = [] frame = cv2.imread(path) - img_h, img_w, _ = frame.shape if frame is None: constants_inf.logerror(f'Failed to read image {path}') continue original_media_name = Path( path).stem.replace(" ", "") - ground_sampling_distance = ai_config.sensor_width * ai_config.altitude / (ai_config.focal_length * img_w) + tiles, ground_sampling_distance = self.prepare_frame_or_tiles(frame, original_media_name, 0, ai_config) constants_inf.log(f'ground sampling distance: {ground_sampling_distance}') - - if img_h <= 1.5 * self.model_height and img_w <= 1.5 * self.model_width: - frame_data.append((frame, original_media_name, f'{original_media_name}_000000')) - else: - tile_size = int(constants_inf.METERS_IN_TILE / ground_sampling_distance) - constants_inf.log( f'calc tile size: {tile_size}') - res = self.split_to_tiles(frame, path, tile_size, ai_config.big_image_tile_overlap_percent) - frame_data.extend(res) + frame_data.extend(tiles) + if len(frame_data) > self.engine.get_batch_size(): - for chunk in self.split_list_extend(frame_data, self.engine.get_batch_size()): - self._process_images_inner(cmd, ai_config, chunk, ground_sampling_distance) + self.process_frame_data_batches(cmd, ai_config, frame_data, ground_sampling_distance, False) self.send_detection_status(cmd.client_id) - for chunk in self.split_list_extend(frame_data, self.engine.get_batch_size()): - self._process_images_inner(cmd, ai_config, chunk, ground_sampling_distance) + self.process_frame_data_batches(cmd, ai_config, frame_data, ground_sampling_distance, False) self.send_detection_status(cmd.client_id) cdef send_detection_status(self, client_id): @@ -347,47 +325,67 @@ cdef class Inference: pass self.detection_counts.clear() - cdef split_to_tiles(self, frame, path, tile_size, overlap_percent): - constants_inf.log(f'splitting image {path} to tiles...') + cdef prepare_frame_or_tiles(self, frame, str original_media_name, int timestamp, AIRecognitionConfig ai_config): + img_h, img_w, _ = frame.shape + ground_sampling_distance = ai_config.sensor_width * ai_config.altitude / (ai_config.focal_length * img_w) + + if img_h <= 1.5 * self.model_height and img_w <= 1.5 * self.model_width: + name = f'{original_media_name}_{constants_inf.format_time(timestamp)}' + return [(frame, original_media_name, name, timestamp)], ground_sampling_distance + else: + tile_size = int(constants_inf.METERS_IN_TILE / ground_sampling_distance) + tiles = self.split_to_tiles(frame, original_media_name, timestamp, tile_size, ai_config.big_image_tile_overlap_percent) + return tiles, ground_sampling_distance + + cdef process_frame_data_batches(self, RemoteCommand cmd, AIRecognitionConfig ai_config, list frame_data, double ground_sampling_distance, bint is_video): + for chunk in self.split_list_extend(frame_data, self.engine.get_batch_size()): + self._process_batch(cmd, ai_config, chunk, ground_sampling_distance, is_video) + + cdef split_to_tiles(self, frame, str original_media_name, int timestamp, tile_size, overlap_percent): img_h, img_w, _ = frame.shape stride_w = int(tile_size * (1 - overlap_percent / 100)) stride_h = int(tile_size * (1 - overlap_percent / 100)) results = [] - original_media_name = Path( path).stem.replace(" ", "") for y in range(0, img_h, stride_h): for x in range(0, img_w, stride_w): x_end = min(x + tile_size, img_w) y_end = min(y + tile_size, img_h) - # correct x,y for the close-to-border tiles if x_end - x < tile_size: if img_w - (x - stride_w) <= tile_size: - continue # the previous tile already covered the last gap + continue x = img_w - tile_size if y_end - y < tile_size: if img_h - (y - stride_h) <= tile_size: - continue # the previous tile already covered the last gap + continue y = img_h - tile_size tile = frame[y:y_end, x:x_end] - name = f'{original_media_name}{constants_inf.SPLIT_SUFFIX}{tile_size:04d}_{x:04d}_{y:04d}!_000000' - results.append((tile, original_media_name, name)) + name = f'{original_media_name}{constants_inf.SPLIT_SUFFIX}{tile_size:04d}_{x:04d}_{y:04d}!_{constants_inf.format_time(timestamp)}' + results.append((tile, original_media_name, name, timestamp)) return results - cdef _process_images_inner(self, RemoteCommand cmd, AIRecognitionConfig ai_config, list frame_data, double ground_sampling_distance): - cdef list frames, original_media_names, names + cdef _process_batch(self, RemoteCommand cmd, AIRecognitionConfig ai_config, list frame_data, double ground_sampling_distance, bint is_video): + cdef list frames, original_media_names, names, timestamps cdef Annotation annotation cdef int i - frames, original_media_names, names = map(list, zip(*frame_data)) + + frames, original_media_names, names, timestamps = map(list, zip(*frame_data)) input_blob = self.preprocess(frames) outputs = self.engine.run(input_blob) list_detections = self.postprocess(outputs, ai_config) for i in range(len(list_detections)): - annotation = Annotation(names[i], original_media_names[i], 0, list_detections[i]) - if self.is_valid_image_annotation(annotation, ground_sampling_distance, frames[i].shape): + annotation = Annotation(names[i], original_media_names[i], timestamps[i], list_detections[i]) + + if self.check_valid_sizes(annotation, ground_sampling_distance, frames[i].shape): + if is_video: + if not self.is_valid_video_annotation(annotation, ai_config): + continue + self._previous_annotation = annotation + constants_inf.log( f'Detected {annotation}') _, image = cv2.imencode('.jpg', frames[i]) annotation.image = image.tobytes() @@ -423,7 +421,7 @@ cdef class Inference: annotation.detections = unique_detections - cdef bint is_valid_image_annotation(self, Annotation annotation, double ground_sampling_distance, frame_shape): + cdef bint check_valid_sizes(self, Annotation annotation, double ground_sampling_distance, frame_shape): if constants_inf.SPLIT_SUFFIX in annotation.name: self.remove_tiled_duplicates(annotation) img_h, img_w, _ = frame_shape @@ -442,7 +440,6 @@ cdef class Inference: else: constants_inf.log( f'Removed ({m_w} {m_h}) > {max_size}. class: {constants_inf.annotations_dict[det.cls].name}') - # Replace the old list with the new, filtered one annotation.detections = valid_detections if not annotation.detections: @@ -450,8 +447,6 @@ cdef class Inference: return True cdef bint is_valid_video_annotation(self, Annotation annotation, AIRecognitionConfig ai_config): - if constants_inf.SPLIT_SUFFIX in annotation.name: - self.remove_tiled_duplicates(annotation) if not annotation.detections: return False