diff --git a/Azaion.Inference/inference.pyx b/Azaion.Inference/inference.pyx index 39fe2fe..15b9bb9 100644 --- a/Azaion.Inference/inference.pyx +++ b/Azaion.Inference/inference.pyx @@ -258,13 +258,13 @@ cdef class Inference: cdef _process_video(self, RemoteCommand cmd, AIRecognitionConfig ai_config, str video_name): cdef int frame_count = 0 - cdef list frame_data = [] + cdef list batch_frames = [] + cdef list[int] batch_timestamps = [] + cdef Annotation annotation self._previous_annotation = None - self._tile_detections = {} - original_media_name = Path(video_name).stem.replace(" ", "") + v_input = cv2.VideoCapture(video_name) - while v_input.isOpened() and not self.stop_signal: ret, frame = v_input.read() if not ret or frame is None: @@ -272,17 +272,30 @@ cdef class Inference: frame_count += 1 if frame_count % ai_config.frame_period_recognition == 0: - timestamp = int(v_input.get(cv2.CAP_PROP_POS_MSEC)) - tiles, ground_sampling_distance = self.prepare_frame_or_tiles(frame, original_media_name, timestamp, ai_config) - frame_data.extend(tiles) + batch_frames.append(frame) + batch_timestamps.append(int(v_input.get(cv2.CAP_PROP_POS_MSEC))) - if len(frame_data) >= self.engine.get_batch_size(): - self.process_frame_data_batches(cmd, ai_config, frame_data, ground_sampling_distance, True) - frame_data.clear() + if len(batch_frames) == self.engine.get_batch_size(): + input_blob = self.preprocess(batch_frames) - if frame_data: - self.process_frame_data_batches(cmd, ai_config, frame_data, ground_sampling_distance, True) + outputs = self.engine.run(input_blob) + list_detections = self.postprocess(outputs, ai_config) + for i in range(len(list_detections)): + detections = list_detections[i] + + original_media_name = Path(video_name).stem.replace(" ", "") + name = f'{original_media_name}_{constants_inf.format_time(batch_timestamps[i])}' + annotation = Annotation(name, original_media_name, batch_timestamps[i], detections) + + if self.is_valid_video_annotation(annotation, 
ai_config): + _, image = cv2.imencode('.jpg', batch_frames[i]) + annotation.image = image.tobytes() + self._previous_annotation = annotation + self.on_annotation(cmd, annotation) + + batch_frames.clear() + batch_timestamps.clear() v_input.release() self.send_detection_status(cmd.client_id) @@ -297,20 +310,29 @@ cdef class Inference: for path in image_paths: frame_data = [] frame = cv2.imread(path) if frame is None: constants_inf.logerror(f'Failed to read image {path}') continue + img_h, img_w, _ = frame.shape original_media_name = Path( path).stem.replace(" ", "") - tiles, ground_sampling_distance = self.prepare_frame_or_tiles(frame, original_media_name, 0, ai_config) + ground_sampling_distance = ai_config.sensor_width * ai_config.altitude / (ai_config.focal_length * img_w) constants_inf.log(f'ground sampling distance: {ground_sampling_distance}') - frame_data.extend(tiles) - + + if img_h <= 1.5 * self.model_height and img_w <= 1.5 * self.model_width: + frame_data.append((frame, original_media_name, f'{original_media_name}_000000')) + else: + tile_size = int(constants_inf.METERS_IN_TILE / ground_sampling_distance) + constants_inf.log( f'calc tile size: {tile_size}') + res = self.split_to_tiles(frame, path, tile_size, ai_config.big_image_tile_overlap_percent) + frame_data.extend(res) if len(frame_data) > self.engine.get_batch_size(): - self.process_frame_data_batches(cmd, ai_config, frame_data, ground_sampling_distance, False) + for chunk in self.split_list_extend(frame_data, self.engine.get_batch_size()): + self._process_images_inner(cmd, ai_config, chunk, ground_sampling_distance) self.send_detection_status(cmd.client_id) - self.process_frame_data_batches(cmd, ai_config, frame_data, ground_sampling_distance, False) + for chunk in self.split_list_extend(frame_data, self.engine.get_batch_size()): + self._process_images_inner(cmd, ai_config, chunk, ground_sampling_distance) self.send_detection_status(cmd.client_id) cdef send_detection_status(self, client_id): @@ 
-325,67 +347,47 @@ cdef class Inference: pass self.detection_counts.clear() - cdef prepare_frame_or_tiles(self, frame, str original_media_name, int timestamp, AIRecognitionConfig ai_config): - img_h, img_w, _ = frame.shape - ground_sampling_distance = ai_config.sensor_width * ai_config.altitude / (ai_config.focal_length * img_w) - - if img_h <= 1.5 * self.model_height and img_w <= 1.5 * self.model_width: - name = f'{original_media_name}_{constants_inf.format_time(timestamp)}' - return [(frame, original_media_name, name, timestamp)], ground_sampling_distance - else: - tile_size = int(constants_inf.METERS_IN_TILE / ground_sampling_distance) - tiles = self.split_to_tiles(frame, original_media_name, timestamp, tile_size, ai_config.big_image_tile_overlap_percent) - return tiles, ground_sampling_distance - - cdef process_frame_data_batches(self, RemoteCommand cmd, AIRecognitionConfig ai_config, list frame_data, double ground_sampling_distance, bint is_video): - for chunk in self.split_list_extend(frame_data, self.engine.get_batch_size()): - self._process_batch(cmd, ai_config, chunk, ground_sampling_distance, is_video) - - cdef split_to_tiles(self, frame, str original_media_name, int timestamp, tile_size, overlap_percent): + cdef split_to_tiles(self, frame, path, tile_size, overlap_percent): + constants_inf.log(f'splitting image {path} to tiles...') img_h, img_w, _ = frame.shape stride_w = int(tile_size * (1 - overlap_percent / 100)) stride_h = int(tile_size * (1 - overlap_percent / 100)) results = [] + original_media_name = Path( path).stem.replace(" ", "") for y in range(0, img_h, stride_h): for x in range(0, img_w, stride_w): x_end = min(x + tile_size, img_w) y_end = min(y + tile_size, img_h) + # correct x,y for the close-to-border tiles if x_end - x < tile_size: if img_w - (x - stride_w) <= tile_size: - continue + continue # the previous tile already covered the last gap x = img_w - tile_size if y_end - y < tile_size: if img_h - (y - stride_h) <= tile_size: - continue 
+ continue # the previous tile already covered the last gap y = img_h - tile_size tile = frame[y:y_end, x:x_end] - name = f'{original_media_name}{constants_inf.SPLIT_SUFFIX}{tile_size:04d}_{x:04d}_{y:04d}!_{constants_inf.format_time(timestamp)}' - results.append((tile, original_media_name, name, timestamp)) + name = f'{original_media_name}{constants_inf.SPLIT_SUFFIX}{tile_size:04d}_{x:04d}_{y:04d}!_000000' + results.append((tile, original_media_name, name)) return results - cdef _process_batch(self, RemoteCommand cmd, AIRecognitionConfig ai_config, list frame_data, double ground_sampling_distance, bint is_video): - cdef list frames, original_media_names, names, timestamps + cdef _process_images_inner(self, RemoteCommand cmd, AIRecognitionConfig ai_config, list frame_data, double ground_sampling_distance): + cdef list frames, original_media_names, names cdef Annotation annotation cdef int i - - frames, original_media_names, names, timestamps = map(list, zip(*frame_data)) + frames, original_media_names, names = map(list, zip(*frame_data)) input_blob = self.preprocess(frames) outputs = self.engine.run(input_blob) list_detections = self.postprocess(outputs, ai_config) for i in range(len(list_detections)): - annotation = Annotation(names[i], original_media_names[i], timestamps[i], list_detections[i]) - - if self.check_valid_sizes(annotation, ground_sampling_distance, frames[i].shape): - if is_video: - if not self.is_valid_video_annotation(annotation, ai_config): - continue - self._previous_annotation = annotation - + annotation = Annotation(names[i], original_media_names[i], 0, list_detections[i]) + if self.is_valid_image_annotation(annotation, ground_sampling_distance, frames[i].shape): constants_inf.log( f'Detected {annotation}') _, image = cv2.imencode('.jpg', frames[i]) annotation.image = image.tobytes() @@ -421,7 +423,7 @@ cdef class Inference: annotation.detections = unique_detections - cdef bint check_valid_sizes(self, Annotation annotation, double 
ground_sampling_distance, frame_shape): + cdef bint is_valid_image_annotation(self, Annotation annotation, double ground_sampling_distance, frame_shape): if constants_inf.SPLIT_SUFFIX in annotation.name: self.remove_tiled_duplicates(annotation) img_h, img_w, _ = frame_shape @@ -440,6 +442,7 @@ cdef class Inference: else: constants_inf.log( f'Removed ({m_w} {m_h}) > {max_size}. class: {constants_inf.annotations_dict[det.cls].name}') + # Replace the old list with the new, filtered one annotation.detections = valid_detections if not annotation.detections: @@ -447,6 +450,8 @@ cdef class Inference: return True cdef bint is_valid_video_annotation(self, Annotation annotation, AIRecognitionConfig ai_config): + if constants_inf.SPLIT_SUFFIX in annotation.name: + self.remove_tiled_duplicates(annotation) if not annotation.detections: return False