From b3db108f59db738a30b1dae5bed62a16092bab0f Mon Sep 17 00:00:00 2001
From: Alex Bezdieniezhnykh <zxsanny@gmail.com>
Date: Wed, 30 Apr 2025 21:43:36 +0300
Subject: [PATCH 1/5] add missing packages to build script

---
 Azaion.Inference/azaion-inference.spec | 6 +++++-
 Azaion.Inference/build.cmd             | 2 ++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/Azaion.Inference/azaion-inference.spec b/Azaion.Inference/azaion-inference.spec
index 80ecf15..5a83791 100644
--- a/Azaion.Inference/azaion-inference.spec
+++ b/Azaion.Inference/azaion-inference.spec
@@ -4,7 +4,7 @@ from PyInstaller.utils.hooks import collect_all
 
 datas = [('venv\\Lib\\site-packages\\cv2', 'cv2')]
 binaries = []
-hiddenimports = ['constants', 'annotation', 'credentials', 'file_data', 'user', 'security', 'secure_model', 'cdn_manager', 'api_client', 'hardware_service', 'remote_command', 'ai_config', 'inference_engine', 'inference', 'remote_command_handler']
+hiddenimports = ['constants', 'annotation', 'credentials', 'file_data', 'user', 'security', 'secure_model', 'cdn_manager', 'api_client', 'hardware_service', 'remote_command', 'ai_config', 'tensorrt_engine', 'onnx_engine', 'inference_engine', 'inference', 'remote_command_handler']
 hiddenimports += collect_submodules('cv2')
 tmp_ret = collect_all('requests')
 datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
@@ -28,6 +28,10 @@ tmp_ret = collect_all('pynvml')
 datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
 tmp_ret = collect_all('boto3')
 datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
+tmp_ret = collect_all('re')
+datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
+tmp_ret = collect_all('jwt')
+datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
 
 
 a = Analysis(
diff --git a/Azaion.Inference/build.cmd b/Azaion.Inference/build.cmd
index ee58445..6b451e1 100644
--- a/Azaion.Inference/build.cmd
+++ b/Azaion.Inference/build.cmd
@@ -44,6 +44,8 @@ venv\Scripts\pyinstaller --name=azaion-inference ^
 --hidden-import hardware_service ^
 --hidden-import remote_command ^
 --hidden-import ai_config ^
+--hidden-import tensorrt_engine ^
+--hidden-import onnx_engine ^
 --hidden-import inference_engine ^
 --hidden-import inference ^
 --hidden-import remote_command_handler ^

From ae83bc8542cf1b9c712a4aba8b904d723c4f8803 Mon Sep 17 00:00:00 2001
From: Alex Bezdieniezhnykh <zxsanny@gmail.com>
Date: Wed, 30 Apr 2025 22:19:42 +0300
Subject: [PATCH 2/5] fix NVIDIA GPU detection by querying NVML instead of nvidia-smi

---
 Azaion.Inference/hardware_service.pyx | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/Azaion.Inference/hardware_service.pyx b/Azaion.Inference/hardware_service.pyx
index 7a1e97c..9782b21 100644
--- a/Azaion.Inference/hardware_service.pyx
+++ b/Azaion.Inference/hardware_service.pyx
@@ -1,6 +1,7 @@
 import re
 import subprocess
 import psutil
+import pynvml
 
 cdef class HardwareInfo:
     def __init__(self, str cpu, str gpu, str memory, str mac_address):
@@ -46,14 +47,25 @@ cdef class HardwareService:
     @staticmethod
     cdef has_nvidia_gpu():
+        # Return True when NVML enumerates at least one NVIDIA GPU, else False.
         try:
-            output = subprocess.check_output(['nvidia-smi']).decode()
-            match = re.search(r'CUDA Version:\s*([\d.]+)', output)
-            if match:
-                return float(match.group(1)) > 11
-            return False
-        except Exception as e:
-            print(e)
+            pynvml.nvmlInit()
+            device_count = pynvml.nvmlDeviceGetCount()
+
+            if device_count > 0:
+                print(f"Found {device_count} NVIDIA GPU(s).")
+                return True
+            print("No NVIDIA GPUs found by NVML.")
+            return False
+        except pynvml.NVMLError as error:
+            # Include the NVML error so a missing driver is distinguishable in logs.
+            print(f"Failed to find NVIDIA GPU: {error}")
             return False
+        finally:
+            try:
+                pynvml.nvmlShutdown()
+            except Exception:
+                # Best-effort cleanup; presumably fails when nvmlInit() did not succeed.
+                print('Failed to shutdown pynvml cause probably no NVidia GPU')
 
     cdef HardwareInfo get_hardware_info(self):
         if self.is_windows:

From 1c4bdabfb51a8462d20e98af7ed62748d8b8e0e8 Mon Sep 17 00:00:00 2001
From: Alex Bezdieniezhnykh <zxsanny@gmail.com>
Date: Wed, 30 Apr 2025 23:08:53 +0300
Subject: [PATCH 3/5] import Tensorrt not in compile time in order to
 dynamically load tensorrt only if nvidia gpu is present

---
 Azaion.Inference/inference.pyx       |  2 +-
 Azaion.Inference/onnx_engine.pyx     |  4 ++--
 Azaion.Inference/tensorrt_engine.pxd | 14 +++-----------
 Azaion.Inference/tensorrt_engine.pyx | 12 ++++++------
 4 files changed, 12 insertions(+), 20 deletions(-)

diff --git a/Azaion.Inference/inference.pyx b/Azaion.Inference/inference.pyx
index d505e60..48925de 100644
--- a/Azaion.Inference/inference.pyx
+++ b/Azaion.Inference/inference.pyx
@@ -16,7 +16,7 @@ from hardware_service cimport HardwareService
 from security cimport Security
 
 if HardwareService.has_nvidia_gpu():
-    from tensorrt_engine cimport TensorRTEngine
+    from tensorrt_engine import TensorRTEngine
 else:
     from onnx_engine import OnnxEngine
 
diff --git a/Azaion.Inference/onnx_engine.pyx b/Azaion.Inference/onnx_engine.pyx
index ed8cdeb..de68d73 100644
--- a/Azaion.Inference/onnx_engine.pyx
+++ b/Azaion.Inference/onnx_engine.pyx
@@ -14,11 +14,11 @@ cdef class OnnxEngine(InferenceEngine):
         model_meta = self.session.get_modelmeta()
         print("Metadata:", model_meta.custom_metadata_map)
 
-    cdef tuple get_input_shape(self):
+    cpdef tuple get_input_shape(self):
         shape = self.input_shape
         return shape[2], shape[3]
 
-    cdef int get_batch_size(self):
+    cpdef int get_batch_size(self):
         return self.batch_size
 
     cpdef run(self, input_data):
diff --git a/Azaion.Inference/tensorrt_engine.pxd b/Azaion.Inference/tensorrt_engine.pxd
index 90d8b79..6fc31bd 100644
--- a/Azaion.Inference/tensorrt_engine.pxd
+++ b/Azaion.Inference/tensorrt_engine.pxd
@@ -16,17 +16,9 @@ cdef class TensorRTEngine(InferenceEngine):
 
     cdef object stream
 
-    @staticmethod
-    cdef get_gpu_memory_bytes(int device_id)
 
-    @staticmethod
-    cdef get_engine_filename(int device_id)
+    cpdef tuple get_input_shape(self)
 
-    @staticmethod
-    cdef convert_from_onnx(bytes onnx_model)
+    cpdef int get_batch_size(self)
 
-    cdef tuple get_input_shape(self)
-
-    cdef int get_batch_size(self)
-
-    cdef run(self, input_data)
+    cpdef run(self, input_data)
diff --git a/Azaion.Inference/tensorrt_engine.pyx b/Azaion.Inference/tensorrt_engine.pyx
index c7cf1a5..d8c4189 100644
--- a/Azaion.Inference/tensorrt_engine.pyx
+++ b/Azaion.Inference/tensorrt_engine.pyx
@@ -56,7 +56,7 @@ cdef class TensorRTEngine(InferenceEngine):
             raise RuntimeError(f"Failed to initialize TensorRT engine: {str(e)}")
 
     @staticmethod
-    cdef get_gpu_memory_bytes(int device_id):
+    def get_gpu_memory_bytes(int device_id):
         total_memory = None
         try:
             pynvml.nvmlInit()
@@ -73,7 +73,7 @@ cdef class TensorRTEngine(InferenceEngine):
         return 2 * 1024 * 1024 * 1024 if total_memory is None else total_memory # default 2 Gb
 
     @staticmethod
-    cdef get_engine_filename(int device_id):
+    def get_engine_filename(int device_id):
         try:
             device = cuda.Device(device_id)
             sm_count = device.multiprocessor_count
@@ -83,7 +83,7 @@ cdef class TensorRTEngine(InferenceEngine):
             return None
 
     @staticmethod
-    cdef convert_from_onnx(bytes onnx_model):
+    def convert_from_onnx(bytes onnx_model):
         workspace_bytes = int(TensorRTEngine.get_gpu_memory_bytes(0) * 0.9)
 
         explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
@@ -112,13 +112,13 @@ cdef class TensorRTEngine(InferenceEngine):
             constants.log('conversion done!')
             return bytes(plan)
 
-    cdef tuple get_input_shape(self):
+    cpdef tuple get_input_shape(self):
         return self.input_shape[2], self.input_shape[3]
 
-    cdef int get_batch_size(self):
+    cpdef int get_batch_size(self):
         return self.batch_size
 
-    cdef run(self, input_data):
+    cpdef run(self, input_data):
         try:
             cuda.memcpy_htod_async(self.d_input, input_data, self.stream)
             self.context.set_tensor_address(self.input_name, int(self.d_input))  # input buffer

From cf01e5d95212062e35d78982a73f0fc9b11444ea Mon Sep 17 00:00:00 2001
From: Alex Bezdieniezhnykh <zxsanny@gmail.com>
Date: Wed, 30 Apr 2025 23:32:03 +0300
Subject: [PATCH 4/5] Revert "import Tensorrt not in compile time in order to
 dynamically load tensorrt only if nvidia gpu is present"

This reverts commit 1c4bdabfb51a8462d20e98af7ed62748d8b8e0e8.
---
 Azaion.Inference/inference.pyx       |  2 +-
 Azaion.Inference/onnx_engine.pyx     |  4 ++--
 Azaion.Inference/tensorrt_engine.pxd | 14 +++++++++++---
 Azaion.Inference/tensorrt_engine.pyx | 12 ++++++------
 4 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/Azaion.Inference/inference.pyx b/Azaion.Inference/inference.pyx
index 48925de..d505e60 100644
--- a/Azaion.Inference/inference.pyx
+++ b/Azaion.Inference/inference.pyx
@@ -16,7 +16,7 @@ from hardware_service cimport HardwareService
 from security cimport Security
 
 if HardwareService.has_nvidia_gpu():
-    from tensorrt_engine import TensorRTEngine
+    from tensorrt_engine cimport TensorRTEngine
 else:
     from onnx_engine import OnnxEngine
 
diff --git a/Azaion.Inference/onnx_engine.pyx b/Azaion.Inference/onnx_engine.pyx
index de68d73..ed8cdeb 100644
--- a/Azaion.Inference/onnx_engine.pyx
+++ b/Azaion.Inference/onnx_engine.pyx
@@ -14,11 +14,11 @@ cdef class OnnxEngine(InferenceEngine):
         model_meta = self.session.get_modelmeta()
         print("Metadata:", model_meta.custom_metadata_map)
 
-    cpdef tuple get_input_shape(self):
+    cdef tuple get_input_shape(self):
         shape = self.input_shape
         return shape[2], shape[3]
 
-    cpdef int get_batch_size(self):
+    cdef int get_batch_size(self):
         return self.batch_size
 
     cpdef run(self, input_data):
diff --git a/Azaion.Inference/tensorrt_engine.pxd b/Azaion.Inference/tensorrt_engine.pxd
index 6fc31bd..90d8b79 100644
--- a/Azaion.Inference/tensorrt_engine.pxd
+++ b/Azaion.Inference/tensorrt_engine.pxd
@@ -16,9 +16,17 @@ cdef class TensorRTEngine(InferenceEngine):
 
     cdef object stream
 
+    @staticmethod
+    cdef get_gpu_memory_bytes(int device_id)
 
-    cpdef tuple get_input_shape(self)
+    @staticmethod
+    cdef get_engine_filename(int device_id)
 
-    cpdef int get_batch_size(self)
+    @staticmethod
+    cdef convert_from_onnx(bytes onnx_model)
 
-    cpdef run(self, input_data)
+    cdef tuple get_input_shape(self)
+
+    cdef int get_batch_size(self)
+
+    cdef run(self, input_data)
diff --git a/Azaion.Inference/tensorrt_engine.pyx b/Azaion.Inference/tensorrt_engine.pyx
index d8c4189..c7cf1a5 100644
--- a/Azaion.Inference/tensorrt_engine.pyx
+++ b/Azaion.Inference/tensorrt_engine.pyx
@@ -56,7 +56,7 @@ cdef class TensorRTEngine(InferenceEngine):
             raise RuntimeError(f"Failed to initialize TensorRT engine: {str(e)}")
 
     @staticmethod
-    def get_gpu_memory_bytes(int device_id):
+    cdef get_gpu_memory_bytes(int device_id):
         total_memory = None
         try:
             pynvml.nvmlInit()
@@ -73,7 +73,7 @@ cdef class TensorRTEngine(InferenceEngine):
         return 2 * 1024 * 1024 * 1024 if total_memory is None else total_memory # default 2 Gb
 
     @staticmethod
-    def get_engine_filename(int device_id):
+    cdef get_engine_filename(int device_id):
         try:
             device = cuda.Device(device_id)
             sm_count = device.multiprocessor_count
@@ -83,7 +83,7 @@ cdef class TensorRTEngine(InferenceEngine):
             return None
 
     @staticmethod
-    def convert_from_onnx(bytes onnx_model):
+    cdef convert_from_onnx(bytes onnx_model):
         workspace_bytes = int(TensorRTEngine.get_gpu_memory_bytes(0) * 0.9)
 
         explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
@@ -112,13 +112,13 @@ cdef class TensorRTEngine(InferenceEngine):
             constants.log('conversion done!')
             return bytes(plan)
 
-    cpdef tuple get_input_shape(self):
+    cdef tuple get_input_shape(self):
         return self.input_shape[2], self.input_shape[3]
 
-    cpdef int get_batch_size(self):
+    cdef int get_batch_size(self):
         return self.batch_size
 
-    cpdef run(self, input_data):
+    cdef run(self, input_data):
         try:
             cuda.memcpy_htod_async(self.d_input, input_data, self.stream)
             self.context.set_tensor_address(self.input_name, int(self.d_input))  # input buffer

From 28069f63f991e22a3b9b36184629e17e9e7303bc Mon Sep 17 00:00:00 2001
From: Alex Bezdieniezhnykh <zxsanny@gmail.com>
Date: Wed, 30 Apr 2025 23:47:46 +0300
Subject: [PATCH 5/5] Reapply "import Tensorrt not in compile time in order to
 dynamically load tensorrt only if nvidia gpu is present"

This reverts commit cf01e5d95212062e35d78982a73f0fc9b11444ea.
---
 Azaion.Inference/inference.pyx       |  2 +-
 Azaion.Inference/onnx_engine.pyx     |  4 ++--
 Azaion.Inference/tensorrt_engine.pxd | 14 +++-----------
 Azaion.Inference/tensorrt_engine.pyx | 12 ++++++------
 4 files changed, 12 insertions(+), 20 deletions(-)

diff --git a/Azaion.Inference/inference.pyx b/Azaion.Inference/inference.pyx
index d505e60..48925de 100644
--- a/Azaion.Inference/inference.pyx
+++ b/Azaion.Inference/inference.pyx
@@ -16,7 +16,7 @@ from hardware_service cimport HardwareService
 from security cimport Security
 
 if HardwareService.has_nvidia_gpu():
-    from tensorrt_engine cimport TensorRTEngine
+    from tensorrt_engine import TensorRTEngine
 else:
     from onnx_engine import OnnxEngine
 
diff --git a/Azaion.Inference/onnx_engine.pyx b/Azaion.Inference/onnx_engine.pyx
index ed8cdeb..de68d73 100644
--- a/Azaion.Inference/onnx_engine.pyx
+++ b/Azaion.Inference/onnx_engine.pyx
@@ -14,11 +14,11 @@ cdef class OnnxEngine(InferenceEngine):
         model_meta = self.session.get_modelmeta()
         print("Metadata:", model_meta.custom_metadata_map)
 
-    cdef tuple get_input_shape(self):
+    cpdef tuple get_input_shape(self):
         shape = self.input_shape
         return shape[2], shape[3]
 
-    cdef int get_batch_size(self):
+    cpdef int get_batch_size(self):
         return self.batch_size
 
     cpdef run(self, input_data):
diff --git a/Azaion.Inference/tensorrt_engine.pxd b/Azaion.Inference/tensorrt_engine.pxd
index 90d8b79..6fc31bd 100644
--- a/Azaion.Inference/tensorrt_engine.pxd
+++ b/Azaion.Inference/tensorrt_engine.pxd
@@ -16,17 +16,9 @@ cdef class TensorRTEngine(InferenceEngine):
 
     cdef object stream
 
-    @staticmethod
-    cdef get_gpu_memory_bytes(int device_id)
 
-    @staticmethod
-    cdef get_engine_filename(int device_id)
+    cpdef tuple get_input_shape(self)
 
-    @staticmethod
-    cdef convert_from_onnx(bytes onnx_model)
+    cpdef int get_batch_size(self)
 
-    cdef tuple get_input_shape(self)
-
-    cdef int get_batch_size(self)
-
-    cdef run(self, input_data)
+    cpdef run(self, input_data)
diff --git a/Azaion.Inference/tensorrt_engine.pyx b/Azaion.Inference/tensorrt_engine.pyx
index c7cf1a5..d8c4189 100644
--- a/Azaion.Inference/tensorrt_engine.pyx
+++ b/Azaion.Inference/tensorrt_engine.pyx
@@ -56,7 +56,7 @@ cdef class TensorRTEngine(InferenceEngine):
             raise RuntimeError(f"Failed to initialize TensorRT engine: {str(e)}")
 
     @staticmethod
-    cdef get_gpu_memory_bytes(int device_id):
+    def get_gpu_memory_bytes(int device_id):
         total_memory = None
         try:
             pynvml.nvmlInit()
@@ -73,7 +73,7 @@ cdef class TensorRTEngine(InferenceEngine):
         return 2 * 1024 * 1024 * 1024 if total_memory is None else total_memory # default 2 Gb
 
     @staticmethod
-    cdef get_engine_filename(int device_id):
+    def get_engine_filename(int device_id):
         try:
             device = cuda.Device(device_id)
             sm_count = device.multiprocessor_count
@@ -83,7 +83,7 @@ cdef class TensorRTEngine(InferenceEngine):
             return None
 
     @staticmethod
-    cdef convert_from_onnx(bytes onnx_model):
+    def convert_from_onnx(bytes onnx_model):
         workspace_bytes = int(TensorRTEngine.get_gpu_memory_bytes(0) * 0.9)
 
         explicit_batch_flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
@@ -112,13 +112,13 @@ cdef class TensorRTEngine(InferenceEngine):
             constants.log('conversion done!')
             return bytes(plan)
 
-    cdef tuple get_input_shape(self):
+    cpdef tuple get_input_shape(self):
         return self.input_shape[2], self.input_shape[3]
 
-    cdef int get_batch_size(self):
+    cpdef int get_batch_size(self):
         return self.batch_size
 
-    cdef run(self, input_data):
+    cpdef run(self, input_data):
         try:
             cuda.memcpy_htod_async(self.d_input, input_data, self.stream)
             self.context.set_tensor_address(self.input_name, int(self.d_input))  # input buffer