Remove unnecessary intermediate epoch checkpoints

2026-06-21 06:01:12 +00:00 · 2025-05-31 19:02:57 +03:00
parent 80c2433141
commit 44c9e87bd4
2 changed files with 19 additions and 12 deletions
@@ -1,8 +1,10 @@
+import glob
+import os
 import shutil
 from os import path

+import constants
 import train
-from constants import models_dir, prefix
 from augmentation import Augmentator

 # Augmentator().augment_annotations()
@@ -10,11 +12,12 @@ from augmentation import Augmentator
 # train.resume_training('/azaion/dev/ai-training/runs/detect/train12/weights/last.pt')

 result_dir = '/azaion/dev/ai-training/runs/detect/train12'
-model_dir = path.join(models_dir, f'{prefix}2025-05-18')
-shutil.copytree(result_dir, model_dir, dirs_exist_ok=True)
+model_dir = path.join(constants.models_dir, f'{constants.prefix}2025-05-18')

-model_path = path.join(models_dir, f'{prefix[:-1]}.pt')
-shutil.copy(path.join(model_dir, 'weights', 'best.pt'), model_path)
+shutil.copytree(result_dir, model_dir, dirs_exist_ok=True)
+for file in glob.glob(path.join(model_dir, 'weights', 'epoch*')):
+    os.remove(file)
+shutil.copy(path.join(model_dir, 'weights', 'best.pt'), constants.CURRENT_PT_MODEL)

 train.export_current_model()
 print('success!')
@@ -1,4 +1,5 @@
 import concurrent.futures
+import glob
 import os
 import random
 import shutil
@@ -146,18 +147,21 @@ def resume_training(last_pt_path):
 def train_dataset():
    form_dataset()
    create_yaml()
-    model_name = 'yolo11m.yaml'
-    model = YOLO(model_name)
+    model = YOLO('yolo11m.yaml')

    results = model.train(data=abspath(path.join(today_dataset, 'data.yaml')),
-                           epochs=120,
-                           batch=11,
-                           imgsz=1280,
-                           save_period=1,
-                           workers=24)
+                           epochs=120,    # Empirically chosen: good performance without excessively long training
+                                          # (360k annotations on one RTX 4090 take 11.5 days to train :( )
+                           batch=11,      # limited by current GPU memory, 24 GB (batch 11 uses ~22 GB, batch 12 fails at 24.2 GB)
+                           imgsz=1280,    # 1280p is a tradeoff between quality and speed
+                           save_period=1, # for resuming in case of power outages / other issues
+                           workers=24)    # data-loading workers; bound to the CPU count

    model_dir = path.join(models_dir, today_folder)
+
    shutil.copytree(results.save_dir, model_dir)
+    for file in glob.glob(path.join(model_dir, 'weights', 'epoch*')): # remove intermediate per-epoch checkpoints
+        os.remove(file)
    shutil.copy(path.join(model_dir, 'weights', 'best.pt'), constants.CURRENT_PT_MODEL)