[AZ-153] [AZ-155] [AZ-156] [AZ-158] Add augmentation, dataset formation, label validation, model split tests

Made-with: Cursor
2026-06-22 08:41:11 +00:00 · 2026-03-26 23:18:17 +02:00
parent 66fe1cc918
commit 41552c5699
7 changed files with 690 additions and 0 deletions
@@ -0,0 +1,244 @@
+import shutil
+import sys
+import types
+from os import path as osp
+from pathlib import Path
+
+import pytest
+
+import constants as c_mod
+
+
+def _stub_train_dependencies():
+    """Register stand-in modules for the third-party names stubbed below.
+
+    Entries named ``ultralytics``, ``boto3``, ``netron`` and ``requests`` are
+    added to ``sys.modules`` as empty modules unless a key with that name is
+    already present.  Whichever object ends up registered, ``YOLO`` on
+    ``ultralytics`` and ``client`` on ``boto3`` are overwritten with fakes.
+    A ``_done`` flag stored on this function turns repeated calls into no-ops.
+    """
+    if getattr(_stub_train_dependencies, "_done", False):
+        return
+
+    def ensure_module(mod_name):
+        # Reuse whatever is registered under this key; otherwise add an
+        # empty module object.
+        try:
+            return sys.modules[mod_name]
+        except KeyError:
+            placeholder = types.ModuleType(mod_name)
+            sys.modules[mod_name] = placeholder
+            return placeholder
+
+    class YOLO:
+        pass
+
+    def _noop(*_a, **_k):
+        return None
+
+    def fake_client(*_a, **_k):
+        # The fake client exposes only these two methods; both do nothing.
+        return types.SimpleNamespace(upload_fileobj=_noop, download_file=_noop)
+
+    ensure_module("ultralytics").YOLO = YOLO
+    ensure_module("boto3").client = fake_client
+    for bare_name in ("netron", "requests"):
+        ensure_module(bare_name)
+    _stub_train_dependencies._done = True
+
+
+# Runs once when this test module is imported.
+_stub_train_dependencies()
+
+
+def _prepare_form_dataset(
+    monkeypatch,
+    tmp_path,
+    constants_patch,
+    fixture_images_dir,
+    fixture_labels_dir,
+    count,
+    corrupt_stems,
+):
+    """Fill the processed directories from fixtures and repoint ``train``.
+
+    In order: call ``constants_patch(tmp_path)`` (a fixture defined outside
+    this file; presumably it redirects the paths in ``constants`` -- confirm),
+    import ``train``, create the processed image/label directories named by
+    ``constants``, copy the first ``count`` fixture ``*.jpg`` files (sorted by
+    path) together with their same-stem ``.txt`` labels, and finally
+    monkeypatch ``train``'s path attributes to the values held by ``constants``.
+
+    Every copied label whose stem is in ``corrupt_stems`` is overwritten with
+    the single row ``0 1.5 0.5 0.1 0.1``.
+
+    Returns the imported ``train`` module.
+    """
+    constants_patch(tmp_path)
+    import train  # imported only after constants_patch has run
+
+    images_out = Path(c_mod.processed_images_dir)
+    labels_out = Path(c_mod.processed_labels_dir)
+    for target_dir in (images_out, labels_out):
+        target_dir.mkdir(parents=True, exist_ok=True)
+
+    selected = sorted(fixture_images_dir.glob("*.jpg"))[:count]
+    for stem in (src.stem for src in selected):
+        image_name = f"{stem}.jpg"
+        label_name = f"{stem}.txt"
+        shutil.copy2(fixture_images_dir / image_name, images_out / image_name)
+        label_copy = labels_out / label_name
+        shutil.copy2(fixture_labels_dir / label_name, label_copy)
+        if stem in corrupt_stems:
+            # NOTE(review): presumably rejected because 1.5 is outside the
+            # range the label validator accepts -- confirm against train.
+            label_copy.write_text("0 1.5 0.5 0.1 0.1\n", encoding="utf-8")
+
+    monkeypatch.setattr(
+        train, "today_dataset", osp.join(c_mod.datasets_dir, train.today_folder)
+    )
+    mirrored_attrs = (
+        "processed_images_dir",
+        "processed_labels_dir",
+        "corrupted_images_dir",
+        "corrupted_labels_dir",
+        "datasets_dir",
+    )
+    for attr in mirrored_attrs:
+        monkeypatch.setattr(train, attr, getattr(c_mod, attr))
+    return train
+
+
+def _count_jpg(p):
+    """Return how many ``*.jpg`` entries sit directly inside directory ``p``."""
+    return sum(1 for _ in Path(p).glob("*.jpg"))
+
+
+def test_bt_dsf_01_split_ratio_70_20_10(
+    monkeypatch, tmp_path, constants_patch, fixture_images_dir, fixture_labels_dir
+):
+    """With 100 prepared images, expect 70 / 20 / 10 in train / valid / test."""
+    train = _prepare_form_dataset(
+        monkeypatch,
+        tmp_path,
+        constants_patch,
+        fixture_images_dir,
+        fixture_labels_dir,
+        count=100,
+        corrupt_stems=set(),
+    )
+    train.form_dataset()
+    root = Path(train.today_dataset)
+    expected_counts = (("train", 70), ("valid", 20), ("test", 10))
+    for split, expected in expected_counts:
+        assert _count_jpg(root / split / "images") == expected
+
+
+def test_bt_dsf_02_six_subdirectories(
+    monkeypatch, tmp_path, constants_patch, fixture_images_dir, fixture_labels_dir
+):
+    """Each of train / valid / test must contain ``images`` and ``labels`` dirs."""
+    train = _prepare_form_dataset(
+        monkeypatch,
+        tmp_path,
+        constants_patch,
+        fixture_images_dir,
+        fixture_labels_dir,
+        count=100,
+        corrupt_stems=set(),
+    )
+    train.form_dataset()
+    root = Path(train.today_dataset)
+    for split in ("train", "valid", "test"):
+        for kind in ("images", "labels"):
+            assert (root / split / kind).is_dir()
+
+
+def test_bt_dsf_03_total_files_one_hundred(
+    monkeypatch, tmp_path, constants_patch, fixture_images_dir, fixture_labels_dir
+):
+    """The three splits together must hold all 100 prepared images."""
+    train = _prepare_form_dataset(
+        monkeypatch,
+        tmp_path,
+        constants_patch,
+        fixture_images_dir,
+        fixture_labels_dir,
+        count=100,
+        corrupt_stems=set(),
+    )
+    train.form_dataset()
+    root = Path(train.today_dataset)
+    total = sum(
+        _count_jpg(root / split / "images") for split in ("train", "valid", "test")
+    )
+    assert total == 100
+
+
+def test_bt_dsf_04_corrupted_labels_quarantined(
+    monkeypatch, tmp_path, constants_patch, fixture_images_dir, fixture_labels_dir
+):
+    """Five overwritten labels: expect 95 images in the splits, 5 pairs set aside.
+
+    The five stems are the first five of the same sorted fixture listing the
+    helper copies from, so all of them are among the 100 prepared samples.
+    """
+    first_hundred = sorted(fixture_images_dir.glob("*.jpg"))[:100]
+    corrupt = {img.stem for img in first_hundred[:5]}
+    train = _prepare_form_dataset(
+        monkeypatch,
+        tmp_path,
+        constants_patch,
+        fixture_images_dir,
+        fixture_labels_dir,
+        count=100,
+        corrupt_stems=corrupt,
+    )
+    train.form_dataset()
+    root = Path(train.today_dataset)
+    split_total = sum(
+        _count_jpg(root / split / "images") for split in ("train", "valid", "test")
+    )
+    assert split_total == 95
+    assert _count_jpg(c_mod.corrupted_images_dir) == 5
+    quarantined_labels = sum(
+        1 for _ in Path(c_mod.corrupted_labels_dir).glob("*.txt")
+    )
+    assert quarantined_labels == 5
+
+
+@pytest.mark.resilience
+def test_rt_dsf_01_empty_processed_no_crash(
+    monkeypatch, tmp_path, constants_patch, fixture_images_dir, fixture_labels_dir
+):
+    """With nothing copied into the processed dirs, the dataset dir must still exist."""
+    train = _prepare_form_dataset(
+        monkeypatch,
+        tmp_path,
+        constants_patch,
+        fixture_images_dir,
+        fixture_labels_dir,
+        count=0,
+        corrupt_stems=set(),
+    )
+    train.form_dataset()
+    assert Path(train.today_dataset).is_dir()
+
+
+@pytest.mark.resource_limit
+def test_rl_dsf_01_split_ratios_sum_hundred():
+    """``train_set``, ``valid_set`` and ``test_set`` in ``train`` must add up to 100."""
+    import train
+
+    train_share = train.train_set
+    valid_share = train.valid_set
+    test_share = train.test_set
+    assert train_share + valid_share + test_share == 100
+
+
+@pytest.mark.resource_limit
+def test_rl_dsf_02_no_filename_duplication_across_splits(
+    monkeypatch, tmp_path, constants_patch, fixture_images_dir, fixture_labels_dir
+):
+    """No image file name may appear in more than one split; 100 names in total."""
+    train = _prepare_form_dataset(
+        monkeypatch,
+        tmp_path,
+        constants_patch,
+        fixture_images_dir,
+        fixture_labels_dir,
+        count=100,
+        corrupt_stems=set(),
+    )
+    train.form_dataset()
+    root = Path(train.today_dataset)
+    names = [
+        img.name
+        for split in ("train", "valid", "test")
+        for img in (root / split / "images").glob("*.jpg")
+    ]
+    assert len(names) == len(set(names))
+    assert len(names) == 100