mirror of
https://github.com/azaion/ai-training.git
synced 2026-04-22 11:16:35 +00:00
[AZ-153] [AZ-155] [AZ-156] [AZ-158] Add augmentation, dataset formation, label validation, model split tests
Made-with: Cursor
This commit is contained in:
@@ -0,0 +1,244 @@
|
||||
import shutil
|
||||
import sys
|
||||
import types
|
||||
from os import path as osp
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
import constants as c_mod
|
||||
|
||||
|
||||
def _stub_train_dependencies():
    """Register stand-in modules for third-party packages in ``sys.modules``.

    Stubs are provided for ``ultralytics`` (with an empty ``YOLO`` class),
    ``boto3`` (with a ``client`` factory returning no-op upload/download
    callables), ``netron`` and ``requests``.  A module that is already
    present in ``sys.modules`` is reused rather than replaced, although the
    ``YOLO`` and ``client`` attributes are still overwritten on it.

    The function is idempotent: a ``_done`` flag stored on the function
    object makes every call after the first a no-op.
    """
    if getattr(_stub_train_dependencies, "_done", False):
        return

    def _ensure_module(mod_name):
        # Reuse whatever is already registered; otherwise register an empty module.
        try:
            return sys.modules[mod_name]
        except KeyError:
            placeholder = types.ModuleType(mod_name)
            sys.modules[mod_name] = placeholder
            return placeholder

    class YOLO:
        # Empty placeholder exposed as ``ultralytics.YOLO``.
        pass

    def _noop(*_args, **_kwargs):
        return None

    def fake_client(*_a, **_k):
        # Accepts any arguments and returns an object whose transfer methods do nothing.
        return types.SimpleNamespace(upload_fileobj=_noop, download_file=_noop)

    _ensure_module("ultralytics").YOLO = YOLO
    _ensure_module("boto3").client = fake_client

    for bare_name in ("netron", "requests"):
        _ensure_module(bare_name)

    _stub_train_dependencies._done = True
|
||||
|
||||
|
||||
# Install the stub modules at import time, ahead of the function-local
# ``import train`` statements further down (presumably ``train`` imports
# these packages at module level -- confirm against train.py).
_stub_train_dependencies()
|
||||
|
||||
|
||||
def _prepare_form_dataset(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
    count,
    corrupt_stems,
):
    """Stage fixture data and point the ``train`` module at it.

    Steps, in order:

    1. Call ``constants_patch(tmp_path)`` and only then import ``train``.
    2. Create the processed image/label directories named by ``c_mod``.
    3. Copy the first ``count`` ``*.jpg`` fixtures (sorted by path) and their
       same-stem ``.txt`` labels into those directories.  For every stem in
       ``corrupt_stems`` the copied label is overwritten with a single line
       whose second field is ``1.5`` (presumably an out-of-range normalised
       coordinate -- confirm against the label validator).
    4. Monkeypatch the directory attributes on ``train`` so they mirror the
       current values on ``c_mod``, and set ``train.today_dataset`` to
       ``<datasets_dir>/<train.today_folder>``.

    Returns the imported ``train`` module.
    """
    constants_patch(tmp_path)
    import train

    images_out = Path(c_mod.processed_images_dir)
    labels_out = Path(c_mod.processed_labels_dir)
    for target_dir in (images_out, labels_out):
        target_dir.mkdir(parents=True, exist_ok=True)

    selected = sorted(fixture_images_dir.glob("*.jpg"))[:count]
    for image_path in selected:
        stem = image_path.stem
        image_name = f"{stem}.jpg"
        label_name = f"{stem}.txt"

        shutil.copy2(fixture_images_dir / image_name, images_out / image_name)

        label_copy = labels_out / label_name
        shutil.copy2(fixture_labels_dir / label_name, label_copy)
        if stem in corrupt_stems:
            # Replace the valid label with a deliberately bad one.
            label_copy.write_text("0 1.5 0.5 0.1 0.1\n", encoding="utf-8")

    monkeypatch.setattr(
        train,
        "today_dataset",
        osp.join(c_mod.datasets_dir, train.today_folder),
    )
    mirrored_attrs = (
        "processed_images_dir",
        "processed_labels_dir",
        "corrupted_images_dir",
        "corrupted_labels_dir",
        "datasets_dir",
    )
    for attr_name in mirrored_attrs:
        monkeypatch.setattr(train, attr_name, getattr(c_mod, attr_name))

    return train
|
||||
|
||||
|
||||
def _count_jpg(p):
    """Return how many entries directly inside directory ``p`` match ``*.jpg``."""
    directory = Path(p)
    return sum(1 for _ in directory.glob("*.jpg"))
|
||||
|
||||
|
||||
def test_bt_dsf_01_split_ratio_70_20_10(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
):
    """With 100 clean samples, the images split 70 / 20 / 10 across train / valid / test."""
    train = _prepare_form_dataset(
        monkeypatch=monkeypatch,
        tmp_path=tmp_path,
        constants_patch=constants_patch,
        fixture_images_dir=fixture_images_dir,
        fixture_labels_dir=fixture_labels_dir,
        count=100,
        corrupt_stems=set(),
    )

    train.form_dataset()

    dataset_root = Path(train.today_dataset)
    expected = {"train": 70, "valid": 20, "test": 10}
    actual = {
        split: _count_jpg(dataset_root / split / "images") for split in expected
    }
    assert actual == expected
|
||||
|
||||
|
||||
def test_bt_dsf_02_six_subdirectories(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
):
    """``form_dataset`` creates ``images`` and ``labels`` folders under each of the three splits."""
    train = _prepare_form_dataset(
        monkeypatch=monkeypatch,
        tmp_path=tmp_path,
        constants_patch=constants_patch,
        fixture_images_dir=fixture_images_dir,
        fixture_labels_dir=fixture_labels_dir,
        count=100,
        corrupt_stems=set(),
    )

    train.form_dataset()

    dataset_root = Path(train.today_dataset)
    for split in ("train", "valid", "test"):
        for kind in ("images", "labels"):
            assert (dataset_root / split / kind).is_dir()
|
||||
|
||||
|
||||
def test_bt_dsf_03_total_files_one_hundred(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
):
    """All 100 staged images end up in exactly one of the three split folders."""
    train = _prepare_form_dataset(
        monkeypatch=monkeypatch,
        tmp_path=tmp_path,
        constants_patch=constants_patch,
        fixture_images_dir=fixture_images_dir,
        fixture_labels_dir=fixture_labels_dir,
        count=100,
        corrupt_stems=set(),
    )

    train.form_dataset()

    dataset_root = Path(train.today_dataset)
    total_images = sum(
        _count_jpg(dataset_root / split / "images")
        for split in ("train", "valid", "test")
    )
    assert total_images == 100
|
||||
|
||||
|
||||
def test_bt_dsf_04_corrupted_labels_quarantined(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
):
    """Five samples with bad labels are moved aside; the remaining 95 are split."""
    first_hundred = sorted(fixture_images_dir.glob("*.jpg"))[:100]
    # The first five stems (in sorted path order) receive a corrupted label.
    corrupt = {image.stem for image in first_hundred[:5]}

    train = _prepare_form_dataset(
        monkeypatch=monkeypatch,
        tmp_path=tmp_path,
        constants_patch=constants_patch,
        fixture_images_dir=fixture_images_dir,
        fixture_labels_dir=fixture_labels_dir,
        count=100,
        corrupt_stems=corrupt,
    )

    train.form_dataset()

    dataset_root = Path(train.today_dataset)
    split_total = sum(
        _count_jpg(dataset_root / split / "images")
        for split in ("train", "valid", "test")
    )
    assert split_total == 95

    assert _count_jpg(c_mod.corrupted_images_dir) == 5

    quarantined_labels = sum(
        1 for _ in Path(c_mod.corrupted_labels_dir).glob("*.txt")
    )
    assert quarantined_labels == 5
|
||||
|
||||
|
||||
@pytest.mark.resilience
def test_rt_dsf_01_empty_processed_no_crash(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
):
    """``form_dataset`` copes with empty processed folders and still creates the dataset directory."""
    train = _prepare_form_dataset(
        monkeypatch=monkeypatch,
        tmp_path=tmp_path,
        constants_patch=constants_patch,
        fixture_images_dir=fixture_images_dir,
        fixture_labels_dir=fixture_labels_dir,
        count=0,  # stage nothing: processed directories exist but are empty
        corrupt_stems=set(),
    )

    train.form_dataset()

    dataset_root = Path(train.today_dataset)
    assert dataset_root.is_dir()
|
||||
|
||||
|
||||
@pytest.mark.resource_limit
def test_rl_dsf_01_split_ratios_sum_hundred():
    """The three split sizes configured on ``train`` add up to 100."""
    import train as train_module

    combined = (
        train_module.train_set + train_module.valid_set + train_module.test_set
    )
    assert combined == 100
|
||||
|
||||
|
||||
@pytest.mark.resource_limit
def test_rl_dsf_02_no_filename_duplication_across_splits(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
):
    """No image file name appears in more than one split, and all 100 are present."""
    train = _prepare_form_dataset(
        monkeypatch=monkeypatch,
        tmp_path=tmp_path,
        constants_patch=constants_patch,
        fixture_images_dir=fixture_images_dir,
        fixture_labels_dir=fixture_labels_dir,
        count=100,
        corrupt_stems=set(),
    )

    train.form_dataset()

    dataset_root = Path(train.today_dataset)
    image_names = [
        image.name
        for split in ("train", "valid", "test")
        for image in (dataset_root / split / "images").glob("*.jpg")
    ]

    # A set collapses repeats, so equal lengths mean every name is unique.
    assert len(image_names) == len(set(image_names))
    assert len(image_names) == 100
|
||||
Reference in New Issue
Block a user