Files
ai-training/tests/test_dataset_formation.py
T
Oleksandr Bezdieniezhnykh a47fa135de Update configuration and test structure for improved clarity and functionality
- Modified `.gitignore` to include test fixture data while excluding test results.
- Updated `config.yaml` to change the model from 'yolo11m.yaml' to 'yolo26m.pt'.
- Enhanced `.cursor/rules/coderule.mdc` with additional guidelines for test environment consistency and infrastructure handling.
- Revised autopilot state management in `_docs/_autopilot_state.md` to reflect current progress and tasks.
- Removed outdated augmentation tests and adjusted dataset formation tests to align with the new structure.

These changes streamline the configuration and testing processes, ensuring better organization and clarity in the project.
2026-03-28 06:11:55 +02:00

218 lines
5.1 KiB
Python

import shutil
from os import path as osp
from pathlib import Path
import pytest
import constants as c_mod
def _prepare_form_dataset(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
    count,
    corrupt_stems,
):
    """Stage fixture images/labels into the processed dirs used by ``train``.

    Applies ``constants_patch`` to ``tmp_path``, imports ``train``, creates the
    configured processed image/label directories, and copies the first
    ``count`` fixture ``*.jpg`` files (sorted order) together with their
    same-stem ``.txt`` labels. For every stem listed in ``corrupt_stems`` the
    copied label is then overwritten with a fixed single-line payload.

    ``monkeypatch`` is accepted but not used by this helper.

    Returns:
        A ``(train, today_ds)`` tuple: the imported ``train`` module and the
        path string ``<datasets_dir>/<train.today_folder>``.
    """
    # Patch first, import afterwards: same ordering as the callers rely on.
    constants_patch(tmp_path)
    import train

    processed_images = Path(c_mod.config.processed_images_dir)
    processed_labels = Path(c_mod.config.processed_labels_dir)
    for directory in (processed_images, processed_labels):
        directory.mkdir(parents=True, exist_ok=True)

    corrupt_payload = "0 1.5 0.5 0.1 0.1\n"
    selected = sorted(fixture_images_dir.glob("*.jpg"))[:count]
    for stem in (image.stem for image in selected):
        image_name = f"{stem}.jpg"
        label_name = f"{stem}.txt"
        shutil.copy2(fixture_images_dir / image_name, processed_images / image_name)
        label_copy = processed_labels / label_name
        shutil.copy2(fixture_labels_dir / label_name, label_copy)
        # The label is copied first and only then replaced for corrupt stems.
        if stem in corrupt_stems:
            label_copy.write_text(corrupt_payload, encoding="utf-8")

    dataset_path = osp.join(c_mod.config.datasets_dir, train.today_folder)
    return train, dataset_path
def _count_jpg(p):
    """Return the number of ``*.jpg`` entries directly inside directory ``p``."""
    directory = Path(p)
    return sum(1 for _ in directory.glob("*.jpg"))
def test_bt_dsf_01_split_ratio_70_20_10(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
):
    """Twenty staged samples must land as 14 train, 4 valid and 2 test images."""
    # Arrange
    train, today_ds = _prepare_form_dataset(
        monkeypatch,
        tmp_path,
        constants_patch,
        fixture_images_dir,
        fixture_labels_dir,
        count=20,
        corrupt_stems=set(),
    )
    dataset_root = Path(today_ds)
    # Act
    train.form_dataset()
    # Assert
    for split, expected in (("train", 14), ("valid", 4), ("test", 2)):
        assert _count_jpg(dataset_root / split / "images") == expected
def test_bt_dsf_02_six_subdirectories(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
):
    """Each of train/valid/test must contain an ``images`` and a ``labels`` directory."""
    # Arrange
    train, today_ds = _prepare_form_dataset(
        monkeypatch,
        tmp_path,
        constants_patch,
        fixture_images_dir,
        fixture_labels_dir,
        count=20,
        corrupt_stems=set(),
    )
    # Act
    train.form_dataset()
    # Assert
    dataset_root = Path(today_ds)
    for split in ("train", "valid", "test"):
        for kind in ("images", "labels"):
            assert (dataset_root / split / kind).is_dir()
def test_bt_dsf_03_total_files_twenty(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
):
    """The three splits together must hold all twenty staged images."""
    # Arrange
    train, today_ds = _prepare_form_dataset(
        monkeypatch,
        tmp_path,
        constants_patch,
        fixture_images_dir,
        fixture_labels_dir,
        count=20,
        corrupt_stems=set(),
    )
    # Act
    train.form_dataset()
    # Assert
    total_images = sum(
        _count_jpg(Path(today_ds, split, "images"))
        for split in ("train", "valid", "test")
    )
    assert total_images == 20
def test_bt_dsf_04_corrupted_labels_quarantined(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
):
    """Five of twenty samples are staged as corrupt.

    Expects 15 images across the splits, plus 5 images and 5 labels in the
    configured corrupted directories.
    """
    # Arrange
    first_twenty = sorted(fixture_images_dir.glob("*.jpg"))[:20]
    corrupt = {image.stem for image in first_twenty[:5]}
    train, today_ds = _prepare_form_dataset(
        monkeypatch,
        tmp_path,
        constants_patch,
        fixture_images_dir,
        fixture_labels_dir,
        count=20,
        corrupt_stems=corrupt,
    )
    # Act
    train.form_dataset()
    # Assert
    split_total = sum(
        _count_jpg(Path(today_ds, split, "images"))
        for split in ("train", "valid", "test")
    )
    assert split_total == 15
    assert _count_jpg(c_mod.config.corrupted_images_dir) == 5
    quarantined_labels = list(Path(c_mod.config.corrupted_labels_dir).glob("*.txt"))
    assert len(quarantined_labels) == 5
@pytest.mark.resilience
def test_rt_dsf_01_empty_processed_no_crash(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
):
    """With zero staged samples, ``form_dataset`` must still create today's dataset dir."""
    # Arrange
    train, today_ds = _prepare_form_dataset(
        monkeypatch,
        tmp_path,
        constants_patch,
        fixture_images_dir,
        fixture_labels_dir,
        count=0,
        corrupt_stems=set(),
    )
    # Act
    train.form_dataset()
    # Assert
    dataset_root = Path(today_ds)
    assert dataset_root.is_dir()
@pytest.mark.resource_limit
def test_rl_dsf_01_split_ratios_sum_hundred():
    """The train, valid and test split values declared in ``train`` must add up to 100."""
    import train

    # Assert
    ratio_total = train.train_set + train.valid_set + train.test_set
    assert ratio_total == 100
@pytest.mark.resource_limit
def test_rl_dsf_02_no_filename_duplication_across_splits(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
):
    """No image file name may appear in more than one split; all twenty must be present."""
    # Arrange
    train, today_ds = _prepare_form_dataset(
        monkeypatch,
        tmp_path,
        constants_patch,
        fixture_images_dir,
        fixture_labels_dir,
        count=20,
        corrupt_stems=set(),
    )
    # Act
    train.form_dataset()
    # Assert
    dataset_root = Path(today_ds)
    image_names = [
        image.name
        for split in ("train", "valid", "test")
        for image in (dataset_root / split / "images").glob("*.jpg")
    ]
    # Equal lengths mean the set dropped nothing, i.e. every name is unique.
    assert len(image_names) == len(set(image_names))
    assert len(image_names) == 20