Files
ai-training/tests/test_dataset_formation.py
T
Oleksandr Bezdieniezhnykh 18b88ba9bf Refactor configuration and update test structure for improved clarity
- Updated `.gitignore` to remove committed test fixture data exclusions.
- Increased batch size in `config.test.yaml` from 4 to 128 for training.
- Simplified directory structure in `config.yaml` by removing unnecessary data paths.
- Adjusted paths in `augmentation.py`, `dataset-visualiser.py`, and `exports.py` to align with the new configuration structure.
- Enhanced `annotation_queue_handler.py` to utilize the updated configuration for directory management.
- Added CSV logging of test results in `conftest.py` for better test reporting.

These changes streamline the configuration management and enhance the testing framework, ensuring better organization and clarity in the project.
2026-03-28 07:32:40 +02:00

218 lines
5.1 KiB
Python

import shutil
from os import path as osp
from pathlib import Path
import pytest
import constants as c_mod
def _prepare_form_dataset(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
    count,
    corrupt_stems,
):
    """Stage *count* fixture image/label pairs into the configured data dirs.

    Labels whose stem is listed in *corrupt_stems* are overwritten with a
    YOLO row whose x-center (1.5) falls outside [0, 1], so the pipeline
    should treat them as corrupted and quarantine them.

    Returns the freshly imported ``train`` module and the path of today's
    dataset folder (not yet created — ``train.form_dataset()`` builds it).
    """
    constants_patch(tmp_path)
    # Import after patching so train picks up the patched configuration.
    import train

    images_root = Path(c_mod.config.images_dir)
    labels_root = Path(c_mod.config.labels_dir)
    for directory in (images_root, labels_root):
        directory.mkdir(parents=True, exist_ok=True)

    selected = sorted(fixture_images_dir.glob("*.jpg"))[:count]
    for img in selected:
        name = img.stem
        shutil.copy2(img, images_root / img.name)
        label_dst = labels_root / f"{name}.txt"
        shutil.copy2(fixture_labels_dir / f"{name}.txt", label_dst)
        if name in corrupt_stems:
            label_dst.write_text("0 1.5 0.5 0.1 0.1\n", encoding="utf-8")

    today_ds = osp.join(c_mod.config.datasets_dir, train.today_folder)
    return train, today_ds
def _count_jpg(p):
return len(list(Path(p).glob("*.jpg")))
def test_bt_dsf_01_split_ratio_70_20_10(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
):
    """20 clean samples must split 70/20/10 into 14/4/2 images."""
    # Arrange
    train, today_ds = _prepare_form_dataset(
        monkeypatch,
        tmp_path,
        constants_patch,
        fixture_images_dir,
        fixture_labels_dir,
        20,
        set(),
    )
    # Act
    train.form_dataset()
    # Assert
    expected = {"train": 14, "valid": 4, "test": 2}
    for split, want in expected.items():
        assert _count_jpg(Path(today_ds, split, "images")) == want
def test_bt_dsf_02_six_subdirectories(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
):
    """form_dataset must create images/ and labels/ under every split."""
    # Arrange
    train, today_ds = _prepare_form_dataset(
        monkeypatch,
        tmp_path,
        constants_patch,
        fixture_images_dir,
        fixture_labels_dir,
        20,
        set(),
    )
    # Act
    train.form_dataset()
    # Assert
    base = Path(today_ds)
    for split in ("train", "valid", "test"):
        for kind in ("images", "labels"):
            assert (base / split / kind).is_dir()
def test_bt_dsf_03_total_files_twenty(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
):
    """No image may be lost: the three splits together hold all 20 files."""
    # Arrange
    train, today_ds = _prepare_form_dataset(
        monkeypatch,
        tmp_path,
        constants_patch,
        fixture_images_dir,
        fixture_labels_dir,
        20,
        set(),
    )
    # Act
    train.form_dataset()
    # Assert
    total = sum(
        _count_jpg(Path(today_ds, split, "images"))
        for split in ("train", "valid", "test")
    )
    assert total == 20
def test_bt_dsf_04_corrupted_labels_quarantined(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
):
    """Samples with invalid labels go to quarantine, the rest get split."""
    # Arrange: corrupt the first 5 of 20 label files.
    stems = [p.stem for p in sorted(fixture_images_dir.glob("*.jpg"))[:20]]
    corrupt = set(stems[:5])
    train, today_ds = _prepare_form_dataset(
        monkeypatch,
        tmp_path,
        constants_patch,
        fixture_images_dir,
        fixture_labels_dir,
        20,
        corrupt,
    )
    # Act
    train.form_dataset()
    # Assert: 15 clean samples split, 5 pairs quarantined.
    kept = sum(
        _count_jpg(Path(today_ds, split, "images"))
        for split in ("train", "valid", "test")
    )
    assert kept == 15
    assert _count_jpg(c_mod.config.corrupted_images_dir) == 5
    quarantined = list(Path(c_mod.config.corrupted_labels_dir).glob("*.txt"))
    assert len(quarantined) == 5
@pytest.mark.resilience
def test_rt_dsf_01_empty_data_no_crash(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
):
    """With zero input samples, form_dataset must not raise and must still
    create today's dataset directory."""
    # Arrange
    train, today_ds = _prepare_form_dataset(
        monkeypatch,
        tmp_path,
        constants_patch,
        fixture_images_dir,
        fixture_labels_dir,
        0,
        set(),
    )
    # Act — any exception here fails the test.
    train.form_dataset()
    # Assert
    assert Path(today_ds).is_dir()
@pytest.mark.resource_limit
def test_rl_dsf_01_split_ratios_sum_hundred():
    """The configured train/valid/test percentages must add up to 100."""
    import train

    ratios = (train.train_set, train.valid_set, train.test_set)
    assert sum(ratios) == 100
@pytest.mark.resource_limit
def test_rl_dsf_02_no_filename_duplication_across_splits(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
):
    """Each image lands in exactly one split — no name appears twice."""
    # Arrange
    train, today_ds = _prepare_form_dataset(
        monkeypatch,
        tmp_path,
        constants_patch,
        fixture_images_dir,
        fixture_labels_dir,
        20,
        set(),
    )
    # Act
    train.form_dataset()
    # Assert
    base = Path(today_ds)
    names = [
        jpg.name
        for split in ("train", "valid", "test")
        for jpg in (base / split / "images").glob("*.jpg")
    ]
    assert len(names) == len(set(names))
    assert len(names) == 20