"""Tests for ``train.form_dataset``.

Each test stages a slice of the fixture images/labels into the processed
directories exposed by ``constants.config`` (after the ``constants_patch``
fixture has been applied to ``tmp_path``), runs ``train.form_dataset()``
and then inspects the dated dataset folder it is expected to populate.
"""

import shutil
from os import path as osp
from pathlib import Path

import pytest

import constants as c_mod

# Split folder names inspected under the dated dataset directory.
_SPLITS = ("train", "valid", "test")


def _prepare_form_dataset(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
    count,
    corrupt_stems,
):
    """Stage ``count`` fixture image/label pairs into the processed dirs.

    Args:
        monkeypatch: accepted for signature parity with the tests; not used
            by this helper itself.
        tmp_path: directory handed to ``constants_patch``.
        constants_patch: callable fixture invoked with ``tmp_path`` before
            ``train`` is imported.
        fixture_images_dir: directory holding the source ``*.jpg`` files.
        fixture_labels_dir: directory holding the matching ``*.txt`` labels.
        count: how many images (in sorted order) to stage.
        corrupt_stems: stems whose staged label is overwritten with a fixed
            label line (x value 1.5 -- presumably treated as invalid by
            ``form_dataset``; confirm against ``train``).

    Returns:
        A ``(train_module, today_dataset_path)`` tuple, the path being a
        string built from ``config.datasets_dir`` and ``train.today_folder``.
    """
    # The patch must be applied before ``train`` is imported.
    constants_patch(tmp_path)
    import train

    images_out = Path(c_mod.config.processed_images_dir)
    labels_out = Path(c_mod.config.processed_labels_dir)
    for directory in (images_out, labels_out):
        directory.mkdir(parents=True, exist_ok=True)

    selected = sorted(fixture_images_dir.glob("*.jpg"))[:count]
    for stem in (image.stem for image in selected):
        image_name = f"{stem}.jpg"
        label_name = f"{stem}.txt"
        shutil.copy2(fixture_images_dir / image_name, images_out / image_name)
        staged_label = labels_out / label_name
        shutil.copy2(fixture_labels_dir / label_name, staged_label)
        if stem in corrupt_stems:
            staged_label.write_text("0 1.5 0.5 0.1 0.1\n", encoding="utf-8")

    today_ds = osp.join(c_mod.config.datasets_dir, train.today_folder)
    return train, today_ds


def _count_jpg(p):
    """Return the number of ``*.jpg`` entries directly inside ``p``."""
    return sum(1 for _ in Path(p).glob("*.jpg"))


def _split_images_dir(today_ds, split):
    """Return the ``images`` directory of ``split`` under ``today_ds``."""
    return Path(today_ds) / split / "images"


def _total_split_images(today_ds):
    """Return the number of ``*.jpg`` files summed over all three splits."""
    return sum(_count_jpg(_split_images_dir(today_ds, s)) for s in _SPLITS)


def test_bt_dsf_01_split_ratio_70_20_10(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
):
    """Twenty staged images end up as 14 train / 4 valid / 2 test."""
    # Arrange
    train_mod, today_ds = _prepare_form_dataset(
        monkeypatch,
        tmp_path,
        constants_patch,
        fixture_images_dir,
        fixture_labels_dir,
        20,
        set(),
    )

    # Act
    train_mod.form_dataset()

    # Assert
    assert _count_jpg(_split_images_dir(today_ds, "train")) == 14
    assert _count_jpg(_split_images_dir(today_ds, "valid")) == 4
    assert _count_jpg(_split_images_dir(today_ds, "test")) == 2


def test_bt_dsf_02_six_subdirectories(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
):
    """Every split has both an ``images`` and a ``labels`` directory."""
    # Arrange
    train_mod, today_ds = _prepare_form_dataset(
        monkeypatch,
        tmp_path,
        constants_patch,
        fixture_images_dir,
        fixture_labels_dir,
        20,
        set(),
    )

    # Act
    train_mod.form_dataset()

    # Assert
    root = Path(today_ds)
    for split in _SPLITS:
        for kind in ("images", "labels"):
            assert (root / split / kind).is_dir()


def test_bt_dsf_03_total_files_twenty(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
):
    """The three splits together contain all twenty staged images."""
    # Arrange
    train_mod, today_ds = _prepare_form_dataset(
        monkeypatch,
        tmp_path,
        constants_patch,
        fixture_images_dir,
        fixture_labels_dir,
        20,
        set(),
    )

    # Act
    train_mod.form_dataset()

    # Assert
    total = _total_split_images(today_ds)
    assert total == 20


def test_bt_dsf_04_corrupted_labels_quarantined(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
):
    """Five overwritten labels leave 15 split images and 5 quarantined pairs."""
    # Arrange
    first_twenty = sorted(fixture_images_dir.glob("*.jpg"))[:20]
    corrupt = {image.stem for image in first_twenty[:5]}
    train_mod, today_ds = _prepare_form_dataset(
        monkeypatch,
        tmp_path,
        constants_patch,
        fixture_images_dir,
        fixture_labels_dir,
        20,
        corrupt,
    )

    # Act
    train_mod.form_dataset()

    # Assert
    split_total = _total_split_images(today_ds)
    assert split_total == 15
    assert _count_jpg(c_mod.config.corrupted_images_dir) == 5
    quarantined_labels = Path(c_mod.config.corrupted_labels_dir).glob("*.txt")
    assert len(list(quarantined_labels)) == 5


@pytest.mark.resilience
def test_rt_dsf_01_empty_processed_no_crash(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
):
    """With nothing staged, ``form_dataset`` still creates the dated folder."""
    # Arrange
    train_mod, today_ds = _prepare_form_dataset(
        monkeypatch,
        tmp_path,
        constants_patch,
        fixture_images_dir,
        fixture_labels_dir,
        0,
        set(),
    )

    # Act
    train_mod.form_dataset()

    # Assert
    assert Path(today_ds).is_dir()


@pytest.mark.resource_limit
def test_rl_dsf_01_split_ratios_sum_hundred():
    """The three split constants exposed by ``train`` add up to 100."""
    # Assert
    import train

    assert train.train_set + train.valid_set + train.test_set == 100


@pytest.mark.resource_limit
def test_rl_dsf_02_no_filename_duplication_across_splits(
    monkeypatch,
    tmp_path,
    constants_patch,
    fixture_images_dir,
    fixture_labels_dir,
):
    """No image file name appears in more than one split."""
    # Arrange
    train_mod, today_ds = _prepare_form_dataset(
        monkeypatch,
        tmp_path,
        constants_patch,
        fixture_images_dir,
        fixture_labels_dir,
        20,
        set(),
    )

    # Act
    train_mod.form_dataset()

    # Assert
    names = [
        image.name
        for split in _SPLITS
        for image in _split_images_dir(today_ds, split).glob("*.jpg")
    ]
    assert len(names) == len(set(names))
    assert len(names) == 20