diff --git a/doctr/datasets/datasets/base.py b/doctr/datasets/datasets/base.py
index b75f9e7348..dff687395f 100644
--- a/doctr/datasets/datasets/base.py
+++ b/doctr/datasets/datasets/base.py
@@ -5,6 +5,7 @@
 
 import os
 import shutil
+import logging
 from collections.abc import Callable
 from pathlib import Path
 from typing import Any
@@ -46,29 +47,36 @@
     def _read_sample(self, index: int) -> tuple[Any, Any]:
         raise NotImplementedError
 
     def __getitem__(self, index: int) -> tuple[Any, Any]:
-        # Read image
-        img, target = self._read_sample(index)
-        # Pre-transforms (format conversion at run-time etc.)
-        if self._pre_transforms is not None:
-            img, target = self._pre_transforms(img, target)
-
-        if self.img_transforms is not None:
-            # typing issue cf. https://github.com/python/mypy/issues/5485
-            img = self.img_transforms(img)
-
-        if self.sample_transforms is not None:
-            # Conditions to assess it is detection model with multiple classes and avoid confusion with other tasks.
-            if (
-                isinstance(target, dict)
-                and all(isinstance(item, np.ndarray) for item in target.values())
-                and set(target.keys()) != {"boxes", "labels"}  # avoid confusion with obj detection target
-            ):
-                img_transformed = _copy_tensor(img)
-                for class_name, bboxes in target.items():
-                    img_transformed, target[class_name] = self.sample_transforms(img, bboxes)
-                img = img_transformed
-            else:
-                img, target = self.sample_transforms(img, target)
+        try:
+            img, target = self._read_sample(index)
+            # Pre-transforms (format conversion at run-time etc.)
+            if self._pre_transforms is not None:
+                img, target = self._pre_transforms(img, target)
+
+            if self.img_transforms is not None:
+                # typing issue cf. https://github.com/python/mypy/issues/5485
+                img = self.img_transforms(img)
+
+            if self.sample_transforms is not None:
+                # Conditions to assess it is detection model with multiple classes and avoid confusion with other tasks.
+                if (
+                    isinstance(target, dict)
+                    and all(isinstance(item, np.ndarray) for item in target.values())
+                    and set(target.keys()) != {"boxes", "labels"}  # avoid confusion with obj detection target
+                ):
+                    img_transformed = _copy_tensor(img)
+                    for class_name, bboxes in target.items():
+                        img_transformed, target[class_name] = self.sample_transforms(img, bboxes)
+                    img = img_transformed
+                else:
+                    img, target = self.sample_transforms(img, target)
+        except Exception:
+            if index == 0:
+                # No safe fallback left: re-raise instead of recursing forever.
+                raise
+            logging.exception("Error reading dataset sample %s", self.data[index][0])
+            # Fall back to the first sample so a long training run survives one bad file.
+            return self[0]
 
         return img, target
diff --git a/doctr/datasets/detection.py b/doctr/datasets/detection.py
index 0c8e79abab..f1b0307b39 100644
--- a/doctr/datasets/detection.py
+++ b/doctr/datasets/detection.py
@@ -54,13 +54,18 @@ def __init__(
         self.data: list[tuple[str, tuple[np.ndarray, list[str]]]] = []
         np_dtype = np.float32
+
+        missing_files = []
         for img_name, label in labels.items():
             # File existence check
             if not os.path.exists(os.path.join(self.root, img_name)):
-                raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}")
-
-            geoms, polygons_classes = self.format_polygons(label["polygons"], use_polygons, np_dtype)
-
-            self.data.append((img_name, (np.asarray(geoms, dtype=np_dtype), polygons_classes)))
+                missing_files.append(img_name)
+            else:
+                geoms, polygons_classes = self.format_polygons(label["polygons"], use_polygons, np_dtype)
+                self.data.append((img_name, (np.asarray(geoms, dtype=np_dtype), polygons_classes)))
+        if missing_files:
+            print(f"[{type(self).__name__}] skipping {len(missing_files)} missing image files:")
+            for img_name in missing_files:
+                print(f"  - {img_name}")
 
     def format_polygons(
         self, polygons: list | dict, use_polygons: bool, np_dtype: type
diff --git a/doctr/datasets/recognition.py b/doctr/datasets/recognition.py
index b0afca7085..2fe9466060 100644
--- a/doctr/datasets/recognition.py
+++ b/doctr/datasets/recognition.py
@@ -39,11 +39,16 @@ def __init__(
         with open(labels_path, encoding="utf-8") as f:
             labels = json.load(f)
 
+        missing_files = []
         for img_name, label in labels.items():
             if not os.path.exists(os.path.join(self.root, img_name)):
-                raise FileNotFoundError(f"unable to locate {os.path.join(self.root, img_name)}")
-
-            self.data.append((img_name, label))
+                missing_files.append(img_name)
+            else:
+                self.data.append((img_name, label))
+        if missing_files:
+            print(f"[{type(self).__name__}] skipping {len(missing_files)} missing image files:")
+            for img_name in missing_files:
+                print(f"  - {img_name}")
 
     def merge_dataset(self, ds: AbstractDataset) -> None:
         # Update data with new root for self
diff --git a/references/requirements.txt b/references/requirements.txt
index 90e24543de..84d95095bd 100644
--- a/references/requirements.txt
+++ b/references/requirements.txt
@@ -1,6 +1,7 @@
 -e .
 tqdm
 slack-sdk
+boto3>=1.9
 wandb>=0.10.31
 clearml>=1.11.1
 matplotlib>=3.1.0