# Source code for datamodules.Classification.datamodule

from pathlib import Path
from typing import Union, List, Optional, Dict, Callable

from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import ImageFolder

from src.datamodules.Classification.utils.image_analytics import get_analytics_data_image_folder
from src.datamodules.Classification.utils.misc import validate_path_for_classification
from src.datamodules.base_datamodule import AbstractDatamodule
from src.datamodules.utils.misc import get_image_dims
from src.utils import utils

# Module-level logger, obtained from the project's logging helper and named after this module.
log = utils.get_logger(__name__)

class ClassificationDatamodule(AbstractDatamodule):
    """
    Datamodule for a classification task. It takes advantage of the ImageFolder class from torchvision.

    The data is expected to be in the following format::

        data_dir
        ├── train
        │   ├── 0
        │   │   ├── image_1.png
        │   │   ├── ...
        │   │   └── image_N.png
        │   ├── ...
        │   └── N
        │       ├── image_1.png
        │       ├── ...
        │       └── image_N.png
        ├── val
        │   ├── 0
        │   │   ├── image_1.png
        │   │   ├── ...
        │   │   └── image_N.png
        │   ├── ...
        │   └── N
        │       ├── image_1.png
        │       ├── ...
        │       └── image_N.png
        └── test
            ├── 0
            │   ├── image_1.png
            │   ├── ...
            │   └── image_N.png
            ├── ...
            └── N
                ├── image_1.png
                ├── ...
                └── image_N.png

    Only the ``train`` and ``val`` splits are loaded by this datamodule; requesting the test stage or the
    test dataloader raises a :class:`ValueError`.

    :param data_dir: Path to the root directory of the dataset.
    :type data_dir: str
    :param selection_train: Either an integer or a list of strings. If an integer is provided, the first n classes
        are selected. If a list of strings is provided, the classes with the given names are selected.
        This class only stores the value on the instance; it does not apply the selection itself.
    :type selection_train: Optional[Union[int, List[str]]]
    :param selection_val: Either an integer or a list of strings. If an integer is provided, the first n classes
        are selected. If a list of strings is provided, the classes with the given names are selected.
        This class only stores the value on the instance; it does not apply the selection itself.
    :type selection_val: Optional[Union[int, List[str]]]
    :param num_workers: Number of workers for the dataloaders.
    :type num_workers: int
    :param batch_size: Batch size for the dataloaders.
    :type batch_size: int
    :param shuffle: Whether to shuffle the data. Applied to both the train and the val dataloader.
    :type shuffle: bool
    :param drop_last: Whether to drop the last batch if it is smaller than the batch size. Applied to both the
        train and the val dataloader.
    :type drop_last: bool
    """

    def __init__(self, data_dir: str,
                 selection_train: Optional[Union[int, List[str]]] = None,
                 selection_val: Optional[Union[int, List[str]]] = None,
                 num_workers: int = 4, batch_size: int = 8,
                 shuffle: bool = True, drop_last: bool = True) -> None:
        """
        Constructor method for the ClassificationDatamodule class.

        Reads the dataset analytics (mean / std), builds the image transform, validates the data directory and
        inspects the train split once to derive the class list and the image dimensions.
        """
        super().__init__()

        # The analytics helper is called on the raw path, before validation, as in the original code.
        analytics_data = get_analytics_data_image_folder(input_path=Path(data_dir))
        self.mean = analytics_data['mean']
        # NOTE(review): the original carried an unexplained "# error" marker on the next line - confirm intent.
        self.std = analytics_data['std']

        # Images are converted to tensors and then normalised with the dataset statistics.
        self.image_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=self.mean, std=self.std),
        ])

        self.num_workers = num_workers
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.drop_last = drop_last

        self.data_dir = validate_path_for_classification(data_dir=data_dir)

        self.selection_train = selection_train
        self.selection_val = selection_val

        # A throw-away train dataset is built here only to read the class names and the image list;
        # the datasets used for training are created in setup().
        train_set = ImageFolder(**self._create_dataset_parameters('train'))
        self.classes = train_set.classes
        self.num_classes = len(self.classes)

        image_dims = get_image_dims(data_gt_path_list=train_set.imgs)
        self.image_dims = image_dims
        # NOTE(review): stored as (3, width, height); the channel count is hard-coded - confirm the expected
        # axis order with the consumers of ``dims``.
        self.dims = (3, self.image_dims.width, self.image_dims.height)

        # Populated in setup().
        self.train = None
        self.val = None
        self.train_loader = None
        self.val_loader = None

    def setup(self, stage: Optional[str] = None) -> None:
        """
        Creates the train and val datasets for the 'fit' stage (or when no stage is given).

        :param stage: Stage to set up. 'fit' or None builds the train and val datasets.
        :type stage: Optional[str]
        :raises ValueError: If ``stage`` is 'test', since no test data is supported.
        """
        super().setup()

        if stage == 'fit' or stage is None:
            self.train = ImageFolder(**self._create_dataset_parameters('train'))
            log.info(f'Initialized train dataset with {len(self.train)} samples.')
            # check_min_num_samples is not defined in this class; presumably inherited from AbstractDatamodule.
            self.check_min_num_samples(self.trainer.num_devices, self.batch_size,
                                       num_samples=len(self.train),
                                       data_split='train',
                                       drop_last=self.drop_last)

            self.val = ImageFolder(**self._create_dataset_parameters('val'))
            log.info(f'Initialized val dataset with {len(self.val)} samples.')
            self.check_min_num_samples(self.trainer.num_devices, self.batch_size,
                                       num_samples=len(self.val),
                                       data_split='val',
                                       drop_last=self.drop_last)

        if stage == 'test':
            raise ValueError('Test data is not available for Classification.')

    def train_dataloader(self, *args, **kwargs) -> DataLoader:
        """
        Builds the dataloader over the train dataset created in setup().

        :return: Dataloader for the train split.
        :rtype: DataLoader
        """
        return DataLoader(self.train,
                          batch_size=self.batch_size,
                          num_workers=self.num_workers,
                          shuffle=self.shuffle,
                          drop_last=self.drop_last,
                          pin_memory=True)

    def val_dataloader(self, *args, **kwargs) -> Union[DataLoader, List[DataLoader]]:
        """
        Builds the dataloader over the val dataset created in setup().

        :return: Dataloader for the val split.
        :rtype: Union[DataLoader, List[DataLoader]]
        """
        # NOTE(review): the val loader uses the same shuffle / drop_last settings as the train loader,
        # so validation samples may be shuffled and the last partial batch dropped - confirm this is intended.
        return DataLoader(self.val,
                          batch_size=self.batch_size,
                          num_workers=self.num_workers,
                          shuffle=self.shuffle,
                          drop_last=self.drop_last,
                          pin_memory=True)

    def test_dataloader(self, *args, **kwargs) -> Union[DataLoader, List[DataLoader]]:
        """
        Always raises, because no test data is supported for classification.

        :raises ValueError: On every call.
        """
        raise ValueError('Test data is not available for Classification.')

    def _create_dataset_parameters(self, dataset_type: str = 'train') -> Dict[str, Union[Path, Callable]]:
        """
        Creates the parameters for the ImageFolder dataset.

        :param dataset_type: Type of the dataset. Either 'train', 'val' or 'test'. Used as the name of the
            sub-directory of ``data_dir``.
        :type dataset_type: str
        :return: Parameters for the ImageFolder dataset (``root`` and ``transform``).
        :rtype: Dict[str, Union[Path, Callable]]
        """
        return {
            'root': self.data_dir / dataset_type,
            'transform': self.image_transform,
        }