# Source code for datamodules.RolfFormat.datasets.dataset

"""
Load a dataset of historic documents by specifying the folder where it is located.
"""

# Utils
import re
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import List, Tuple, Union

import torch.utils.data as data
from torch import is_tensor
from PIL import Image
from torchvision.datasets.folder import pil_loader
from torchvision.transforms import ToTensor

from src.datamodules.utils.misc import ImageDimensions, get_output_file_list
from src.utils import utils

# Image file extensions; presumably the formats that dataset code accepts as images. The tuple is
# not referenced in the visible part of this module -- TODO confirm where it is consumed.
IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.gif')

# Module-level logger obtained from the project's logging helper; used below to report missing directories.
log = utils.get_logger(__name__)


@dataclass
class DatasetSpecs:
    """
    Describes where the documents and the ground truth files of one split are located and which
    file numbers belong to that split. Because the split is selected through a number range,
    train/val/test can share the same data root and differ only in ``range_from`` / ``range_to``.

    The field names match the keyword arguments of ``DatasetRolfFormat._get_paths_from_specs``, which
    receives the specs via ``dataclasses.asdict``.
    """
    # Root folder; ``doc_dir`` and ``gt_dir`` are resolved relative to it.
    data_root: str
    # Sub-folder of ``data_root`` that contains the document images.
    doc_dir: str
    # File name pattern of the document images. It must contain exactly one run of ``#`` characters,
    # which stands for the file number zero-padded to the length of that run (e.g. ``img_####.png``).
    doc_names: str
    # Sub-folder of ``data_root`` that contains the ground truth files.
    gt_dir: str
    # File name pattern of the ground truth files; same placeholder rules as ``doc_names``.
    gt_names: str
    # First file number of the split (inclusive).
    range_from: int
    # Last file number of the split (inclusive).
    range_to: int


class DatasetRolfFormat(data.Dataset):
    """
    Dataset that loads the data in the Rolf format. The file names of a split are described by a pattern
    in which a single run of ``#`` characters stands for the zero-padded file number (e.g. the pattern
    ``name_####.jpg`` produces ``name_0042.jpg`` for file number 42).
    The different splits are defined by giving a range and a root folder for each split.

    :param dataset_specs: The dataset specs that specify the location of the data and ground truth files.
    :type dataset_specs: List[DatasetSpecs]
    :param image_dims: The height and width every loaded image and ground truth must have.
    :type image_dims: ImageDimensions
    :param is_test: Is it the test dataset? If so, ``__getitem__`` additionally returns the index,
        ``twin_transform`` is skipped, and ``image_path_list`` / ``output_file_list`` are populated.
    :type is_test: bool
    :param image_transform: Transformation applied before the tensor conversion. It is called as
        ``image_transform(img, gt)`` and must return the (img, gt) pair.
    :type image_transform: callable
    :param target_transform: Transformation applied after the tensor conversion. It is called as
        ``target_transform(img, gt)`` and must return the (img, gt) pair.
    :type target_transform: callable
    :param twin_transform: Transformation applied first to both the image and the ground truth. It is
        called as ``twin_transform(img, gt)`` and is not applied for the test dataset.
    :type twin_transform: callable
    :raises AssertionError: If the specs do not yield a single image / ground truth pair.
    """

    # A run of one or more '#' marks the position of the zero-padded file number in a name pattern.
    _PLACEHOLDER_PATTERN = re.compile(r'#+')

    def __init__(self, dataset_specs: List[DatasetSpecs], image_dims: ImageDimensions,
                 is_test: bool = False, image_transform: callable = None, target_transform: callable = None,
                 twin_transform: callable = None):
        """
        Constructor method for the DatasetRolfFormat class.
        """

        self.dataset_specs = dataset_specs

        self.image_dims = image_dims

        # transformations
        self.image_transform = image_transform
        self.target_transform = target_transform
        self.twin_transform = twin_transform

        self.is_test = is_test

        # List of tuples that contain the path to the image and gt that belong together
        self.img_gt_path_list = self.get_img_gt_path_list(list_specs=self.dataset_specs)

        if is_test:
            # These two attributes only exist on the test dataset.
            self.image_path_list = [img_gt_path[0] for img_gt_path in self.img_gt_path_list]
            self.output_file_list = get_output_file_list(image_path_list=self.image_path_list)

        self.num_samples = len(self.img_gt_path_list)

        # Raised explicitly (instead of a bare ``assert``) so the check also runs under ``python -O``;
        # the exception type is kept for backward compatibility.
        if self.num_samples <= 0:
            raise AssertionError('The dataset specs did not yield any image / ground truth pair.')

    def __len__(self):
        """
        Returns the number of image / ground truth pairs in the dataset, so the data loader knows
        when an epoch is over.

        :return: The number of samples.
        :rtype: int
        """
        return self.num_samples

    def __getitem__(self, index: int) -> Union[Tuple[Image.Image, Image.Image], Tuple[Image.Image, Image.Image, int]]:
        """
        This function returns the transformed image and ground truth for a given index. If it is the
        test dataset, the index itself is returned as a third element.

        :param index: The index of the sample that should be returned.
        :type index: int
        :return: (image, ground truth) for train/val, (image, ground truth, index) for test.
        :rtype: tuple
        """
        if self.is_test:
            return self._get_test_items(index=index)
        return self._get_train_val_items(index=index)

    def _get_train_val_items(self, index: int) -> Tuple[Image.Image, Image.Image]:
        """
        This function returns the transformed image and ground truth for a given index.

        :param index: The index of the sample that should be returned.
        :type index: int
        :return: The image and the ground truth for the given index.
        :rtype: tuple
        """
        data_img, gt_img = self._load_data_and_gt(index=index)
        img, gt = self._apply_transformation(data_img, gt_img)
        return img, gt

    def _get_test_items(self, index: int) -> Tuple[Image.Image, Image.Image, int]:
        """
        This function returns the transformed image and ground truth for a given index, together
        with the index.

        :param index: The index of the sample that should be returned.
        :type index: int
        :return: The image and the ground truth for the given index with the index.
        :rtype: tuple
        """
        data_img, gt_img = self._load_data_and_gt(index=index)
        img, gt = self._apply_transformation(data_img, gt_img)
        return img, gt, index

    def _load_data_and_gt(self, index: int) -> Tuple[Image.Image, Image.Image]:
        """
        This function loads the image and the ground truth for a given index and verifies that both
        have the dimensions given by ``image_dims``.

        :param index: The index of the sample that should be returned.
        :type index: int
        :return: The image and the ground truth for the given index.
        :rtype: tuple
        :raises AssertionError: If the image or the ground truth does not have the expected dimensions.
        """
        img_path = self.img_gt_path_list[index][0]
        gt_path = self.img_gt_path_list[index][1]

        data_img = pil_loader(str(img_path))
        gt_img = pil_loader(str(gt_path))

        expected_height = self.image_dims.height
        expected_width = self.image_dims.width

        # The image is checked before the ground truth. Explicit raises keep the checks active under
        # ``python -O`` and tell the user which file is wrong.
        for description, path, loaded in (('Image', img_path, data_img), ('Ground truth', gt_path, gt_img)):
            if loaded.height != expected_height or loaded.width != expected_width:
                raise AssertionError(
                    f'{description} "{path}" has dimensions {loaded.width}x{loaded.height} (width x height), '
                    f'expected {expected_width}x{expected_height}.')

        return data_img, gt_img

    def _apply_transformation(self, img: Image.Image, gt: Image.Image) -> Tuple[Image.Image, Image.Image]:
        """
        Applies the configured transformations in a fixed order: ``twin_transform`` (skipped for the
        test dataset), ``image_transform``, conversion of everything that is not yet a tensor with
        ``ToTensor``, and finally ``target_transform``. Every transformation receives both the image and
        the ground truth and must return both. If no transformations have been defined, the inputs are
        only converted to tensors.

        :param img: The original image onto which the transformations should be applied.
        :type img: Image.Image
        :param gt: The ground truth onto which the transformations should be applied.
        :type gt: Image.Image
        :return: The transformed image and ground truth.
        :rtype: tuple
        """
        if self.twin_transform is not None and not self.is_test:
            img, gt = self.twin_transform(img, gt)

        if self.image_transform is not None:
            # perform transformations
            img, gt = self.image_transform(img, gt)

        # Transformations may already have produced tensors; only convert what is still a non-tensor.
        if not is_tensor(img):
            img = ToTensor()(img)
        if not is_tensor(gt):
            gt = ToTensor()(gt)

        if self.target_transform is not None:
            img, gt = self.target_transform(img, gt)

        return img, gt

    @staticmethod
    def _split_name_pattern(name_pattern: str) -> Tuple[str, str, int]:
        """
        Splits a file name pattern at its ``#`` placeholder.

        :param name_pattern: File name pattern with exactly one run of ``#`` characters.
        :type name_pattern: str
        :return: The text before the placeholder, the text after it, and the placeholder length.
        :rtype: Tuple[str, str, int]
        :raises AssertionError: If the pattern does not contain exactly one placeholder group.
        """
        placeholders = list(DatasetRolfFormat._PLACEHOLDER_PATTERN.finditer(name_pattern))
        if len(placeholders) != 1:
            raise AssertionError(
                f'File name pattern "{name_pattern}" must contain exactly one group of "#" characters, '
                f'found {len(placeholders)}.')

        start, end = placeholders[0].span()
        return name_pattern[:start], name_pattern[end:], end - start

    @staticmethod
    def _get_paths_from_specs(data_root: str,
                              doc_dir: str, doc_names: str,
                              gt_dir: str, gt_names: str,
                              range_from: int, range_to: int) -> List[Tuple[Path, Path]]:
        """
        This function returns a list of tuples that contain the path to the image and gt that belong together.
        File numbers in the range for which neither the image nor the ground truth exists are skipped.

        :param data_root: The root where the data is located.
        :type data_root: str
        :param doc_dir: The directory (relative to the root) where the images are located.
        :type doc_dir: str
        :param doc_names: The name pattern of the images, with one run of ``#`` as number placeholder.
        :type doc_names: str
        :param gt_dir: The directory (relative to the root) where the ground truth is located.
        :type gt_dir: str
        :param gt_names: The name pattern of the ground truth, with one run of ``#`` as number placeholder.
        :type gt_names: str
        :param range_from: The first index of the range that should be used (inclusive).
        :type range_from: int
        :param range_to: The last index of the range that should be used (inclusive).
        :type range_to: int
        :return: A list of (image path, ground truth path) tuples, ordered by file number.
        :rtype: List[Tuple[Path, Path]]
        :raises AssertionError: If a name pattern is invalid, if only one file of a pair exists, or if
            no pair was found at all.
        """

        path_root = Path(data_root)
        path_doc_dir = path_root / doc_dir
        path_gt_dir = path_root / gt_dir

        # A missing directory is only logged here; it ends in the "no pairs found" error below.
        if not path_doc_dir.is_dir():
            log.error(f'Document directory not found ("{path_doc_dir}")!')

        if not path_gt_dir.is_dir():
            log.error(f'Ground Truth directory not found ("{path_gt_dir}")!')

        doc_prefix, doc_suffix, doc_number_length = DatasetRolfFormat._split_name_pattern(doc_names)
        gt_prefix, gt_suffix, gt_number_length = DatasetRolfFormat._split_name_pattern(gt_names)

        paths = []
        for i in range(range_from, range_to + 1):
            path_doc_file = path_doc_dir / f'{doc_prefix}{i:0{doc_number_length}d}{doc_suffix}'
            path_gt_file = path_gt_dir / f'{gt_prefix}{i:0{gt_number_length}d}{gt_suffix}'

            # Query the file system once per file and reuse the result.
            doc_exists = path_doc_file.exists()
            gt_exists = path_gt_file.exists()

            if doc_exists != gt_exists:
                if doc_exists:
                    present, missing = path_doc_file, path_gt_file
                else:
                    present, missing = path_gt_file, path_doc_file
                raise AssertionError(f'"{present}" exists but its counterpart "{missing}" does not.')

            if doc_exists:
                paths.append((path_doc_file, path_gt_file))

        if not paths:
            raise AssertionError(
                f'No image / ground truth pairs found in "{path_doc_dir}" and "{path_gt_dir}" '
                f'for file numbers {range_from} to {range_to}.')

        return paths

    @staticmethod
    def get_img_gt_path_list(list_specs: List[DatasetSpecs]) -> List[Tuple[Path, Path]]:
        """
        Returns a list of tuples that contain the path to the image and gt that belong together. The
        pairs of all specs are concatenated in the order of the specs.

        :param list_specs: The dataset specs that specify the location of the data and ground truth files.
        :type list_specs: List[DatasetSpecs]
        :return: A list of tuples that contain the path to the image and gt that belong together.
        :rtype: List[Tuple[Path, Path]]
        """
        paths = []

        for specs in list_specs:
            # The dataclass field names match the keyword arguments of _get_paths_from_specs.
            paths.extend(DatasetRolfFormat._get_paths_from_specs(**asdict(specs)))

        return paths