fedbiomed.node.dataset_manager

Module: fedbiomed.node.dataset_manager

Interfaces with the node component database.

Classes

DatasetManager

CLASS
DatasetManager()

Interfaces with the node component database.

Facility for storing data, retrieving data and getting data info for the node. Currently uses TinyDB.

Source code in fedbiomed/node/dataset_manager.py
def __init__(self):
    """Constructor of the class.
    """
    self._db = TinyDB(environ['DB_PATH'])
    self._database = Query()

    # don't use DB read cache to ensure coherence
    # (eg when mixing CLI commands with a GUI session)
    self._dataset_table = self._db.table(name='Datasets', cache_size=0)
    self._dlp_table = self._db.table(name='Data_Loading_Plans', cache_size=0)
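
A minimal instantiation sketch. It assumes a node environment has been initialized so that environ['DB_PATH'] points to the node's TinyDB file, as used by the constructor above.

from fedbiomed.node.dataset_manager import DatasetManager

# The constructor opens the TinyDB database at environ['DB_PATH'],
# so the node configuration must be in place before this call.
manager = DatasetManager()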

Functions

add_database(name, data_type, tags, description, path=None, dataset_id=None, dataset_parameters=None, data_loading_plan=None, save_dlp=True)

Adds a new dataset contained in a file to the node's database.

Parameters:

- name (str, required): Name of the dataset.
- data_type (str, required): File extension/format of the dataset (*.csv, images, ...).
- tags (Union[tuple, list], required): Tags of the dataset.
- description (str, required): Human-readable description of the dataset.
- path (Optional[str], default None): Path to the dataset.
- dataset_id (Optional[str], default None): Id of the dataset.
- dataset_parameters (Optional[dict], default None): A dictionary of additional (customized) parameters, or None.
- data_loading_plan (Optional[DataLoadingPlan], default None): A DataLoadingPlan to be linked to this dataset, or None.
- save_dlp (bool, default True): If True, save the data_loading_plan.

Raises:

- NotImplementedError: data_type is not supported.
- FedbiomedDatasetManagerError: path does not exist or the dataset was not saved properly.

Returns:

- str: The dataset_id of the registered dataset (the one provided, or a newly generated id).

Source code in fedbiomed/node/dataset_manager.py
def add_database(self,
                 name: str,
                 data_type: str,
                 tags: Union[tuple, list],
                 description: str,
                 path: Optional[str] = None,
                 dataset_id: Optional[str] = None,
                 dataset_parameters : Optional[dict] = None,
                 data_loading_plan: Optional[DataLoadingPlan] = None,
                 save_dlp: bool = True):
    """Adds a new dataset contained in a file to node's database.

    Args:
        name: Name of the dataset
        data_type: File extension/format of the
            dataset (*.csv, images, ...)
        tags: Tags of the dataset.
        description: Human readable description of the dataset.
        path: Path to the dataset. Defaults to None.
        dataset_id: Id of the dataset. Defaults to None.
        dataset_parameters: a dictionary of additional (customized) parameters, or None
        data_loading_plan: a DataLoadingPlan to be linked to this dataset, or None
        save_dlp: if True, save the `data_loading_plan`

    Raises:
        NotImplementedError: `data_type` is not supported.
        FedbiomedDatasetManagerError: path does not exist or dataset was not saved properly.
    """
    # Accept tilde as home folder
    if path is not None:
        path = os.path.expanduser(path)

    # Check that there is no existing dataset with conflicting tags
    conflicting = self.search_conflicting_tags(tags)
    if len(conflicting) > 0:
        msg = f"{ErrorNumbers.FB322.value}, one or more registered dataset has conflicting tags: " \
            f" {' '.join([ c['name'] for c in conflicting ])}"
        logger.critical(msg)
        raise FedbiomedDatasetManagerError(msg)

    dtypes = []  # empty list for Image datasets
    data_types = ['csv', 'default', 'mednist', 'images', 'medical-folder', 'flamby']

    if data_type not in data_types:
        raise NotImplementedError(f'Data type {data_type} is not'
                                  ' a compatible data type. '
                                  f'Compatible data types are: {data_types}')

    elif data_type == 'flamby':
        # check that data loading plan is present and well formed
        if data_loading_plan is None or \
                FlambyLoadingBlockTypes.FLAMBY_DATASET_METADATA not in data_loading_plan:
            msg = f"{ErrorNumbers.FB316.value}. A DataLoadingPlan containing " \
                  f"{FlambyLoadingBlockTypes.FLAMBY_DATASET_METADATA.value} is required for adding a FLamby dataset " \
                  f"to the database."
            logger.critical(msg)
            raise FedbiomedDatasetManagerError(msg)

        # initialize a dataset and link to the flamby data. If all goes well, compute shape.
        try:
            dataset = FlambyDataset()
            dataset.set_dlp(data_loading_plan)  # initializes fed_class as a side effect
        except FedbiomedError as e:
            raise FedbiomedDatasetManagerError(f"Can not create FLamby dataset. {e}")
        else:
            shape = dataset.shape()

    if data_type == 'default':
        assert os.path.isdir(path), f'Folder {path} for Default Dataset does not exist.'
        shape = self.load_default_database(name, path)

    elif data_type == 'mednist':
        assert os.path.isdir(path), f'Folder {path} for MedNIST Dataset does not exist.'
        shape = self.load_mednist_database(path)
        path = os.path.join(path, 'MedNIST')

    elif data_type == 'csv':
        assert os.path.isfile(path), f'Path provided ({path}) does not correspond to a CSV file.'
        dataset = self.load_csv_dataset(path)
        shape = dataset.shape
        dtypes = self.get_csv_data_types(dataset)

    elif data_type == 'images':
        assert os.path.isdir(path), f'Folder {path} for Images Dataset does not exist.'
        shape = self.load_images_dataset(path)

    elif data_type == 'medical-folder':
        if not os.path.isdir(path):
            raise FedbiomedDatasetManagerError(f'Folder {path} for Medical Folder Dataset does not exist.')

        if "tabular_file" not in dataset_parameters:
            logger.info("Medical Folder Dataset will be loaded without reference/demographics data.")
        else:
            if not os.path.isfile(dataset_parameters['tabular_file']):
                raise FedbiomedDatasetManagerError(f'Path {dataset_parameters["tabular_file"]} does not '
                                                   f'correspond to a file.')
            if "index_col" not in dataset_parameters:
                raise FedbiomedDatasetManagerError('Index column is not provided')

        try:
            # load using the MedicalFolderController to ensure all available modalities are inspected
            controller = MedicalFolderController(root=path)
            if data_loading_plan is not None:
                controller.set_dlp(data_loading_plan)
            dataset = controller.load_MedicalFolder(tabular_file=dataset_parameters.get('tabular_file', None),
                                                    index_col=dataset_parameters.get('index_col', None))

        except FedbiomedError as e:
            raise FedbiomedDatasetManagerError(f"Can not create Medical Folder dataset. {e}")
        else:
            shape = dataset.shape()

        # try to read one sample and raise if it doesn't work
        try:
            _ = dataset.get_nontransformed_item(0)
        except Exception as e:
            raise FedbiomedDatasetManagerError(f'Medical Folder Dataset was not saved properly and '
                                               f'cannot be read. {e}')

    if not dataset_id:
        dataset_id = 'dataset_' + str(uuid.uuid4())

    new_database = dict(name=name, data_type=data_type, tags=tags,
                        description=description, shape=shape,
                        path=path, dataset_id=dataset_id, dtypes=dtypes,
                        dataset_parameters=dataset_parameters)
    if save_dlp:
        dlp_id = self.save_data_loading_plan(data_loading_plan)
    elif isinstance(data_loading_plan, DataLoadingPlan):
        dlp_id = data_loading_plan.dlp_id
    else:
        dlp_id = None
    if dlp_id is not None:
        new_database['dlp_id'] = dlp_id
    self._dataset_table.insert(new_database)

    return dataset_id
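
A hedged usage sketch: registering a local CSV file. The name, tags, and path below are hypothetical placeholders, and manager is assumed to be a DatasetManager instance.

# Register a CSV dataset; a dataset_id is generated when none is given.
dataset_id = manager.add_database(
    name='clinical-data',
    data_type='csv',
    tags=['#clinical', '#csv'],
    description='Synthetic clinical records for testing',
    path='~/data/clinical.csv',  # placeholder; tilde is expanded by the method
)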
get_by_id(dataset_id)

Searches for a dataset with a given dataset_id.

Parameters:

- dataset_id (str, required): A dataset id.

Returns:

- Union[dict, None]: A dict containing the dataset's description if a dataset with this dataset_id exists in the database; None if no such dataset exists.

Source code in fedbiomed/node/dataset_manager.py
def get_by_id(self, dataset_id: str) -> Union[dict, None]:
    """Searches for a dataset with given dataset_id.

    Args:
        dataset_id:  A dataset id

    Returns:
        A `dict` containing the dataset's description if a dataset with this `dataset_id`
        exists in the database. `None` if no such dataset exists in the database. 
    """
    result = self._dataset_table.get(self._database.dataset_id == dataset_id)

    return result
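
A short usage sketch (the id is a placeholder, manager a DatasetManager instance):

dataset = manager.get_by_id('dataset_1234')  # hypothetical id
if dataset is None:
    print('no dataset with this id')
else:
    print(dataset['name'], dataset['tags'])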
get_csv_data_types(dataset)

Gets the data types of each variable in the dataset.

Parameters:

- dataset (pd.DataFrame, required): A Pandas DataFrame.

Returns:

- List[str]: A list of strings containing the data types.

Source code in fedbiomed/node/dataset_manager.py
def get_csv_data_types(self, dataset: pd.DataFrame) -> List[str]:
    """Gets data types of each variable in dataset.

    Args:
        dataset: A Pandas dataset.

    Returns:
        A list of strings containing data types.
    """
    types = [str(t) for t in dataset.dtypes]

    return types
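
Since the method simply stringifies the pandas dtypes, its behaviour can be illustrated with a toy DataFrame (manager is assumed to be a DatasetManager instance):

import pandas as pd

df = pd.DataFrame({'age': [34, 52], 'weight': [71.5, 80.2], 'sex': ['F', 'M']})
print(manager.get_csv_data_types(df))
# ['int64', 'float64', 'object']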
get_data_loading_blocks_by_ids(dlb_ids)

Search for a list of DataLoadingBlockTypes, each corresponding to one given id.

Note that in case of conflicting ids (which should not happen), this function will silently return a random one with the sought id.

DataLoadingBlock IDs always start with 'serialized_data_loading_block_' and should be unique in the database.

Parameters:

- dlb_ids (List[str], required): A list of DataLoadingBlock IDs.

Returns:

- List[dict]: A list of dictionaries, each containing the DataLoadingBlock metadata corresponding to one given id.

Source code in fedbiomed/node/dataset_manager.py
def get_data_loading_blocks_by_ids(self, dlb_ids: List[str]) -> List[dict]:
    """Search for a list of DataLoadingBlockTypes, each corresponding to one given id.

    Note that in case of conflicting ids (which should not happen), this function will silently return a random
    one with the sought id.

    DataLoadingBlock IDs always start with 'serialized_data_loading_block_' and should be unique in the database.

    Args:
        dlb_ids: (List[str]) a list of DataLoadingBlock IDs

    Returns:
        A list of dictionaries, each one containing the DataLoadingBlock metadata corresponding to one given id.
    """
    return self._dlp_table.search(self._database.dlb_id.one_of(dlb_ids))
get_dlp_by_id(dlp_id)

Search for a DataLoadingPlan with a given id.

Note that in case of conflicting ids (which should not happen), this function will silently return a random one with the sought id.

DataLoadingPlan IDs always start with 'dlp_' and should be unique in the database.

Parameters:

- dlp_id (str, required): The DataLoadingPlan id.

Returns:

- Tuple[dict, List[dict]]: A tuple containing a dictionary with the DataLoadingPlan metadata corresponding to the given id, and the list of metadata dictionaries of its attached DataLoadingBlocks.

Source code in fedbiomed/node/dataset_manager.py
def get_dlp_by_id(self, dlp_id: str) -> Tuple[dict, List[dict]]:
    """Search for a DataLoadingPlan with a given id.

    Note that in case of conflicting ids (which should not happen), this function will silently return a random
    one with the sought id.

    DataLoadingPlan IDs always start with 'dlp_' and should be unique in the database.

    Args:
        dlp_id: (str) the DataLoadingPlan id

    Returns:
        A Tuple containing a dictionary with the DataLoadingPlan metadata corresponding to the given id.
    """
    dlp_metadata = self._dlp_table.get(self._database.dlp_id == dlp_id)
    return dlp_metadata, self._dlp_table.search(
        self._database.dlb_id.one_of(dlp_metadata['loading_blocks'].values()))
get_torch_dataset_shape(dataset)

Gets info about dataset shape.

Parameters:

- dataset (torch.utils.data.Dataset, required): A PyTorch dataset.

Returns:

- List[int]: A list of int containing [<nb_of_data>, <dimension_of_first_input_data>]. Example for MNIST: [60000, 1, 28, 28], where <nb_of_data>=60000 and <dimension_of_first_input_data>=1, 28, 28.

Source code in fedbiomed/node/dataset_manager.py
def get_torch_dataset_shape(self, dataset: torch.utils.data.Dataset) -> List[int]:
    """Gets info about dataset shape.

    Args:
        dataset: A Pytorch dataset

    Returns:
        A list of int containing
            [<nb_of_data>, <dimension_of_first_input_data>].
            Example for MNIST: [60000, 1, 28, 28], where <nb_of_data>=60000
            and <dimension_of_first_input_data>=1, 28, 28
    """
    return [len(dataset)] + list(dataset[0][0].shape)
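
The returned list is the dataset length prepended to the shape of the first sample. A sketch with torchvision's MNIST (the download path is a placeholder, manager a DatasetManager instance):

from torchvision import datasets, transforms

mnist = datasets.MNIST(root='/tmp/mnist', download=True,
                       transform=transforms.ToTensor())
print(manager.get_torch_dataset_shape(mnist))
# [60000, 1, 28, 28]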
list_dlp(target_dataset_type=None)

Return all existing DataLoadingPlans.

Parameters:

- target_dataset_type (Optional[str], default None): If specified, return only DLPs matching the requested target type.

Raises:

- FedbiomedDatasetManagerError: target_dataset_type is not a str, or is not one of the values defined in fedbiomed.common.constants.DatasetTypes.

Returns:

- List[dict]: A list of dicts, each one representing a DataLoadingPlan.

Source code in fedbiomed/node/dataset_manager.py
def list_dlp(self, target_dataset_type: Optional[str] = None) -> List[dict]:
    """Return all existing DataLoadingPlans.

    Args:
        target_dataset_type: (str or None) if specified, return only dlps matching the requested target type.

    Returns:
        An array of dict, each dict is a DataLoadingPlan
    """
    if target_dataset_type is not None:
        if not isinstance(target_dataset_type, str):
            raise FedbiomedDatasetManagerError(f"Wrong input type for target_dataset_type. "
                                               f"Expected str, got {type(target_dataset_type)} instead.")
        if target_dataset_type not in [t.value for t in DatasetTypes]:
            raise FedbiomedDatasetManagerError("target_dataset_type should be of the values defined in "
                                               "fedbiomed.common.constants.DatasetTypes")

        dlps = self._dlp_table.search(
            (self._database.dlp_id.exists()) &
            (self._database.dlp_name.exists()) &
            (self._database.target_dataset_type == target_dataset_type))
    else:
        dlps = self._dlp_table.search(
            (self._database.dlp_id.exists()) & (self._database.dlp_name.exists()))
    return [dict(dlp) for dlp in dlps]
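
Usage sketch. The filter value must be one of the fedbiomed.common.constants.DatasetTypes values; 'medical-folder' is assumed here to be one of them.

# All saved DLPs
all_dlps = manager.list_dlp()

# Only DLPs targeting a given dataset type (assumed valid DatasetTypes value)
mf_dlps = manager.list_dlp(target_dataset_type='medical-folder')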
list_my_data(verbose=True)

Lists all datasets on the node.

Parameters:

- verbose (bool, default True): Give verbose output.

Returns:

- List[dict]: All datasets in the node's database.

Source code in fedbiomed/node/dataset_manager.py
def list_my_data(self, verbose: bool = True) -> List[dict]:
    """Lists all datasets on the node.

    Args:
        verbose: Give verbose output. Defaults to True.

    Returns:
        All datasets in the node's database.
    """
    my_data = self._dataset_table.all()

    # Do not display dtypes
    for doc in my_data:
        doc.pop('dtypes')

    if verbose:
        print(tabulate(my_data, headers='keys'))

    return my_data
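
Usage sketch (note that the dtypes field is removed from the returned entries):

datasets_info = manager.list_my_data(verbose=False)
for info in datasets_info:
    print(info['dataset_id'], info['tags'])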
load_as_dataloader(dataset)

Loads content of an image dataset.

Parameters:

- dataset (dict, required): Description of the dataset.

Returns:

- torch.utils.data.Dataset: Content of the dataset.

Source code in fedbiomed/node/dataset_manager.py
def load_as_dataloader(self, dataset: dict) -> torch.utils.data.Dataset:
    """Loads content of an image dataset.

    Args:
        dataset: Description of the dataset.

    Returns:
        Content of the dataset.
    """
    name = dataset['data_type']
    if name == 'default':
        return self.load_default_database(name=dataset['name'],
                                          path=dataset['path'],
                                          as_dataset=True)
    elif name == 'images':
        return self.load_images_dataset(folder_path=dataset['path'],
                                        as_dataset=True)
load_csv_dataset(path)

Loads a CSV dataset.

Parameters:

- path (str, required): Path to the CSV file.

Returns:

- pd.DataFrame: Pandas DataFrame with the content of the file.

Source code in fedbiomed/node/dataset_manager.py
def load_csv_dataset(self, path: str) -> pd.DataFrame:
    """Loads a CSV dataset.

    Args:
        path: Path to the CSV file.

    Returns:
        Pandas DataFrame with the content of the file.
    """
    return self.read_csv(path)
load_data(tags, mode)

Loads content of a dataset.

Parameters:

- tags (Union[tuple, list], required): Tags describing the dataset to load.
- mode (str, required): Return format for the dataset content.

Raises:

- NotImplementedError: mode is not implemented yet.

Returns:

- Any: Content of the dataset. Its type depends on the mode and dataset.

Source code in fedbiomed/node/dataset_manager.py
def load_data(self, tags: Union[tuple, list], mode: str) -> Any:
    """Loads content of a dataset.

    Args:
        tags: Tags describing the dataset to load.
        mode: Return format for the dataset content.

    Raises:
        NotImplementedError: `mode` is not implemented yet.

    Returns:
        Content of the dataset. Its type depends on the `mode` and dataset.
    """

    # Verify that the mode is available
    mode = mode.lower()
    modes = ['pandas', 'torch_dataset', 'torch_tensor', 'numpy']
    if mode not in modes:
        raise NotImplementedError(f'Data mode `{mode}` was not found.'
                                  f' Data modes available: {modes}')

    # Look for dataset in database
    matching = self.search_by_tags(tags)
    assert len(matching) > 0, f'Dataset with tags {tags} was not found.'
    dataset = matching[0]

    dataset_path = dataset['path']
    # If the path is a file, try to read it as a CSV file
    if os.path.isfile(dataset_path):
        df = self.read_csv(dataset_path, index_col=0)

        # Load data as requested
        if mode == 'pandas':
            return df
        elif mode == 'numpy':
            return df._get_numeric_data().values
        elif mode == 'torch_tensor':
            return torch.from_numpy(df._get_numeric_data().values)

    elif os.path.isdir(dataset_path):
        if mode == 'torch_dataset':
            return self.load_as_dataloader(dataset)
        elif mode == 'torch_tensor':
            raise NotImplementedError('We are working on this'
                                      ' implementation!')
        elif mode == 'numpy':
            raise NotImplementedError('We are working on this'
                                      ' implementation!')
        else:
            raise NotImplementedError(f'Mode `{mode}` has not been'
                                      ' implemented on this version.')
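
A hedged sketch of loading a previously registered dataset in different modes; the tags are placeholders and manager is a DatasetManager instance.

# Dataset whose path is a CSV file: 'pandas', 'numpy' and 'torch_tensor' work.
df = manager.load_data(['#clinical', '#csv'], mode='pandas')        # pd.DataFrame
arr = manager.load_data(['#clinical', '#csv'], mode='numpy')        # numeric columns only
t = manager.load_data(['#clinical', '#csv'], mode='torch_tensor')   # torch.Tensor

# Dataset whose path is a folder: only 'torch_dataset' is implemented.
image_dataset = manager.load_data(['#images'], mode='torch_dataset')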
load_default_database(name, path, as_dataset=False)

Loads a default dataset.

Currently, only MNIST dataset is used as the default dataset.

Parameters:

- name (str, required): Name of the default dataset. Currently, only MNIST is accepted.
- path (str, required): Path to the MNIST dataset.
- as_dataset (bool, default False): Whether to return the complete dataset (True) or the dataset dimensions (False).

Raises:

- NotImplementedError: name does not match the name of a default dataset.

Returns:

- Union[List[int], torch.utils.data.Dataset]: Depends on the value of the parameter as_dataset: if set to True, returns the dataset (type: torch.utils.data.Dataset); if set to False, returns the size of the dataset stored inside a list (type: List[int]).

Source code in fedbiomed/node/dataset_manager.py
def load_default_database(self,
                          name: str,
                          path: str,
                          as_dataset: bool = False) -> Union[List[int],
                                                             torch.utils.data.Dataset]:
    """Loads a default dataset.

    Currently, only MNIST dataset is used as the default dataset.

    Args:
        name: Name of the default dataset. Currently,
            only MNIST is accepted.
        path: Path to the MNIST dataset.
        as_dataset: Whether to return
            the complete dataset (True) or dataset dimensions (False).
            Defaults to False.

    Raises:
        NotImplementedError: Name is not matching with
            the name of a default dataset.

    Returns:
        Depends on the value of the parameter `as_dataset`: If
        set to True,  returns dataset (type: torch.utils.data.Dataset).
        If set to False, returns the size of the dataset stored inside
        a list (type: List[int]).
    """
    kwargs = dict(root=path, download=True, transform=transforms.ToTensor())

    if 'mnist' in name.lower():
        dataset = datasets.MNIST(**kwargs)
    else:
        raise NotImplementedError(f'Default dataset `{name}` has'
                                  ' not been implemented.')
    if as_dataset:
        return dataset
    else:
        return self.get_torch_dataset_shape(dataset)
load_images_dataset(folder_path, as_dataset=False)

Loads an image dataset.

Parameters:

- folder_path (str, required): Path to the directory containing the images.
- as_dataset (bool, default False): Whether to return the complete dataset (True) or the dataset dimensions (False).

Returns:

- Union[List[int], torch.utils.data.Dataset]: Depends on the value of the parameter as_dataset: if set to True, returns the dataset (type: torch.utils.data.Dataset); if set to False, returns the size of the dataset stored inside a list (type: List[int]).

Source code in fedbiomed/node/dataset_manager.py
def load_images_dataset(self,
                        folder_path: str,
                        as_dataset: bool = False) -> Union[List[int],
                                                           torch.utils.data.Dataset]:
    """Loads an image dataset.

    Args:
        folder_path: Path to the directory containing the images.
        as_dataset: Whether to return
            the complete dataset (True) or dataset dimensions (False).
            Defaults to False.

    Returns:
        Depends on the value of the parameter `as_dataset`: If
        set to True,  returns dataset (type: torch.utils.data.Dataset).
        If set to False, returns the size of the dataset stored inside
        a list (type: List[int])
    """
    try:
        dataset = datasets.ImageFolder(folder_path,
                                       transform=transforms.ToTensor())
    except Exception as e:
        _msg = ErrorNumbers.FB315.value +\
            "\nThe following error was raised while loading dataset from the selected" \
            " path:  " + str(e) + "\nPlease make sure that the selected folder is not empty \
            and doesn't have any empty class folder"
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)

    if as_dataset:
        return dataset
    else:
        return self.get_torch_dataset_shape(dataset)
load_mednist_database(path, as_dataset=False)

Loads the MedNist dataset.

Parameters:

- path (str, required): Path where a local copy of the MedNIST dataset is stored (downloaded if absent).
- as_dataset (bool, default False): Whether to return the complete dataset (True) or the dataset dimensions (False).

Raises:

- FedbiomedDatasetManagerError: one of the following cases:
  - the tarfile cannot be downloaded
  - the downloaded tarfile cannot be extracted
  - the MedNIST path is empty
  - one of the class paths is empty

Returns:

- Union[List[int], torch.utils.data.Dataset]: Depends on the value of the parameter as_dataset: if set to True, returns the dataset (type: torch.utils.data.Dataset); if set to False, returns the size of the dataset stored inside a list (type: List[int]).

Source code in fedbiomed/node/dataset_manager.py
def load_mednist_database(self,
                          path: str,
                          as_dataset: bool = False) -> Union[List[int],
                                                             torch.utils.data.Dataset]:
    """Loads the MedNist dataset.

    Args:
        path: Path where a local copy of the MedNIST dataset is stored (downloaded if absent).
        as_dataset: Whether to return
            the complete dataset (True) or dataset dimensions (False).
            Defaults to False.

    Raises:
        FedbiomedDatasetManagerError: One of the following cases:

            - tarfile cannot be downloaded
            - downloaded tarfile cannot
                be extracted
            - MedNIST path is empty
            - one of the classes path is empty

    Returns:
        Depends on the value of the parameter `as_dataset`: If
        set to True,  returns dataset (type: torch.utils.data.Dataset).
        If set to False, returns the size of the dataset stored inside
        a list (type: List[int])
    """
    download_path = os.path.join(path, 'MedNIST')
    if not os.path.isdir(download_path):
        url = "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/MedNIST.tar.gz"
        filepath = os.path.join(path, 'MedNIST.tar.gz')
        try:
            logger.info("Now downloading MEDNIST...")
            urlretrieve(url, filepath)
            with tarfile.open(filepath) as tar_file:
                logger.info("Now extracting MEDNIST...")
                tar_file.extractall(path)
            os.remove(filepath)

        except (URLError, HTTPError, ContentTooShortError, OSError, tarfile.TarError,
                MemoryError) as e:
            _msg = ErrorNumbers.FB315.value + "\nThe following error was raised while downloading MedNIST dataset"\
                + "from the MONAI repo:  " + str(e)
            logger.error(_msg)
            raise FedbiomedDatasetManagerError(_msg)

    try:
        dataset = datasets.ImageFolder(download_path,
                                       transform=transforms.ToTensor())

    except (FileNotFoundError, RuntimeError) as e:
        _msg = ErrorNumbers.FB315.value + "\nThe following error was raised while loading MedNIST dataset from"\
            "the selected path:  " + str(e) + "\nPlease make sure that the selected MedNIST folder is not empty \
               or choose another path."
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)

    except Exception as e:
        _msg = ErrorNumbers.FB315.value + "\nThe following error was raised while loading MedNIST dataset" + str(e)
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)

    if as_dataset:
        return dataset
    else:
        return self.get_torch_dataset_shape(dataset)
modify_database_info(dataset_id, modified_dataset)

Modifies a dataset in the database.

Parameters:

- dataset_id (str, required): ID of the dataset to modify.
- modified_dataset (dict, required): New dataset description to replace the existing one.

Raises:

- FedbiomedDatasetManagerError: conflicting tags with an existing dataset.

Source code in fedbiomed/node/dataset_manager.py
def modify_database_info(self,
                         dataset_id: str,
                         modified_dataset: dict):
    """Modifies a dataset in the database.

    Args:
        dataset_id: ID of the dataset to modify.
        modified_dataset: New dataset description to replace the existing one.

    Raises:
        FedbiomedDatasetManagerError: conflicting tags with existing dataset
    """
    # Check that there is no existing dataset with conflicting tags
    if 'tags' in modified_dataset:
        conflicting = self.search_conflicting_tags(modified_dataset['tags'])

        conflicting_ids = [ c['dataset_id'] for c in conflicting ]
        # the dataset to modify is ignored (can conflict with its previous tags)
        if dataset_id in conflicting_ids:
            conflicting_ids.remove(dataset_id)

        if len(conflicting_ids) > 0:
            msg = f"{ErrorNumbers.FB322.value}, one or more registered dataset has conflicting tags: " \
                f" {' '.join([ c['name'] for c in conflicting if c['dataset_id'] != dataset_id ])}"
            logger.critical(msg)
            raise FedbiomedDatasetManagerError(msg)

    self._dataset_table.update(modified_dataset, self._database.dataset_id == dataset_id)
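
Usage sketch, updating the description and tags of an existing dataset (the id and values are placeholders):

manager.modify_database_info(
    'dataset_1234',                        # hypothetical id
    {'description': 'updated description',
     'tags': ['#clinical', '#v2']},        # checked against other datasets' tags
)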
obfuscate_private_information(database_metadata)
staticmethod

Remove privacy-sensitive information, to prepare for sharing with a researcher.

Removes any information that could be considered privacy-sensitive by the node. The typical use-case is to prevent sharing this information with a researcher through a reply message.

Parameters:

- database_metadata (Iterable[dict], required): An iterable of metadata information objects, one per dataset. Each metadata object should be in the format of key-value pairs, such as a dict.

Returns:

- Iterable[dict]: The updated iterable of metadata information objects without privacy-sensitive information.

Source code in fedbiomed/node/dataset_manager.py
@staticmethod
def obfuscate_private_information(database_metadata: Iterable[dict]) -> Iterable[dict]:
    """Remove privacy-sensitive information, to prepare for sharing with a researcher.

    Removes any information that could be considered privacy-sensitive by the node. The typical use-case is to
    prevent sharing this information with a researcher through a reply message.

    Args:
        database_metadata: an iterable of metadata information objects, one per dataset. Each metadata object
            should be in the format of key-value pairs, such as a dict.
    Returns:
         the updated iterable of metadata information objects without privacy-sensitive information
    """
    for d in database_metadata:
        try:
            # common obfuscations
            d.pop('path', None)
            # obfuscations specific for each data type
            if 'data_type' in d:
                if d['data_type'] == 'medical-folder':
                    if 'dataset_parameters' in d:
                        d['dataset_parameters'].pop('tabular_file', None)
        except AttributeError:
            raise FedbiomedDatasetManagerError(f"Object of type {type(d)} does not support pop or getitem method "
                                               f"in obfuscate_private_information.")
    return database_metadata
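
The effect can be shown on a toy metadata entry; being a static method, no instance is needed:

meta = [{'name': 'mf-data', 'data_type': 'medical-folder',
         'path': '/private/data',
         'dataset_parameters': {'tabular_file': '/private/ref.csv', 'index_col': 0}}]
clean = DatasetManager.obfuscate_private_information(meta)
# 'path' and dataset_parameters['tabular_file'] are removed:
# [{'name': 'mf-data', 'data_type': 'medical-folder',
#   'dataset_parameters': {'index_col': 0}}]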
read_csv(csv_file, index_col=None)

Gets content of a CSV file.

Reads a *.csv file and outputs its data into a pandas DataFrame. The CSV delimiter is detected automatically by parsing the first line.

Parameters:

- csv_file (str, required): File name / path.
- index_col (Union[int, None], default None): Column that contains the CSV file index.

Returns:

- pd.DataFrame: Pandas DataFrame with the data contained in the CSV file.

Source code in fedbiomed/node/dataset_manager.py
def read_csv(self, csv_file: str, index_col: Union[int, None] = None) -> pd.DataFrame:
    """Gets content of a CSV file.

    Reads a *.csv file and outputs its data into a pandas DataFrame.
    The CSV delimiter is detected automatically by parsing the first line.

    Args:
        csv_file: File name / path
        index_col: Column that contains CSV file index.
            Defaults to None.

    Returns:
        Pandas DataFrame with data contained in CSV file.
    """

    # Automatically identify separator and header
    sniffer = csv.Sniffer()
    with open(csv_file, 'r') as file:
        delimiter = sniffer.sniff(file.readline()).delimiter
        file.seek(0)
        header = 0 if sniffer.has_header(file.read()) else None

    return pd.read_csv(csv_file, index_col=index_col, sep=delimiter, header=header)
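
Because the delimiter and header are sniffed from the file itself, comma- and semicolon-separated files are both read without extra arguments. A sketch with a placeholder path:

df = manager.read_csv('/data/clinical.csv', index_col=0)  # placeholder path
print(df.dtypes)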
remove_database(tags)

Removes datasets from the database.

Only datasets matching the tags are removed.

Parameters:

- tags (Union[tuple, list], required): Dataset description tags.
Source code in fedbiomed/node/dataset_manager.py
def remove_database(self, tags: Union[tuple, list]):
    """Removes datasets from database.

    Only datasets matching the `tags` should be removed.

    Args:
        tags: Dataset description tags.
    """
    doc_ids = [doc.doc_id for doc in self.search_by_tags(tags)]
    self._dataset_table.remove(doc_ids=doc_ids)
remove_dlp_by_id(dlp_id)

Removes a data loading plan (DLP) from the database.

Only the DLP with a matching ID is removed from the database. There should be at most one.

The DataLoadingBlocks (DLBs) attached to the DLP are also removed. You should ensure they are not used by another DLP; no verification is made.

Parameters:

- dlp_id (str, required): The DataLoadingPlan id.
Source code in fedbiomed/node/dataset_manager.py
def remove_dlp_by_id(self, dlp_id: str):
    """Removes a data loading plan (DLP) from the database.

    Only DLP with matching ID is removed from the database. There should be at most one.

    The DataLoadingBlocks (DLBs) attached to the DLP are also removed. You should
    ensure they are not used by another DLP, no verification is made.

    Args:
        dlp_id: the DataLoadingPlan id
    """
    if not isinstance(dlp_id, str):
        _msg = ErrorNumbers.FB316.value + f": Bad type for dlp '{type(dlp_id)}', expecting str"
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)
    if not dlp_id:
        _msg = ErrorNumbers.FB316.value + ": Bad value for dlp, expecting non empty str"
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)

    _ , dlbs = self.get_dlp_by_id(dlp_id)
    try:
        self._dlp_table.remove(self._database.dlp_id == dlp_id)
        for dlb in dlbs:
            self._dlp_table.remove(self._database.dlb_id == dlb['dlb_id'])
    except Exception as e:
        _msg = ErrorNumbers.FB316.value + f": Error during remove of DLP {dlp_id}: {e}"
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)
save_data_loading_block(dlb)

Saves a single serialized DataLoadingBlock to the database. (This method appears to be unused.)

Source code in fedbiomed/node/dataset_manager.py
def save_data_loading_block(self, dlb: DataLoadingBlock) -> None:
    # seems unused
    self._dlp_table.insert(dlb.serialize())
save_data_loading_plan(data_loading_plan)

Save a DataLoadingPlan to the database.

This function saves a DataLoadingPlan to the database, and returns its ID.

Parameters:

- data_loading_plan (Optional[DataLoadingPlan], required): The DataLoadingPlan to be saved, or None.

Raises:

- FedbiomedDatasetManagerError: bad data loading plan name (fewer than 4 characters, or not unique).

Returns:

- Union[str, None]: The dlp_id if a DLP was saved, None otherwise.

Source code in fedbiomed/node/dataset_manager.py
def save_data_loading_plan(self,
                           data_loading_plan: Optional[DataLoadingPlan]
                           ) -> Union[str, None]:
    """Save a DataLoadingPlan to the database.

    This function saves a DataLoadingPlan to the database, and returns its ID.

    Raises:
        FedbiomedDatasetManagerError: bad data loading plan name (size, not unique)

    Args:
        data_loading_plan: the DataLoadingPlan to be saved, or None.

    Returns:
        The `dlp_id` if a DLP was saved, or None
    """
    if data_loading_plan is None:
        return None

    if len(data_loading_plan.desc) < 4:
        _msg = ErrorNumbers.FB316.value + ": Cannot save data loading plan, " + \
            "DLP name needs to have at least 4 characters."
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)

    _dlp_same_name = self._dlp_table.search(
        (self._database.dlp_id.exists()) & (self._database.dlp_name.exists()) &
        (self._database.dlp_name == data_loading_plan.desc))
    if _dlp_same_name:
        _msg = ErrorNumbers.FB316.value + ": Cannot save data loading plan, " + \
            "DLP name needs to be unique."
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)

    dlp_metadata, loading_blocks_metadata = data_loading_plan.serialize()
    self._dlp_table.insert(dlp_metadata)
    self._dlp_table.insert_multiple(loading_blocks_metadata)
    return data_loading_plan.dlp_id
search_by_tags(tags)

Searches for data with the given tags.

Parameters:

- tags (Union[tuple, list], required): List of tags.

Returns:

- list: The list of matching datasets.

Source code in fedbiomed/node/dataset_manager.py
def search_by_tags(self, tags: Union[tuple, list]) -> list:
    """Searches for data with given tags.

    Args:
        tags:  List of tags

    Returns:
        The list of matching datasets
    """
    return self._dataset_table.search(self._database.tags.all(tags))
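
The underlying query uses tags.all(tags), so a dataset matches when its registered tags contain every requested tag. Sketch with placeholder tags:

# Matches datasets whose registered tags include BOTH '#clinical' and '#csv'
results = manager.search_by_tags(['#clinical', '#csv'])
for ds in results:
    print(ds['name'], ds['tags'])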
search_conflicting_tags(tags)

Searches for registered datasets whose tags conflict with the given tags. Two tag sets conflict when one is a subset of the other.

Parameters:

- tags (Union[tuple, list], required): List of tags.

Returns:

- list: The list of conflicting datasets.

Source code in fedbiomed/node/dataset_manager.py
def search_conflicting_tags(self, tags: Union[tuple, list]) -> list:
    """Searches for registered data that have conflicting tags with the given tags

    Args:
        tags:  List of tags

    Returns:
        The list of conflicting datasets
    """
    def _conflicting_tags(val):
        return all(t in val for t in tags) or all(t in tags for t in val)


    return self._dataset_table.search(self._database.tags.test(_conflicting_tags))
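
The conflict predicate holds when one tag set is a subset of the other; it can be illustrated standalone:

def conflicting(registered, requested):
    # mirrors the _conflicting_tags predicate above
    return (all(t in registered for t in requested)
            or all(t in requested for t in registered))

print(conflicting(['#a', '#b'], ['#a']))        # True: requested is a subset
print(conflicting(['#a'], ['#a', '#b']))        # True: registered is a subset
print(conflicting(['#a', '#c'], ['#a', '#b']))  # False: partial overlap only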