fedbiomed.node.dataset_manager

Module: fedbiomed.node.dataset_manager

Interfaces with the node component database.

Classes

DatasetManager

CLASS
DatasetManager()

Interfaces with the node component database.

Facility for storing data, retrieving data and getting data info for the node. Currently uses TinyDB.

Source code in fedbiomed/node/dataset_manager.py
def __init__(self):
    """Constructor of the class.
    """
    self._db = TinyDB(environ['DB_PATH'])
    self._database = Query()

    # don't use DB read cache to ensure coherence
    # (eg when mixing CLI commands with a GUI session)
    self._dataset_table = self._db.table(name='Datasets', cache_size=0)
    self._dlp_table = self._db.table(name='Data_Loading_Plans', cache_size=0)
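
A minimal instantiation sketch. It assumes a node environment has been initialized so that environ['DB_PATH'] points to the node's TinyDB file, as used by the constructor above.

from fedbiomed.node.dataset_manager import DatasetManager

# The constructor opens the TinyDB database at environ['DB_PATH'],
# so the node configuration must be in place before this call.
manager = DatasetManager()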

Functions

add_database(name, data_type, tags, description, path=None, dataset_id=None, dataset_parameters=None, data_loading_plan=None, save_dlp=True)

Adds a new dataset contained in a file to the node's database.

Parameters:

- name (str, required): Name of the dataset.
- data_type (str, required): File extension/format of the dataset (*.csv, images, ...).
- tags (Union[tuple, list], required): Tags of the dataset.
- description (str, required): Human-readable description of the dataset.
- path (Optional[str], default None): Path to the dataset.
- dataset_id (Optional[str], default None): Id of the dataset.
- dataset_parameters (Optional[dict], default None): A dictionary of additional (customized) parameters, or None.
- data_loading_plan (Optional[DataLoadingPlan], default None): A DataLoadingPlan to be linked to this dataset, or None.
- save_dlp (bool, default True): If True, save the data_loading_plan.

Raises:

- NotImplementedError: data_type is not supported.
- FedbiomedDatasetManagerError: path does not exist or the dataset was not saved properly.

Returns:

- str: The dataset_id of the registered dataset (the one provided, or a newly generated id).

Source code in fedbiomed/node/dataset_manager.py
def add_database(self,
                 name: str,
                 data_type: str,
                 tags: Union[tuple, list],
                 description: str,
                 path: Optional[str] = None,
                 dataset_id: Optional[str] = None,
                 dataset_parameters : Optional[dict] = None,
                 data_loading_plan: Optional[DataLoadingPlan] = None,
                 save_dlp: bool = True):
    """Adds a new dataset contained in a file to node's database.

    Args:
        name: Name of the dataset
        data_type: File extension/format of the
            dataset (*.csv, images, ...)
        tags: Tags of the dataset.
        description: Human readable description of the dataset.
        path: Path to the dataset. Defaults to None.
        dataset_id: Id of the dataset. Defaults to None.
        dataset_parameters: a dictionary of additional (customized) parameters, or None
        data_loading_plan: a DataLoadingPlan to be linked to this dataset, or None
        save_dlp: if True, save the `data_loading_plan`

    Raises:
        NotImplementedError: `data_type` is not supported.
        FedbiomedDatasetManagerError: path does not exist or dataset was not saved properly.
    """
    # Accept tilde as home folder
    if path is not None:
        path = os.path.expanduser(path)

    # Check that there is no existing dataset with conflicting tags
    conflicting = self.search_conflicting_tags(tags)
    if len(conflicting) > 0:
        msg = f"{ErrorNumbers.FB322.value}, one or more registered dataset has conflicting tags: " \
            f" {' '.join([ c['name'] for c in conflicting ])}"
        logger.critical(msg)
        raise FedbiomedDatasetManagerError(msg)

    dtypes = []  # empty list for Image datasets
    data_types = ['csv', 'default', 'mednist', 'images', 'medical-folder', 'flamby']

    if data_type not in data_types:
        raise NotImplementedError(f'Data type {data_type} is not'
                                  ' a compatible data type. '
                                  f'Compatible data types are: {data_types}')

    elif data_type == 'flamby':
        # check that data loading plan is present and well formed
        if data_loading_plan is None or \
                FlambyLoadingBlockTypes.FLAMBY_DATASET_METADATA not in data_loading_plan:
            msg = f"{ErrorNumbers.FB316.value}. A DataLoadingPlan containing " \
                  f"{FlambyLoadingBlockTypes.FLAMBY_DATASET_METADATA.value} is required for adding a FLamby dataset " \
                  f"to the database."
            logger.critical(msg)
            raise FedbiomedDatasetManagerError(msg)

        # initialize a dataset and link to the flamby data. If all goes well, compute shape.
        try:
            dataset = FlambyDataset()
            dataset.set_dlp(data_loading_plan)  # initializes fed_class as a side effect
        except FedbiomedError as e:
            raise FedbiomedDatasetManagerError(f"Can not create FLamby dataset. {e}")
        else:
            shape = dataset.shape()

    if data_type == 'default':
        assert os.path.isdir(path), f'Folder {path} for Default Dataset does not exist.'
        shape = self.load_default_database(name, path)

    elif data_type == 'mednist':
        assert os.path.isdir(path), f'Folder {path} for MedNIST Dataset does not exist.'
        shape = self.load_mednist_database(path)
        path = os.path.join(path, 'MedNIST')

    elif data_type == 'csv':
        assert os.path.isfile(path), f'Path provided ({path}) does not correspond to a CSV file.'
        dataset = self.load_csv_dataset(path)
        shape = dataset.shape
        dtypes = self.get_csv_data_types(dataset)

    elif data_type == 'images':
        assert os.path.isdir(path), f'Folder {path} for Images Dataset does not exist.'
        shape = self.load_images_dataset(path)

    elif data_type == 'medical-folder':
        if not os.path.isdir(path):
            raise FedbiomedDatasetManagerError(f'Folder {path} for Medical Folder Dataset does not exist.')

        if "tabular_file" not in dataset_parameters:
            logger.info("Medical Folder Dataset will be loaded without reference/demographics data.")
        else:
            if not os.path.isfile(dataset_parameters['tabular_file']):
                raise FedbiomedDatasetManagerError(f'Path {dataset_parameters["tabular_file"]} does not '
                                                   f'correspond to a file.')
            if "index_col" not in dataset_parameters:
                raise FedbiomedDatasetManagerError('Index column is not provided')

        try:
            # load using the MedicalFolderController to ensure all available modalities are inspected
            controller = MedicalFolderController(root=path)
            if data_loading_plan is not None:
                controller.set_dlp(data_loading_plan)
            dataset = controller.load_MedicalFolder(tabular_file=dataset_parameters.get('tabular_file', None),
                                                    index_col=dataset_parameters.get('index_col', None))

        except FedbiomedError as e:
            raise FedbiomedDatasetManagerError(f"Can not create Medical Folder dataset. {e}")
        else:
            shape = dataset.shape()

        # try to read one sample and raise if it doesn't work
        try:
            _ = dataset.get_nontransformed_item(0)
        except Exception as e:
            raise FedbiomedDatasetManagerError(f'Medical Folder Dataset was not saved properly and '
                                               f'cannot be read. {e}')

    if not dataset_id:
        dataset_id = 'dataset_' + str(uuid.uuid4())

    new_database = dict(name=name, data_type=data_type, tags=tags,
                        description=description, shape=shape,
                        path=path, dataset_id=dataset_id, dtypes=dtypes,
                        dataset_parameters=dataset_parameters)
    if save_dlp:
        dlp_id = self.save_data_loading_plan(data_loading_plan)
    elif isinstance(data_loading_plan, DataLoadingPlan):
        dlp_id = data_loading_plan.dlp_id
    else:
        dlp_id = None
    if dlp_id is not None:
        new_database['dlp_id'] = dlp_id
    self._dataset_table.insert(new_database)

    return dataset_id
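
A hedged usage sketch: registering a local CSV file. The name, tags, and path below are hypothetical placeholders, and manager is assumed to be a DatasetManager instance.

# Register a CSV dataset; a dataset_id is generated when none is given.
dataset_id = manager.add_database(
    name='clinical-data',
    data_type='csv',
    tags=['#clinical', '#csv'],
    description='Synthetic clinical records for testing',
    path='~/data/clinical.csv',  # placeholder; tilde is expanded by the method
)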
get_by_id(dataset_id)

Searches for a dataset with a given dataset_id.

Parameters:

- dataset_id (str, required): A dataset id.

Returns:

- Union[dict, None]: A dict containing the dataset's description if a dataset with this dataset_id exists in the database; None if no such dataset exists.

Source code in fedbiomed/node/dataset_manager.py
def get_by_id(self, dataset_id: str) -> Union[dict, None]:
    """Searches for a dataset with given dataset_id.

    Args:
        dataset_id:  A dataset id

    Returns:
        A `dict` containing the dataset's description if a dataset with this `dataset_id`
        exists in the database. `None` if no such dataset exists in the database. 
    """
    result = self._dataset_table.get(self._database.dataset_id == dataset_id)

    return result
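
A short usage sketch (the id is a placeholder, manager a DatasetManager instance):

dataset = manager.get_by_id('dataset_1234')  # hypothetical id
if dataset is None:
    print('no dataset with this id')
else:
    print(dataset['name'], dataset['tags'])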
get_csv_data_types(dataset)

Gets the data types of each variable in the dataset.

Parameters:

- dataset (pd.DataFrame, required): A Pandas DataFrame.

Returns:

- List[str]: A list of strings containing the data types.

Source code in fedbiomed/node/dataset_manager.py
def get_csv_data_types(self, dataset: pd.DataFrame) -> List[str]:
    """Gets data types of each variable in dataset.

    Args:
        dataset: A Pandas dataset.

    Returns:
        A list of strings containing data types.
    """
    types = [str(t) for t in dataset.dtypes]

    return types
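
Since the method simply stringifies the pandas dtypes, its behaviour can be illustrated with a toy DataFrame (manager is assumed to be a DatasetManager instance):

import pandas as pd

df = pd.DataFrame({'age': [34, 52], 'weight': [71.5, 80.2], 'sex': ['F', 'M']})
print(manager.get_csv_data_types(df))
# ['int64', 'float64', 'object']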
get_data_loading_blocks_by_ids(dlb_ids)

Search for a list of DataLoadingBlockTypes, each corresponding to one given id.

Note that in case of conflicting ids (which should not happen), this function will silently return a random one with the sought id.

DataLoadingBlock IDs always start with 'serialized_data_loading_block_' and should be unique in the database.

Parameters:

- dlb_ids (List[str], required): A list of DataLoadingBlock IDs.

Returns:

- List[dict]: A list of dictionaries, each containing the DataLoadingBlock metadata corresponding to one given id.

Source code in fedbiomed/node/dataset_manager.py
def get_data_loading_blocks_by_ids(self, dlb_ids: List[str]) -> List[dict]:
    """Search for a list of DataLoadingBlockTypes, each corresponding to one given id.

    Note that in case of conflicting ids (which should not happen), this function will silently return a random
    one with the sought id.

    DataLoadingBlock IDs always start with 'serialized_data_loading_block_' and should be unique in the database.

    Args:
        dlb_ids: (List[str]) a list of DataLoadingBlock IDs

    Returns:
        A list of dictionaries, each one containing the DataLoadingBlock metadata corresponding to one given id.
    """
    return self._dlp_table.search(self._database.dlb_id.one_of(dlb_ids))
get_dlp_by_id(dlp_id)

Search for a DataLoadingPlan with a given id.

Note that in case of conflicting ids (which should not happen), this function will silently return a random one with the sought id.

DataLoadingPlan IDs always start with 'dlp_' and should be unique in the database.

Parameters:

- dlp_id (str, required): The DataLoadingPlan id.

Returns:

- Tuple[dict, List[dict]]: A tuple containing a dictionary with the DataLoadingPlan metadata corresponding to the given id, and the list of metadata dictionaries of its attached DataLoadingBlocks.

Source code in fedbiomed/node/dataset_manager.py
def get_dlp_by_id(self, dlp_id: str) -> Tuple[dict, List[dict]]:
    """Search for a DataLoadingPlan with a given id.

    Note that in case of conflicting ids (which should not happen), this function will silently return a random
    one with the sought id.

    DataLoadingPlan IDs always start with 'dlp_' and should be unique in the database.

    Args:
        dlp_id: (str) the DataLoadingPlan id

    Returns:
        A Tuple containing a dictionary with the DataLoadingPlan metadata corresponding to the given id.
    """
    dlp_metadata = self._dlp_table.get(self._database.dlp_id == dlp_id)
    return dlp_metadata, self._dlp_table.search(
        self._database.dlb_id.one_of(dlp_metadata['loading_blocks'].values()))
get_torch_dataset_shape(dataset)

Gets info about dataset shape.

Parameters:

- dataset (torch.utils.data.Dataset, required): A PyTorch dataset.

Returns:

- List[int]: A list of int containing [<nb_of_data>, <dimension_of_first_input_data>]. Example for MNIST: [60000, 1, 28, 28], where <nb_of_data>=60000 and <dimension_of_first_input_data>=1, 28, 28.

Source code in fedbiomed/node/dataset_manager.py
def get_torch_dataset_shape(self, dataset: torch.utils.data.Dataset) -> List[int]:
    """Gets info about dataset shape.

    Args:
        dataset: A Pytorch dataset

    Returns:
        A list of int containing
            [<nb_of_data>, <dimension_of_first_input_data>].
            Example for MNIST: [60000, 1, 28, 28], where <nb_of_data>=60000
            and <dimension_of_first_input_data>=1, 28, 28
    """
    return [len(dataset)] + list(dataset[0][0].shape)
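
The returned list is the dataset length prepended to the shape of the first sample. A sketch with torchvision's MNIST (the download path is a placeholder, manager a DatasetManager instance):

from torchvision import datasets, transforms

mnist = datasets.MNIST(root='/tmp/mnist', download=True,
                       transform=transforms.ToTensor())
print(manager.get_torch_dataset_shape(mnist))
# [60000, 1, 28, 28]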
list_dlp(target_dataset_type=None)

Return all existing DataLoadingPlans.

Parameters:

- target_dataset_type (Optional[str], default None): If specified, return only DLPs matching the requested target type.

Raises:

- FedbiomedDatasetManagerError: target_dataset_type is not a str, or is not one of the values defined in fedbiomed.common.constants.DatasetTypes.

Returns:

- List[dict]: A list of dicts, each one representing a DataLoadingPlan.

Source code in fedbiomed/node/dataset_manager.py
def list_dlp(self, target_dataset_type: Optional[str] = None) -> List[dict]:
    """Return all existing DataLoadingPlans.

    Args:
        target_dataset_type: (str or None) if specified, return only dlps matching the requested target type.

    Returns:
        An array of dict, each dict is a DataLoadingPlan
    """
    if target_dataset_type is not None:
        if not isinstance(target_dataset_type, str):
            raise FedbiomedDatasetManagerError(f"Wrong input type for target_dataset_type. "
                                               f"Expected str, got {type(target_dataset_type)} instead.")
        if target_dataset_type not in [t.value for t in DatasetTypes]:
            raise FedbiomedDatasetManagerError("target_dataset_type should be of the values defined in "
                                               "fedbiomed.common.constants.DatasetTypes")

        dlps = self._dlp_table.search(
            (self._database.dlp_id.exists()) &
            (self._database.dlp_name.exists()) &
            (self._database.target_dataset_type == target_dataset_type))
    else:
        dlps = self._dlp_table.search(
            (self._database.dlp_id.exists()) & (self._database.dlp_name.exists()))
    return [dict(dlp) for dlp in dlps]
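
Usage sketch. The filter value must be one of the fedbiomed.common.constants.DatasetTypes values; 'medical-folder' is assumed here to be one of them.

# All saved DLPs
all_dlps = manager.list_dlp()

# Only DLPs targeting a given dataset type (assumed valid DatasetTypes value)
mf_dlps = manager.list_dlp(target_dataset_type='medical-folder')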
list_my_data(verbose=True)

Lists all datasets on the node.

Parameters:

- verbose (bool, default True): Give verbose output.

Returns:

- List[dict]: All datasets in the node's database.

Source code in fedbiomed/node/dataset_manager.py
def list_my_data(self, verbose: bool = True) -> List[dict]:
    """Lists all datasets on the node.

    Args:
        verbose: Give verbose output. Defaults to True.

    Returns:
        All datasets in the node's database.
    """
    my_data = self._dataset_table.all()

    # Do not display dtypes
    for doc in my_data:
        doc.pop('dtypes')

    if verbose:
        print(tabulate(my_data, headers='keys'))

    return my_data
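
Usage sketch (note that the dtypes field is removed from the returned entries):

datasets_info = manager.list_my_data(verbose=False)
for info in datasets_info:
    print(info['dataset_id'], info['tags'])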
load_as_dataloader(dataset)

Loads content of an image dataset.

Parameters:

- dataset (dict, required): Description of the dataset.

Returns:

- torch.utils.data.Dataset: Content of the dataset.

Source code in fedbiomed/node/dataset_manager.py
def load_as_dataloader(self, dataset: dict) -> torch.utils.data.Dataset:
    """Loads content of an image dataset.

    Args:
        dataset: Description of the dataset.

    Returns:
        Content of the dataset.
    """
    name = dataset['data_type']
    if name == 'default':
        return self.load_default_database(name=dataset['name'],
                                          path=dataset['path'],
                                          as_dataset=True)
    elif name == 'images':
        return self.load_images_dataset(folder_path=dataset['path'],
                                        as_dataset=True)
load_csv_dataset(path)

Loads a CSV dataset.

Parameters:

- path (str, required): Path to the CSV file.

Returns:

- pd.DataFrame: Pandas DataFrame with the content of the file.

Source code in fedbiomed/node/dataset_manager.py
def load_csv_dataset(self, path: str) -> pd.DataFrame:
    """Loads a CSV dataset.

    Args:
        path: Path to the CSV file.

    Returns:
        Pandas DataFrame with the content of the file.
    """
    return self.read_csv(path)
load_data(tags, mode)

Loads content of a dataset.

Parameters:

- tags (Union[tuple, list], required): Tags describing the dataset to load.
- mode (str, required): Return format for the dataset content.

Raises:

- NotImplementedError: mode is not implemented yet.

Returns:

- Any: Content of the dataset. Its type depends on the mode and dataset.

Source code in fedbiomed/node/dataset_manager.py
def load_data(self, tags: Union[tuple, list], mode: str) -> Any:
    """Loads content of a dataset.

    Args:
        tags: Tags describing the dataset to load.
        mode: Return format for the dataset content.

    Raises:
        NotImplementedError: `mode` is not implemented yet.

    Returns:
        Content of the dataset. Its type depends on the `mode` and dataset.
    """

    # Verify that the mode is available
    mode = mode.lower()
    modes = ['pandas', 'torch_dataset', 'torch_tensor', 'numpy']
    if mode not in modes:
        raise NotImplementedError(f'Data mode `{mode}` was not found.'
                                  f' Data modes available: {modes}')

    # Look for dataset in database
    matching = self.search_by_tags(tags)
    assert len(matching) > 0, f'Dataset with tags {tags} was not found.'
    dataset = matching[0]

    dataset_path = dataset['path']
    # If the path is a file, try to read it as a CSV file
    if os.path.isfile(dataset_path):
        df = self.read_csv(dataset_path, index_col=0)

        # Load data as requested
        if mode == 'pandas':
            return df
        elif mode == 'numpy':
            return df._get_numeric_data().values
        elif mode == 'torch_tensor':
            return torch.from_numpy(df._get_numeric_data().values)

    elif os.path.isdir(dataset_path):
        if mode == 'torch_dataset':
            return self.load_as_dataloader(dataset)
        elif mode == 'torch_tensor':
            raise NotImplementedError('We are working on this'
                                      ' implementation!')
        elif mode == 'numpy':
            raise NotImplementedError('We are working on this'
                                      ' implementation!')
        else:
            raise NotImplementedError(f'Mode `{mode}` has not been'
                                      ' implemented on this version.')
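
A hedged sketch of loading a previously registered dataset in different modes; the tags are placeholders and manager is a DatasetManager instance.

# Dataset whose path is a CSV file: 'pandas', 'numpy' and 'torch_tensor' work.
df = manager.load_data(['#clinical', '#csv'], mode='pandas')        # pd.DataFrame
arr = manager.load_data(['#clinical', '#csv'], mode='numpy')        # numeric columns only
t = manager.load_data(['#clinical', '#csv'], mode='torch_tensor')   # torch.Tensor

# Dataset whose path is a folder: only 'torch_dataset' is implemented.
image_dataset = manager.load_data(['#images'], mode='torch_dataset')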
load_default_database(name, path, as_dataset=False)

Loads a default dataset.

Currently, only MNIST dataset is used as the default dataset.

Parameters:

- name (str, required): Name of the default dataset. Currently, only MNIST is accepted.
- path (str, required): Path to the MNIST dataset.
- as_dataset (bool, default False): Whether to return the complete dataset (True) or the dataset dimensions (False).

Raises:

- NotImplementedError: name does not match the name of a default dataset.

Returns:

- Union[List[int], torch.utils.data.Dataset]: Depends on the value of the parameter as_dataset: if set to True, returns the dataset (type: torch.utils.data.Dataset); if set to False, returns the size of the dataset stored inside a list (type: List[int]).

Source code in fedbiomed/node/dataset_manager.py
def load_default_database(self,
                          name: str,
                          path: str,
                          as_dataset: bool = False) -> Union[List[int],
                                                             torch.utils.data.Dataset]:
    """Loads a default dataset.

    Currently, only MNIST dataset is used as the default dataset.

    Args:
        name: Name of the default dataset. Currently,
            only MNIST is accepted.
        path: Path to the MNIST dataset.
        as_dataset: Whether to return
            the complete dataset (True) or dataset dimensions (False).
            Defaults to False.

    Raises:
        NotImplementedError: Name is not matching with
            the name of a default dataset.

    Returns:
        Depends on the value of the parameter `as_dataset`: If
        set to True,  returns dataset (type: torch.utils.data.Dataset).
        If set to False, returns the size of the dataset stored inside
        a list (type: List[int]).
    """
    kwargs = dict(root=path, download=True, transform=transforms.ToTensor())

    if 'mnist' in name.lower():
        dataset = datasets.MNIST(**kwargs)
    else:
        raise NotImplementedError(f'Default dataset `{name}` has'
                                  ' not been implemented.')
    if as_dataset:
        return dataset
    else:
        return self.get_torch_dataset_shape(dataset)
load_images_dataset(folder_path, as_dataset=False)

Loads an image dataset.

Parameters:

- folder_path (str, required): Path to the directory containing the images.
- as_dataset (bool, default False): Whether to return the complete dataset (True) or the dataset dimensions (False).

Returns:

- Union[List[int], torch.utils.data.Dataset]: Depends on the value of the parameter as_dataset: if set to True, returns the dataset (type: torch.utils.data.Dataset); if set to False, returns the size of the dataset stored inside a list (type: List[int]).

Source code in fedbiomed/node/dataset_manager.py
def load_images_dataset(self,
                        folder_path: str,
                        as_dataset: bool = False) -> Union[List[int],
                                                           torch.utils.data.Dataset]:
    """Loads an image dataset.

    Args:
        folder_path: Path to the directory containing the images.
        as_dataset: Whether to return
            the complete dataset (True) or dataset dimensions (False).
            Defaults to False.

    Returns:
        Depends on the value of the parameter `as_dataset`: If
        set to True,  returns dataset (type: torch.utils.data.Dataset).
        If set to False, returns the size of the dataset stored inside
        a list (type: List[int])
    """
    try:
        dataset = datasets.ImageFolder(folder_path,
                                       transform=transforms.ToTensor())
    except Exception as e:
        _msg = ErrorNumbers.FB315.value +\
            "\nThe following error was raised while loading dataset from the selected" \
            " path:  " + str(e) + "\nPlease make sure that the selected folder is not empty \
            and doesn't have any empty class folder"
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)

    if as_dataset:
        return dataset
    else:
        return self.get_torch_dataset_shape(dataset)
load_mednist_database(path, as_dataset=False)

Loads the MedNist dataset.

Parameters:

- path (str, required): Path where a local copy of the MedNIST dataset is stored (downloaded if absent).
- as_dataset (bool, default False): Whether to return the complete dataset (True) or the dataset dimensions (False).

Raises:

- FedbiomedDatasetManagerError: one of the following cases:
  - the tarfile cannot be downloaded
  - the downloaded tarfile cannot be extracted
  - the MedNIST path is empty
  - one of the class paths is empty

Returns:

- Union[List[int], torch.utils.data.Dataset]: Depends on the value of the parameter as_dataset: if set to True, returns the dataset (type: torch.utils.data.Dataset); if set to False, returns the size of the dataset stored inside a list (type: List[int]).

Source code in fedbiomed/node/dataset_manager.py
def load_mednist_database(self,
                          path: str,
                          as_dataset: bool = False) -> Union[List[int],
                                                             torch.utils.data.Dataset]:
    """Loads the MedNist dataset.

    Args:
        path: Path where a local copy of the MedNIST dataset is stored (downloaded if absent).
        as_dataset: Whether to return
            the complete dataset (True) or dataset dimensions (False).
            Defaults to False.

    Raises:
        FedbiomedDatasetManagerError: One of the following cases:

            - tarfile cannot be downloaded
            - downloaded tarfile cannot
                be extracted
            - MedNIST path is empty
            - one of the classes path is empty

    Returns:
        Depends on the value of the parameter `as_dataset`: If
        set to True,  returns dataset (type: torch.utils.data.Dataset).
        If set to False, returns the size of the dataset stored inside
        a list (type: List[int])
    """
    download_path = os.path.join(path, 'MedNIST')
    if not os.path.isdir(download_path):
        url = "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/MedNIST.tar.gz"
        filepath = os.path.join(path, 'MedNIST.tar.gz')
        try:
            logger.info("Now downloading MEDNIST...")
            urlretrieve(url, filepath)
            with tarfile.open(filepath) as tar_file:
                logger.info("Now extracting MEDNIST...")
                tar_file.extractall(path)
            os.remove(filepath)

        except (URLError, HTTPError, ContentTooShortError, OSError, tarfile.TarError,
                MemoryError) as e:
            _msg = ErrorNumbers.FB315.value + "\nThe following error was raised while downloading MedNIST dataset"\
                + "from the MONAI repo:  " + str(e)
            logger.error(_msg)
            raise FedbiomedDatasetManagerError(_msg)

    try:
        dataset = datasets.ImageFolder(download_path,
                                       transform=transforms.ToTensor())

    except (FileNotFoundError, RuntimeError) as e:
        _msg = ErrorNumbers.FB315.value + "\nThe following error was raised while loading MedNIST dataset from"\
            "the selected path:  " + str(e) + "\nPlease make sure that the selected MedNIST folder is not empty \
               or choose another path."
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)

    except Exception as e:
        _msg = ErrorNumbers.FB315.value + "\nThe following error was raised while loading MedNIST dataset" + str(e)
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)

    if as_dataset:
        return dataset
    else:
        return self.get_torch_dataset_shape(dataset)
modify_database_info(dataset_id, modified_dataset)

Modifies a dataset in the database.

Parameters:

- dataset_id (str, required): ID of the dataset to modify.
- modified_dataset (dict, required): New dataset description to replace the existing one.

Raises:

- FedbiomedDatasetManagerError: conflicting tags with an existing dataset.

Source code in fedbiomed/node/dataset_manager.py
def modify_database_info(self,
                         dataset_id: str,
                         modified_dataset: dict):
    """Modifies a dataset in the database.

    Args:
        dataset_id: ID of the dataset to modify.
        modified_dataset: New dataset description to replace the existing one.

    Raises:
        FedbiomedDatasetManagerError: conflicting tags with existing dataset
    """
    # Check that there is no existing dataset with conflicting tags
    if 'tags' in modified_dataset:
        conflicting = self.search_conflicting_tags(modified_dataset['tags'])

        conflicting_ids = [ c['dataset_id'] for c in conflicting ]
        # the dataset to modify is ignored (can conflict with its previous tags)
        if dataset_id in conflicting_ids:
            conflicting_ids.remove(dataset_id)

        if len(conflicting_ids) > 0:
            msg = f"{ErrorNumbers.FB322.value}, one or more registered dataset has conflicting tags: " \
                f" {' '.join([ c['name'] for c in conflicting if c['dataset_id'] != dataset_id ])}"
            logger.critical(msg)
            raise FedbiomedDatasetManagerError(msg)

    self._dataset_table.update(modified_dataset, self._database.dataset_id == dataset_id)
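
Usage sketch, updating the description and tags of an existing dataset (the id and values are placeholders):

manager.modify_database_info(
    'dataset_1234',                        # hypothetical id
    {'description': 'updated description',
     'tags': ['#clinical', '#v2']},        # checked against other datasets' tags
)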
obfuscate_private_information(database_metadata)
staticmethod

Remove privacy-sensitive information, to prepare for sharing with a researcher.

Removes any information that could be considered privacy-sensitive by the node. The typical use-case is to prevent sharing this information with a researcher through a reply message.

Parameters:

- database_metadata (Iterable[dict], required): An iterable of metadata information objects, one per dataset. Each metadata object should be in the format of key-value pairs, such as a dict.

Returns:

- Iterable[dict]: The updated iterable of metadata information objects without privacy-sensitive information.

Source code in fedbiomed/node/dataset_manager.py
@staticmethod
def obfuscate_private_information(database_metadata: Iterable[dict]) -> Iterable[dict]:
    """Remove privacy-sensitive information, to prepare for sharing with a researcher.

    Removes any information that could be considered privacy-sensitive by the node. The typical use-case is to
    prevent sharing this information with a researcher through a reply message.

    Args:
        database_metadata: an iterable of metadata information objects, one per dataset. Each metadata object
            should be in the format of key-value pairs, such as a dict.
    Returns:
         the updated iterable of metadata information objects without privacy-sensitive information
    """
    for d in database_metadata:
        try:
            # common obfuscations
            d.pop('path', None)
            # obfuscations specific for each data type
            if 'data_type' in d:
                if d['data_type'] == 'medical-folder':
                    if 'dataset_parameters' in d:
                        d['dataset_parameters'].pop('tabular_file', None)
        except AttributeError:
            raise FedbiomedDatasetManagerError(f"Object of type {type(d)} does not support pop or getitem method "
                                               f"in obfuscate_private_information.")
    return database_metadata
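
The effect can be shown on a toy metadata entry; being a static method, no instance is needed:

meta = [{'name': 'mf-data', 'data_type': 'medical-folder',
         'path': '/private/data',
         'dataset_parameters': {'tabular_file': '/private/ref.csv', 'index_col': 0}}]
clean = DatasetManager.obfuscate_private_information(meta)
# 'path' and dataset_parameters['tabular_file'] are removed:
# [{'name': 'mf-data', 'data_type': 'medical-folder',
#   'dataset_parameters': {'index_col': 0}}]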
read_csv(csv_file, index_col=None)

Gets content of a CSV file.

Reads a *.csv file and outputs its data into a pandas DataFrame. The CSV delimiter is detected automatically by parsing the first line.

Parameters:

- csv_file (str, required): File name / path.
- index_col (Union[int, None], default None): Column that contains the CSV file index.

Returns:

- pd.DataFrame: Pandas DataFrame with the data contained in the CSV file.

Source code in fedbiomed/node/dataset_manager.py
def read_csv(self, csv_file: str, index_col: Union[int, None] = None) -> pd.DataFrame:
    """Gets content of a CSV file.

    Reads a *.csv file and outputs its data into a pandas DataFrame.
    The CSV delimiter is detected automatically by parsing the first line.

    Args:
        csv_file: File name / path
        index_col: Column that contains CSV file index.
            Defaults to None.

    Returns:
        Pandas DataFrame with data contained in CSV file.
    """

    # Automatically identify separator and header
    sniffer = csv.Sniffer()
    with open(csv_file, 'r') as file:
        delimiter = sniffer.sniff(file.readline()).delimiter
        file.seek(0)
        header = 0 if sniffer.has_header(file.read()) else None

    return pd.read_csv(csv_file, index_col=index_col, sep=delimiter, header=header)
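
Because the delimiter and header are sniffed from the file itself, comma- and semicolon-separated files are both read without extra arguments. A sketch with a placeholder path:

df = manager.read_csv('/data/clinical.csv', index_col=0)  # placeholder path
print(df.dtypes)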
remove_database(tags)

Removes datasets from the database.

Only datasets matching the tags are removed.

Parameters:

- tags (Union[tuple, list], required): Dataset description tags.
Source code in fedbiomed/node/dataset_manager.py
def remove_database(self, tags: Union[tuple, list]):
    """Removes datasets from database.

    Only datasets matching the `tags` should be removed.

    Args:
        tags: Dataset description tags.
    """
    doc_ids = [doc.doc_id for doc in self.search_by_tags(tags)]
    self._dataset_table.remove(doc_ids=doc_ids)
remove_dlp_by_id(dlp_id)

Removes a data loading plan (DLP) from the database.

Only the DLP with a matching ID is removed from the database. There should be at most one.

The DataLoadingBlocks (DLBs) attached to the DLP are also removed. You should ensure they are not used by another DLP; no verification is made.

Parameters:

- dlp_id (str, required): The DataLoadingPlan id.
Source code in fedbiomed/node/dataset_manager.py
def remove_dlp_by_id(self, dlp_id: str):
    """Removes a data loading plan (DLP) from the database.

    Only DLP with matching ID is removed from the database. There should be at most one.

    The DataLoadingBlocks (DLBs) attached to the DLP are also removed. You should
    ensure they are not used by another DLP, no verification is made.

    Args:
        dlp_id: the DataLoadingPlan id
    """
    if not isinstance(dlp_id, str):
        _msg = ErrorNumbers.FB316.value + f": Bad type for dlp '{type(dlp_id)}', expecting str"
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)
    if not dlp_id:
        _msg = ErrorNumbers.FB316.value + ": Bad value for dlp, expecting non empty str"
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)

    _ , dlbs = self.get_dlp_by_id(dlp_id)
    try:
        self._dlp_table.remove(self._database.dlp_id == dlp_id)
        for dlb in dlbs:
            self._dlp_table.remove(self._database.dlb_id == dlb['dlb_id'])
    except Exception as e:
        _msg = ErrorNumbers.FB316.value + f": Error during remove of DLP {dlp_id}: {e}"
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)
save_data_loading_block(dlb)

Saves a single serialized DataLoadingBlock to the database. (This method appears to be unused.)

Source code in fedbiomed/node/dataset_manager.py
def save_data_loading_block(self, dlb: DataLoadingBlock) -> None:
    # seems unused
    self._dlp_table.insert(dlb.serialize())
save_data_loading_plan(data_loading_plan)

Save a DataLoadingPlan to the database.

This function saves a DataLoadingPlan to the database, and returns its ID.

Parameters:

- data_loading_plan (Optional[DataLoadingPlan], required): The DataLoadingPlan to be saved, or None.

Raises:

- FedbiomedDatasetManagerError: bad data loading plan name (fewer than 4 characters, or not unique).

Returns:

- Union[str, None]: The dlp_id if a DLP was saved, None otherwise.

Source code in fedbiomed/node/dataset_manager.py
def save_data_loading_plan(self,
                           data_loading_plan: Optional[DataLoadingPlan]
                           ) -> Union[str, None]:
    """Save a DataLoadingPlan to the database.

    This function saves a DataLoadingPlan to the database, and returns its ID.

    Raises:
        FedbiomedDatasetManagerError: bad data loading plan name (size, not unique)

    Args:
        data_loading_plan: the DataLoadingPlan to be saved, or None.

    Returns:
        The `dlp_id` if a DLP was saved, or None
    """
    if data_loading_plan is None:
        return None

    if len(data_loading_plan.desc) < 4:
        _msg = ErrorNumbers.FB316.value + ": Cannot save data loading plan, " + \
            "DLP name needs to have at least 4 characters."
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)

    _dlp_same_name = self._dlp_table.search(
        (self._database.dlp_id.exists()) & (self._database.dlp_name.exists()) &
        (self._database.dlp_name == data_loading_plan.desc))
    if _dlp_same_name:
        _msg = ErrorNumbers.FB316.value + ": Cannot save data loading plan, " + \
            "DLP name needs to be unique."
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)

    dlp_metadata, loading_blocks_metadata = data_loading_plan.serialize()
    self._dlp_table.insert(dlp_metadata)
    self._dlp_table.insert_multiple(loading_blocks_metadata)
    return data_loading_plan.dlp_id
search_by_tags(tags)

Searches for data with the given tags.

Parameters:

- tags (Union[tuple, list], required): List of tags.

Returns:

- list: The list of matching datasets.

Source code in fedbiomed/node/dataset_manager.py
def search_by_tags(self, tags: Union[tuple, list]) -> list:
    """Searches for data with given tags.

    Args:
        tags:  List of tags

    Returns:
        The list of matching datasets
    """
    return self._dataset_table.search(self._database.tags.all(tags))
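
The underlying query uses tags.all(tags), so a dataset matches when its registered tags contain every requested tag. Sketch with placeholder tags:

# Matches datasets whose registered tags include BOTH '#clinical' and '#csv'
results = manager.search_by_tags(['#clinical', '#csv'])
for ds in results:
    print(ds['name'], ds['tags'])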
search_conflicting_tags(tags)

Searches for registered datasets whose tags conflict with the given tags. Two tag sets conflict when one is a subset of the other.

Parameters:

- tags (Union[tuple, list], required): List of tags.

Returns:

- list: The list of conflicting datasets.

Source code in fedbiomed/node/dataset_manager.py
def search_conflicting_tags(self, tags: Union[tuple, list]) -> list:
    """Searches for registered data that have conflicting tags with the given tags

    Args:
        tags:  List of tags

    Returns:
        The list of conflicting datasets
    """
    def _conflicting_tags(val):
        return all(t in val for t in tags) or all(t in tags for t in val)


    return self._dataset_table.search(self._database.tags.test(_conflicting_tags))
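
The conflict predicate holds when one tag set is a subset of the other; it can be illustrated standalone:

def conflicting(registered, requested):
    # mirrors the _conflicting_tags predicate above
    return (all(t in registered for t in requested)
            or all(t in requested for t in registered))

print(conflicting(['#a', '#b'], ['#a']))        # True: requested is a subset
print(conflicting(['#a'], ['#a', '#b']))        # True: registered is a subset
print(conflicting(['#a', '#c'], ['#a', '#b']))  # False: partial overlap only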