fedbiomed.node.dataset_manager
Module: fedbiomed.node.dataset_manager
Interfaces with the node component database.
Classes
DatasetManager
DatasetManager()
Interfaces with the node component database.
Facility for storing data, retrieving data and getting data info for the node. Currently uses TinyDB.
Source code in fedbiomed/node/dataset_manager.py
def __init__(self):
    """Constructor of the class."""
    self._db = TinyDB(environ['DB_PATH'])
    self._database = Query()
    # don't use DB read cache to ensure coherence
    # (eg when mixing CLI commands with a GUI session)
    self._dataset_table = self._db.table(name='Datasets', cache_size=0)
    self._dlp_table = self._db.table(name='Data_Loading_Plans', cache_size=0)
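Example: a minimal usage sketch. Instantiating the manager assumes a properly configured node environment, since the constructor reads the database location from environ['DB_PATH'].
from fedbiomed.node.dataset_manager import DatasetManager

# Opens the TinyDB file pointed to by environ['DB_PATH']
manager = DatasetManager()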
Functions
add_database(name, data_type, tags, description, path=None, dataset_id=None, dataset_parameters=None, data_loading_plan=None, save_dlp=True)
Adds a new dataset contained in a file to node's database.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
name | str | Name of the dataset | required |
data_type | str | File extension/format of the dataset (*.csv, images, ...) | required |
tags | Union[tuple, list] | Tags of the dataset. | required |
description | str | Human readable description of the dataset. | required |
path | Optional[str] | Path to the dataset. Defaults to None. | None |
dataset_id | Optional[str] | Id of the dataset. Defaults to None. | None |
dataset_parameters | Optional[dict] | a dictionary of additional (customized) parameters, or None | None |
data_loading_plan | Optional[DataLoadingPlan] | a DataLoadingPlan to be linked to this dataset, or None | None |
save_dlp | bool | if True, save the `data_loading_plan` | True |
Raises:
Type | Description |
---|---|
NotImplementedError | `data_type` is not supported. |
FedbiomedDatasetManagerError | path does not exist or dataset was not saved properly. |
Source code in fedbiomed/node/dataset_manager.py
def add_database(self,
                 name: str,
                 data_type: str,
                 tags: Union[tuple, list],
                 description: str,
                 path: Optional[str] = None,
                 dataset_id: Optional[str] = None,
                 dataset_parameters: Optional[dict] = None,
                 data_loading_plan: Optional[DataLoadingPlan] = None,
                 save_dlp: bool = True):
    """Adds a new dataset contained in a file to node's database.

    Args:
        name: Name of the dataset
        data_type: File extension/format of the
            dataset (*.csv, images, ...)
        tags: Tags of the dataset.
        description: Human readable description of the dataset.
        path: Path to the dataset. Defaults to None.
        dataset_id: Id of the dataset. Defaults to None.
        dataset_parameters: a dictionary of additional (customized) parameters, or None
        data_loading_plan: a DataLoadingPlan to be linked to this dataset, or None
        save_dlp: if True, save the `data_loading_plan`

    Raises:
        NotImplementedError: `data_type` is not supported.
        FedbiomedDatasetManagerError: path does not exist or dataset was not saved properly.
    """
    # Accept tilde as home folder
    if path is not None:
        path = os.path.expanduser(path)

    # Check that there are no existing datasets with the same tags
    assert len(self.search_by_tags(tags)) == 0, 'Data tags must be unique'

    dtypes = []  # empty list for Image datasets
    data_types = ['csv', 'default', 'mednist', 'images', 'medical-folder', 'flamby']
    if data_type not in data_types:
        raise NotImplementedError(f'Data type {data_type} is not'
                                  ' a compatible data type. '
                                  f'Compatible data types are: {data_types}')
    elif data_type == 'flamby':
        # check that data loading plan is present and well formed
        if data_loading_plan is None or \
                FlambyLoadingBlockTypes.FLAMBY_DATASET_METADATA not in data_loading_plan:
            msg = f"{ErrorNumbers.FB316.value}. A DataLoadingPlan containing " \
                  f"{FlambyLoadingBlockTypes.FLAMBY_DATASET_METADATA.value} is required for adding " \
                  f"a FLamby dataset to the database."
            logger.critical(msg)
            raise FedbiomedDatasetManagerError(msg)
        # initialize a dataset and link to the flamby data. If all goes well, compute shape.
        try:
            dataset = FlambyDataset()
            dataset.set_dlp(data_loading_plan)  # initializes fed_class as a side effect
        except FedbiomedError as e:
            raise FedbiomedDatasetManagerError(f"Can not create FLamby dataset. {e}")
        else:
            shape = dataset.shape()

    if data_type == 'default':
        assert os.path.isdir(path), f'Folder {path} for Default Dataset does not exist.'
        shape = self.load_default_database(name, path)
    elif data_type == 'mednist':
        assert os.path.isdir(path), f'Folder {path} for MedNIST Dataset does not exist.'
        shape = self.load_mednist_database(path)
        path = os.path.join(path, 'MedNIST')
    elif data_type == 'csv':
        assert os.path.isfile(path), f'Path provided ({path}) does not correspond to a CSV file.'
        dataset = self.load_csv_dataset(path)
        shape = dataset.shape
        dtypes = self.get_csv_data_types(dataset)
    elif data_type == 'images':
        assert os.path.isdir(path), f'Folder {path} for Images Dataset does not exist.'
        shape = self.load_images_dataset(path)
    elif data_type == 'medical-folder':
        if not os.path.isdir(path):
            raise FedbiomedDatasetManagerError(f'Folder {path} for Medical Folder Dataset does not exist.')

        if "tabular_file" not in dataset_parameters:
            logger.info("Medical Folder Dataset will be loaded without reference/demographics data.")
        else:
            if not os.path.isfile(dataset_parameters['tabular_file']):
                raise FedbiomedDatasetManagerError(f'Path {dataset_parameters["tabular_file"]} does not '
                                                   f'correspond a file.')
            if "index_col" not in dataset_parameters:
                raise FedbiomedDatasetManagerError('Index column is not provided')

        try:
            # load using the MedicalFolderController to ensure all available modalities are inspected
            controller = MedicalFolderController(root=path)
            if data_loading_plan is not None:
                controller.set_dlp(data_loading_plan)
            dataset = controller.load_MedicalFolder(tabular_file=dataset_parameters.get('tabular_file', None),
                                                    index_col=dataset_parameters.get('index_col', None))
        except FedbiomedError as e:
            raise FedbiomedDatasetManagerError(f"Can not create Medical Folder dataset. {e}")
        else:
            shape = dataset.shape()

        # try to read one sample and raise if it doesn't work
        try:
            _ = dataset.get_nontransformed_item(0)
        except Exception as e:
            raise FedbiomedDatasetManagerError(f'Medical Folder Dataset was not saved properly and '
                                               f'cannot be read. {e}')

    if not dataset_id:
        dataset_id = 'dataset_' + str(uuid.uuid4())

    new_database = dict(name=name, data_type=data_type, tags=tags,
                        description=description, shape=shape,
                        path=path, dataset_id=dataset_id, dtypes=dtypes,
                        dataset_parameters=dataset_parameters)

    if save_dlp:
        dlp_id = self.save_data_loading_plan(data_loading_plan)
    elif isinstance(data_loading_plan, DataLoadingPlan):
        dlp_id = data_loading_plan.dlp_id
    else:
        dlp_id = None
    if dlp_id is not None:
        new_database['dlp_id'] = dlp_id

    self._dataset_table.insert(new_database)
    return dataset_id
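Example: a sketch of registering a CSV dataset, assuming a configured node environment. The name, tags, description and path below are hypothetical placeholders.
manager = DatasetManager()
dataset_id = manager.add_database(
    name='heart-demo',                  # hypothetical name
    data_type='csv',
    tags=['#demo', '#csv'],             # tags must not match an already registered dataset
    description='Demo tabular dataset',
    path='/data/heart.csv',             # hypothetical path; must point to an existing CSV file
)
print(dataset_id)  # e.g. 'dataset_<uuid4>'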
get_by_id(dataset_id)
Searches for data with given dataset_id.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset_id | str | A dataset id | required |
Returns:
Type | Description |
---|---|
List[dict] | A list of dict of matching datasets, each dict containing all the fields describing the matching datasets stored in Tiny database. |
Source code in fedbiomed/node/dataset_manager.py
def get_by_id(self, dataset_id: str) -> List[dict]:
    """Searches for data with given dataset_id.

    Args:
        dataset_id: A dataset id

    Returns:
        A list of dict of matching datasets, each dict
        containing all the fields describing the matching datasets
        stored in Tiny database.
    """
    result = self._dataset_table.get(self._database.dataset_id == dataset_id)
    return result
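Example: a sketch reusing the dataset_id returned by add_database above. Note that the underlying TinyDB get call returns a single matching document (or None when nothing matches), despite the List[dict] annotation.
record = manager.get_by_id(dataset_id)
if record is not None:
    print(record['name'], record['shape'])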
get_csv_data_types(dataset)
Gets data types of each variable in dataset.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset | pd.DataFrame | A Pandas dataset. | required |
Returns:
Type | Description |
---|---|
List[str] | A list of strings containing data types. |
Source code in fedbiomed/node/dataset_manager.py
def get_csv_data_types(self, dataset: pd.DataFrame) -> List[str]:
    """Gets data types of each variable in dataset.

    Args:
        dataset: A Pandas dataset.

    Returns:
        A list of strings containing data types.
    """
    types = [str(t) for t in dataset.dtypes]
    return types
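Example: a self-contained sketch with a small DataFrame.
import pandas as pd

df = pd.DataFrame({'age': [34, 51], 'weight': [61.2, 80.5], 'sex': ['F', 'M']})
print(manager.get_csv_data_types(df))  # ['int64', 'float64', 'object']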
get_data_loading_blocks_by_ids(dlb_ids)
Search for a list of DataLoadingBlockTypes, each corresponding to one given id.
Note that in case of conflicting ids (which should not happen), this function will silently return a random one with the sought id.
DataLoadingBlock IDs always start with 'serialized_data_loading_block_' and should be unique in the database.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dlb_ids | List[str] | A list of DataLoadingBlock IDs | required |
Returns:
Type | Description |
---|---|
List[dict] | A list of dictionaries, each one containing the DataLoadingBlock metadata corresponding to one given id. |
Source code in fedbiomed/node/dataset_manager.py
def get_data_loading_blocks_by_ids(self, dlb_ids: List[str]) -> List[dict]:
    """Search for a list of DataLoadingBlockTypes, each corresponding to one given id.

    Note that in case of conflicting ids (which should not happen), this function will silently return a random
    one with the sought id.

    DataLoadingBlock IDs always start with 'serialized_data_loading_block_' and should be unique in the database.

    Args:
        dlb_ids: (List[str]) a list of DataLoadingBlock IDs

    Returns:
        A list of dictionaries, each one containing the DataLoadingBlock metadata corresponding to one given id.
    """
    return self._dlp_table.search(self._database.dlb_id.one_of(dlb_ids))
get_dlp_by_id(dlp_id)
Search for a DataLoadingPlan with a given id.
Note that in case of conflicting ids (which should not happen), this function will silently return a random one with the sought id.
DataLoadingPlan IDs always start with 'dlp_' and should be unique in the database.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dlp_id | str | The DataLoadingPlan id | required |
Returns:
Type | Description |
---|---|
Tuple[dict, List[dict]] | A tuple containing a dictionary with the DataLoadingPlan metadata for the given id, and a list of dictionaries with the metadata of its attached DataLoadingBlocks. |
Source code in fedbiomed/node/dataset_manager.py
def get_dlp_by_id(self, dlp_id: str) -> Tuple[dict, List[dict]]:
    """Search for a DataLoadingPlan with a given id.

    Note that in case of conflicting ids (which should not happen), this function will silently return a random
    one with the sought id.

    DataLoadingPlan IDs always start with 'dlp_' and should be unique in the database.

    Args:
        dlp_id: (str) the DataLoadingPlan id

    Returns:
        A Tuple containing a dictionary with the DataLoadingPlan metadata corresponding to the given id.
    """
    dlp_metadata = self._dlp_table.get(self._database.dlp_id == dlp_id)
    return dlp_metadata, self._dlp_table.search(
        self._database.dlb_id.one_of(dlp_metadata['loading_blocks'].values()))
get_torch_dataset_shape(dataset)
Gets info about dataset shape.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset | torch.utils.data.Dataset | A Pytorch dataset | required |
Returns:
Type | Description |
---|---|
List[int] | A list of int containing [<nb_of_data>, <dimension_of_first_input_data>]. Example for MNIST: [60000, 1, 28, 28]. |
Source code in fedbiomed/node/dataset_manager.py
def get_torch_dataset_shape(self, dataset: torch.utils.data.Dataset) -> List[int]:
    """Gets info about dataset shape.

    Args:
        dataset: A Pytorch dataset

    Returns:
        A list of int containing
        [<nb_of_data>, <dimension_of_first_input_data>].

        Example for MNIST: [60000, 1, 28, 28], where <nb_of_data>=60000
        and <dimension_of_first_input_data>=1, 28, 28
    """
    return [len(dataset)] + list(dataset[0][0].shape)
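Example: a sketch with a synthetic PyTorch dataset of 100 samples shaped like MNIST images.
import torch
from torch.utils.data import TensorDataset

inputs = torch.randn(100, 1, 28, 28)
targets = torch.randint(0, 10, (100,))
dataset = TensorDataset(inputs, targets)
print(manager.get_torch_dataset_shape(dataset))  # [100, 1, 28, 28]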
list_dlp(target_dataset_type=None)
Return all existing DataLoadingPlans.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
target_dataset_type | Optional[str] | If specified, return only DLPs matching the requested target type. | None |
Returns:
Type | Description |
---|---|
List[dict] | A list of dicts, each one describing a DataLoadingPlan. |
Source code in fedbiomed/node/dataset_manager.py
def list_dlp(self, target_dataset_type: Optional[str] = None) -> List[dict]:
    """Return all existing DataLoadingPlans.

    Args:
        target_dataset_type: (str or None) if specified, return only dlps matching the requested target type.

    Returns:
        An array of dict, each dict is a DataLoadingPlan
    """
    if target_dataset_type is not None:
        if not isinstance(target_dataset_type, str):
            raise FedbiomedDatasetManagerError(f"Wrong input type for target_dataset_type. "
                                               f"Expected str, got {type(target_dataset_type)} instead.")
        if target_dataset_type not in [t.value for t in DatasetTypes]:
            raise FedbiomedDatasetManagerError("target_dataset_type should be of the values defined in "
                                               "fedbiomed.common.constants.DatasetTypes")
        dlps = self._dlp_table.search(
            (self._database.dlp_id.exists()) &
            (self._database.dlp_name.exists()) &
            (self._database.target_dataset_type == target_dataset_type))
    else:
        dlps = self._dlp_table.search(
            (self._database.dlp_id.exists()) & (self._database.dlp_name.exists()))
    return [dict(dlp) for dlp in dlps]
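Example: a sketch of listing DLPs; the filter value 'medical-folder' is assumed here to be among the values defined in fedbiomed.common.constants.DatasetTypes.
all_dlps = manager.list_dlp()
mf_dlps = manager.list_dlp(target_dataset_type='medical-folder')  # assumed DatasetTypes value
for dlp in mf_dlps:
    print(dlp['dlp_id'], dlp['dlp_name'])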
list_my_data(verbose=True)
Lists all datasets on the node.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
verbose | bool | Give verbose output. Defaults to True. | True |
Returns:
Type | Description |
---|---|
List[dict] | All datasets in the node's database. |
Source code in fedbiomed/node/dataset_manager.py
def list_my_data(self, verbose: bool = True) -> List[dict]:
    """Lists all datasets on the node.

    Args:
        verbose: Give verbose output. Defaults to True.

    Returns:
        All datasets in the node's database.
    """
    my_data = self._dataset_table.all()

    # Do not display dtypes
    for doc in my_data:
        doc.pop('dtypes')

    if verbose:
        print(tabulate(my_data, headers='keys'))

    return my_data
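Example: a sketch of listing datasets without the tabulated console output.
datasets = manager.list_my_data(verbose=False)
for d in datasets:
    print(d['dataset_id'], d['name'], d['tags'])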
load_as_dataloader(dataset)
Loads content of an image dataset.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset | dict | Description of the dataset. | required |
Returns:
Type | Description |
---|---|
torch.utils.data.Dataset | Content of the dataset. |
Source code in fedbiomed/node/dataset_manager.py
def load_as_dataloader(self, dataset: dict) -> torch.utils.data.Dataset:
    """Loads content of an image dataset.

    Args:
        dataset: Description of the dataset.

    Returns:
        Content of the dataset.
    """
    name = dataset['data_type']
    if name == 'default':
        return self.load_default_database(name=dataset['name'],
                                          path=dataset['path'],
                                          as_dataset=True)
    elif name == 'images':
        return self.load_images_dataset(folder_path=dataset['path'],
                                        as_dataset=True)
load_csv_dataset(path)
Loads a CSV dataset.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
path | str | Path to the CSV file. | required |
Returns:
Type | Description |
---|---|
pd.DataFrame | Pandas DataFrame with the content of the file. |
Source code in fedbiomed/node/dataset_manager.py
def load_csv_dataset(self, path: str) -> pd.DataFrame:
    """Loads a CSV dataset.

    Args:
        path: Path to the CSV file.

    Returns:
        Pandas DataFrame with the content of the file.
    """
    return self.read_csv(path)
load_data(tags, mode)
Loads content of a dataset.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
tags | Union[tuple, list] | Tags describing the dataset to load. | required |
mode | str | Return format for the dataset content. | required |
Raises:
Type | Description |
---|---|
NotImplementedError | `mode` is not implemented yet. |
Returns:
Type | Description |
---|---|
Any | Content of the dataset. Its type depends on the `mode` and dataset. |
Source code in fedbiomed/node/dataset_manager.py
def load_data(self, tags: Union[tuple, list], mode: str) -> Any:
    """Loads content of a dataset.

    Args:
        tags: Tags describing the dataset to load.
        mode: Return format for the dataset content.

    Raises:
        NotImplementedError: `mode` is not implemented yet.

    Returns:
        Content of the dataset. Its type depends on the `mode` and dataset.
    """
    # Verify that mode is available
    mode = mode.lower()
    modes = ['pandas', 'torch_dataset', 'torch_tensor', 'numpy']
    if mode not in modes:
        raise NotImplementedError(f'Data mode `{mode}` was not found.'
                                  f' Data modes available: {modes}')

    # Look for dataset in database
    dataset = self.search_by_tags(tags)[0]
    print(dataset)
    assert len(dataset) > 0, f'Dataset with tags {tags} was not found.'

    dataset_path = dataset['path']
    # If path is a file, read it as a CSV file
    if os.path.isfile(dataset_path):
        df = self.read_csv(dataset_path, index_col=0)

        # Load data as requested
        if mode == 'pandas':
            return df
        elif mode == 'numpy':
            return df._get_numeric_data().values
        elif mode == 'torch_tensor':
            return torch.from_numpy(df._get_numeric_data().values)

    elif os.path.isdir(dataset_path):
        if mode == 'torch_dataset':
            return self.load_as_dataloader(dataset)
        elif mode == 'torch_tensor':
            raise NotImplementedError('We are working on this'
                                      ' implementation!')
        elif mode == 'numpy':
            raise NotImplementedError('We are working on this'
                                      ' implementation!')
        else:
            raise NotImplementedError(f'Mode `{mode}` has not been'
                                      ' implemented on this version.')
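Example: a sketch of loading the CSV dataset registered earlier in different formats, reusing the hypothetical tags from the add_database example.
df = manager.load_data(['#demo', '#csv'], mode='pandas')            # pandas DataFrame
array = manager.load_data(['#demo', '#csv'], mode='numpy')          # numeric columns only
tensor = manager.load_data(['#demo', '#csv'], mode='torch_tensor')  # torch.Tensor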
load_default_database(name, path, as_dataset=False)
Loads a default dataset.
Currently, only MNIST dataset is used as the default dataset.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
name | str | Name of the default dataset. Currently, only MNIST is accepted. | required |
path | str | Path to the MNIST dataset. | required |
as_dataset | bool | Whether to return the complete dataset (True) or dataset dimensions (False). Defaults to False. | False |
Raises:
Type | Description |
---|---|
NotImplementedError | Name is not matching with the name of a default dataset. |
Returns:
Type | Description |
---|---|
Union[List[int], torch.utils.data.Dataset] | Depends on the value of the parameter `as_dataset`: if set to True, returns the dataset (type: torch.utils.data.Dataset); if set to False, returns the size of the dataset stored inside a list (type: List[int]). |
Source code in fedbiomed/node/dataset_manager.py
def load_default_database(self,
                          name: str,
                          path: str,
                          as_dataset: bool = False) -> Union[List[int],
                                                             torch.utils.data.Dataset]:
    """Loads a default dataset.

    Currently, only MNIST dataset is used as the default dataset.

    Args:
        name: Name of the default dataset. Currently,
            only MNIST is accepted.
        path: Pathfile to MNIST dataset.
        as_dataset: Whether to return
            the complete dataset (True) or dataset dimensions (False).
            Defaults to False.

    Raises:
        NotImplementedError: Name is not matching with
            the name of a default dataset.

    Returns:
        Depends on the value of the parameter `as_dataset`: If
        set to True, returns dataset (type: torch.utils.data.Dataset).
        If set to False, returns the size of the dataset stored inside
        a list (type: List[int]).
    """
    kwargs = dict(root=path, download=True, transform=transforms.ToTensor())

    if 'mnist' in name.lower():
        dataset = datasets.MNIST(**kwargs)
    else:
        raise NotImplementedError(f'Default dataset `{name}` has'
                                  ' not been implemented.')

    if as_dataset:
        return dataset
    else:
        return self.get_torch_dataset_shape(dataset)
load_images_dataset(folder_path, as_dataset=False)
Loads an image dataset.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
folder_path | str | Path to the directory containing the images. | required |
as_dataset | bool | Whether to return the complete dataset (True) or dataset dimensions (False). Defaults to False. | False |
Returns:
Type | Description |
---|---|
Union[List[int], torch.utils.data.Dataset] | Depends on the value of the parameter `as_dataset`: if set to True, returns the dataset (type: torch.utils.data.Dataset); if set to False, returns the size of the dataset stored inside a list (type: List[int]). |
Source code in fedbiomed/node/dataset_manager.py
def load_images_dataset(self,
                        folder_path: str,
                        as_dataset: bool = False) -> Union[List[int],
                                                           torch.utils.data.Dataset]:
    """Loads an image dataset.

    Args:
        folder_path: Path to the directory containing the images.
        as_dataset: Whether to return
            the complete dataset (True) or dataset dimensions (False).
            Defaults to False.

    Returns:
        Depends on the value of the parameter `as_dataset`: If
        set to True, returns dataset (type: torch.utils.data.Dataset).
        If set to False, returns the size of the dataset stored inside
        a list (type: List[int])
    """
    try:
        dataset = datasets.ImageFolder(folder_path,
                                       transform=transforms.ToTensor())
    except Exception as e:
        _msg = ErrorNumbers.FB315.value + \
            "\nThe following error was raised while loading dataset from the selected" \
            " path: " + str(e) + "\nPlease make sure that the selected folder is not empty" \
            " and doesn't have any empty class folder"
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)

    if as_dataset:
        return dataset
    else:
        return self.get_torch_dataset_shape(dataset)
load_mednist_database(path, as_dataset=False)
Loads the MedNist dataset.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
path | str | Path where a local copy of the MedNIST dataset is saved. | required |
as_dataset | bool | Whether to return the complete dataset (True) or dataset dimensions (False). Defaults to False. | False |
Raises:
Type | Description |
---|---|
FedbiomedDatasetManagerError | One of the following cases: the tarfile cannot be downloaded, the downloaded tarfile cannot be extracted, the MedNIST path is empty, or one of the class paths is empty. |
Returns:
Type | Description |
---|---|
Union[List[int], torch.utils.data.Dataset] | Depends on the value of the parameter `as_dataset`: if set to True, returns the dataset (type: torch.utils.data.Dataset); if set to False, returns the size of the dataset stored inside a list (type: List[int]). |
Source code in fedbiomed/node/dataset_manager.py
def load_mednist_database(self,
                          path: str,
                          as_dataset: bool = False) -> Union[List[int],
                                                             torch.utils.data.Dataset]:
    """Loads the MedNist dataset.

    Args:
        path: Pathfile to save a local copy of the MedNist dataset.
        as_dataset: Whether to return
            the complete dataset (True) or dataset dimensions (False).
            Defaults to False.

    Raises:
        FedbiomedDatasetManagerError: One of the following cases:

            - tarfile cannot be downloaded
            - downloaded tarfile cannot be extracted
            - MedNIST path is empty
            - one of the classes path is empty

    Returns:
        Depends on the value of the parameter `as_dataset`: If
        set to True, returns dataset (type: torch.utils.data.Dataset).
        If set to False, returns the size of the dataset stored inside
        a list (type: List[int])
    """
    download_path = os.path.join(path, 'MedNIST')
    if not os.path.isdir(download_path):
        url = "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/MedNIST.tar.gz"
        filepath = os.path.join(path, 'MedNIST.tar.gz')
        try:
            logger.info("Now downloading MEDNIST...")
            urlretrieve(url, filepath)
            with tarfile.open(filepath) as tar_file:
                logger.info("Now extracting MEDNIST...")
                tar_file.extractall(path)
            os.remove(filepath)
        except (URLError, HTTPError, ContentTooShortError, OSError, tarfile.TarError,
                MemoryError) as e:
            _msg = ErrorNumbers.FB315.value + "\nThe following error was raised while downloading MedNIST dataset" \
                " from the MONAI repo: " + str(e)
            logger.error(_msg)
            raise FedbiomedDatasetManagerError(_msg)

    try:
        dataset = datasets.ImageFolder(download_path,
                                       transform=transforms.ToTensor())
    except (FileNotFoundError, RuntimeError) as e:
        _msg = ErrorNumbers.FB315.value + "\nThe following error was raised while loading MedNIST dataset from" \
            " the selected path: " + str(e) + "\nPlease make sure that the selected MedNIST folder is not empty" \
            " or choose another path."
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)
    except Exception as e:
        _msg = ErrorNumbers.FB315.value + "\nThe following error was raised while loading MedNIST dataset: " + str(e)
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)

    if as_dataset:
        return dataset
    else:
        return self.get_torch_dataset_shape(dataset)
modify_database_info(tags, modified_dataset)
Modifies a dataset in the database.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
tags | Union[tuple, list] | Tags describing the dataset to modify. | required |
modified_dataset | dict | New dataset description to replace the existing one. | required |
Source code in fedbiomed/node/dataset_manager.py
def modify_database_info(self,
                         tags: Union[tuple, list],
                         modified_dataset: dict):
    """Modifies a dataset in the database.

    Args:
        tags: Tags describing the dataset to modify.
        modified_dataset: New dataset description to replace the existing one.
    """
    self._dataset_table.update(modified_dataset, self._database.tags.all(tags))
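Example: a sketch of updating the description of the dataset matching the hypothetical tags from the earlier example. TinyDB's update accepts a partial dict of fields to change.
manager.modify_database_info(['#demo', '#csv'],
                             {'description': 'Updated demo description'})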
obfuscate_private_information(database_metadata)
staticmethod
Remove privacy-sensitive information, to prepare for sharing with a researcher.
Removes any information that could be considered privacy-sensitive by the node. The typical use-case is to prevent sharing this information with a researcher through a reply message.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
database_metadata | Iterable[dict] | An iterable of metadata information objects, one per dataset. Each metadata object should be in the form of key-value pairs, e.g. a dict. | required |
Returns:
Type | Description |
---|---|
Iterable[dict] | the updated iterable of metadata information objects without privacy-sensitive information |
Source code in fedbiomed/node/dataset_manager.py
@staticmethod
def obfuscate_private_information(database_metadata: Iterable[dict]) -> Iterable[dict]:
    """Remove privacy-sensitive information, to prepare for sharing with a researcher.

    Removes any information that could be considered privacy-sensitive by the node. The typical use-case is to
    prevent sharing this information with a researcher through a reply message.

    Args:
        database_metadata: an iterable of metadata information objects, one per dataset. Each metadata object
            should be in the form of key-value pairs, such as e.g. a dict.

    Returns:
        the updated iterable of metadata information objects without privacy-sensitive information
    """
    for d in database_metadata:
        try:
            # common obfuscations
            d.pop('path', None)

            # obfuscations specific for each data type
            if 'data_type' in d:
                if d['data_type'] == 'medical-folder':
                    if 'dataset_parameters' in d:
                        d['dataset_parameters'].pop('tabular_file', None)
        except AttributeError:
            raise FedbiomedDatasetManagerError(f"Object of type {type(d)} does not support pop or getitem method "
                                               f"in obfuscate_private_information.")
    return database_metadata
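Example: a self-contained sketch with a hypothetical metadata record; the method is static, so it can be called on the class.
metadata = [{
    'name': 'heart-demo',
    'data_type': 'csv',
    'path': '/data/heart.csv',   # privacy-sensitive: will be removed
    'tags': ['#demo', '#csv'],
}]
safe = DatasetManager.obfuscate_private_information(metadata)
print(safe)  # [{'name': 'heart-demo', 'data_type': 'csv', 'tags': ['#demo', '#csv']}]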
read_csv(csv_file, index_col=None)
Gets content of a CSV file.
Reads a *.csv file and returns its data as a pandas DataFrame. The CSV delimiter is detected automatically by parsing the first line.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
csv_file | str | File name / path | required |
index_col | Union[int, None] | Column that contains CSV file index. Defaults to None. | None |
Returns:
Type | Description |
---|---|
pd.DataFrame | Pandas DataFrame with data contained in CSV file. |
Source code in fedbiomed/node/dataset_manager.py
def read_csv(self, csv_file: str, index_col: Union[int, None] = None) -> pd.DataFrame:
    """Gets content of a CSV file.

    Reads a *.csv file and outputs its data into a pandas DataFrame.
    Finds automatically the CSV delimiter by parsing the first line.

    Args:
        csv_file: File name / path
        index_col: Column that contains CSV file index.
            Defaults to None.

    Returns:
        Pandas DataFrame with data contained in CSV file.
    """
    # Automatically identify separator and header
    sniffer = csv.Sniffer()
    with open(csv_file, 'r') as file:
        delimiter = sniffer.sniff(file.readline()).delimiter
        file.seek(0)
        header = 0 if sniffer.has_header(file.read()) else None

    return pd.read_csv(csv_file, index_col=index_col, sep=delimiter, header=header)
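Example: a sketch with a hypothetical path; delimiter and header are detected automatically from the file contents.
df = manager.read_csv('/data/heart.csv')               # hypothetical path
df_indexed = manager.read_csv('/data/heart.csv', index_col=0)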
remove_database(tags)
Removes datasets from database.
Only datasets matching the `tags` should be removed.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
tags | Union[tuple, list] | Dataset description tags. | required |
Source code in fedbiomed/node/dataset_manager.py
def remove_database(self, tags: Union[tuple, list]):
    """Removes datasets from database.

    Only datasets matching the `tags` should be removed.

    Args:
        tags: Dataset description tags.
    """
    doc_ids = [doc.doc_id for doc in self.search_by_tags(tags)]
    self._dataset_table.remove(doc_ids=doc_ids)
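Example: a sketch of removing the dataset registered with the hypothetical tags used above.
manager.remove_database(['#demo', '#csv'])
assert manager.search_by_tags(['#demo', '#csv']) == []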
remove_dlp_by_id(dlp_id)
Removes a data loading plan (DLP) from the database.
Only the DLP with matching ID is removed from the database; there should be at most one. The attached DLBs are removed as well. You should ensure they are not used by another DLP; no verification is made.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dlp_id | str | the DataLoadingPlan id | required |
Source code in fedbiomed/node/dataset_manager.py
def remove_dlp_by_id(self, dlp_id: str):
    """Removes a data loading plan (DLP) from the database.

    Only the DLP with matching ID is removed from the database. There should be at most one.
    The attached DLBs are removed as well. You should ensure
    they are not used by another DLP, no verification is made.

    Args:
        dlp_id: the DataLoadingPlan id
    """
    if not isinstance(dlp_id, str):
        _msg = ErrorNumbers.FB316.value + f": Bad type for dlp '{type(dlp_id)}', expecting str"
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)
    if not dlp_id:
        _msg = ErrorNumbers.FB316.value + ": Bad value for dlp, expecting non empty str"
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)

    _, dlbs = self.get_dlp_by_id(dlp_id)

    try:
        self._dlp_table.remove(self._database.dlp_id == dlp_id)
        for dlb in dlbs:
            self._dlp_table.remove(self._database.dlb_id == dlb['dlb_id'])
    except Exception as e:
        _msg = ErrorNumbers.FB316.value + f": Error during remove of DLP {dlp_id}: {e}"
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)
save_data_loading_block(dlb)
Saves a serialized DataLoadingBlock to the node database. (This helper appears to be unused.)
Source code in fedbiomed/node/dataset_manager.py
def save_data_loading_block(self, dlb: DataLoadingBlock) -> None:
    # seems unused
    self._dlp_table.insert(dlb.serialize())
save_data_loading_plan(data_loading_plan)
Save a DataLoadingPlan to the database.
This function saves a DataLoadingPlan to the database, and returns its ID.
Raises:
Type | Description |
---|---|
FedbiomedDatasetManagerError | The data loading plan name is invalid (fewer than 4 characters, or not unique). |
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data_loading_plan | Optional[DataLoadingPlan] | the DataLoadingPlan to be saved, or None. | required |
Returns:
Type | Description |
---|---|
dict | The `dlp_id` if a DLP was saved, or None. |
Source code in fedbiomed/node/dataset_manager.py
def save_data_loading_plan(self,
                           data_loading_plan: Optional[DataLoadingPlan]
                           ) -> dict:
    """Save a DataLoadingPlan to the database.

    This function saves a DataLoadingPlan to the database, and returns its ID.

    Raises:
        FedbiomedDatasetManagerError: bad data loading plan name (size, not unique)

    Args:
        data_loading_plan: the DataLoadingPlan to be saved, or None.

    Returns:
        The `dlp_id` if a DLP was saved, or None
    """
    if data_loading_plan is None:
        return None

    if len(data_loading_plan.desc) < 4:
        _msg = ErrorNumbers.FB316.value + ": Cannot save data loading plan, " + \
            "DLP name needs to have at least 4 characters."
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)
    _dlp_same_name = self._dlp_table.search(
        (self._database.dlp_id.exists()) & (self._database.dlp_name.exists()) &
        (self._database.dlp_name == data_loading_plan.desc))
    if _dlp_same_name:
        _msg = ErrorNumbers.FB316.value + ": Cannot save data loading plan, " + \
            "DLP name needs to be unique."
        logger.error(_msg)
        raise FedbiomedDatasetManagerError(_msg)

    dlp_metadata, loading_blocks_metadata = data_loading_plan.serialize()
    self._dlp_table.insert(dlp_metadata)
    self._dlp_table.insert_multiple(loading_blocks_metadata)
    return data_loading_plan.dlp_id
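Example: a sketch assuming `dlp` is an already-constructed DataLoadingPlan whose `desc` attribute holds a unique name of at least 4 characters.
# `dlp` is assumed to be a DataLoadingPlan instance built elsewhere
dlp_id = manager.save_data_loading_plan(dlp)
print(dlp_id)  # the saved plan's id, or None if `dlp` was None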
search_by_tags(tags)
Searches for data with given tags.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
tags | Union[tuple, list] | List of tags | required |
Returns:
Type | Description |
---|---|
list | The list of matching datasets |
Source code in fedbiomed/node/dataset_manager.py
def search_by_tags(self, tags: Union[tuple, list]) -> list:
    """Searches for data with given tags.

    Args:
        tags: List of tags

    Returns:
        The list of matching datasets
    """
    return self._dataset_table.search(self._database.tags.all(tags))
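Example: a sketch using the hypothetical tags from earlier; a dataset matches only if it carries all the given tags.
matches = manager.search_by_tags(['#demo', '#csv'])
for m in matches:
    print(m['dataset_id'], m['name'])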