Module python.core.metadata
Module for the Metadata class and HealthyCategory.
Expand source code
"""Module from Metadata class and HealthyCategory."""
# pylint: disable=unnecessary-lambda-assignment
from __future__ import annotations
import copy
import json
import os
from collections import Counter, defaultdict
from collections.abc import Iterable
from pathlib import Path
from typing import Dict, List
class Metadata(object):
    """
    Wrapper around metadata md5:dataset dict.

    path (Path): Path to json file containing metadata for some datasets.
    """

    def __init__(self, path: Path):
        # _rest keeps every top-level json field other than "datasets" so that
        # save() can restore the original epigeec_json structure.
        # It must be initialized BEFORE _load_metadata (which fills it);
        # resetting it afterwards would discard those extra fields.
        self._rest: Dict = {}
        self._metadata = self._load_metadata(path)

    @classmethod
    def from_dict(cls, metadata: Dict[str, dict]) -> Metadata:
        """Creates an object from a dict conforming to {md5sum:dset} format.

        Raises:
            ValueError: If the dict is empty, or its first key is not an
                md5sum (i.e. not 32 characters long).
        """
        if not metadata:
            raise ValueError("Cannot create Metadata from an empty dict.")
        first_key = next(iter(metadata))
        if len(first_key) != 32:
            raise ValueError(
                f"Incorrect format of metadata. Key need to be md5sum (len=32). Is: {first_key}"
            )
        obj = cls.__new__(cls)
        obj._metadata = copy.deepcopy(metadata)
        obj._rest = {}  # no extra json fields exist when built from a bare dict
        return obj

    def empty(self):
        """Remove all entries."""
        self._metadata = {}

    def __setitem__(self, md5, value):
        self._metadata[md5] = value

    def __getitem__(self, md5):
        return self._metadata[md5]

    def __delitem__(self, md5):
        del self._metadata[md5]

    def __contains__(self, md5):
        return md5 in self._metadata

    def __len__(self):
        return len(self._metadata)

    def get(self, md5, default=None):
        """Dict .get"""
        return self._metadata.get(md5, default)

    def update(self, info: Metadata) -> None:
        """Dict .update equivalent. Info needs to respect {md5sum:dset} format."""
        self._metadata.update(info.items)

    def save(self, path):
        """Save the metadata to path, in original epigeec_json format."""
        self._save_metadata(path)

    @property
    def md5s(self):
        """Return md5s (iterator). dict.keys() equivalent."""
        return self._metadata.keys()

    @property
    def datasets(self):
        """Return datasets (iterator). dict.values() equivalent."""
        return self._metadata.values()

    @property
    def items(self):
        """Return pairs (iterator). dict.items() equivalent"""
        return self._metadata.items()

    def _load_metadata(self, path):
        """Return md5:dataset dict, stashing all non-dataset fields in _rest."""
        with open(path, "r", encoding="utf-8") as file:
            meta_raw = json.load(file)
        self._rest = {k: v for k, v in meta_raw.items() if k != "datasets"}
        return {dset["md5sum"]: dset for dset in meta_raw["datasets"]}

    def _save_metadata(self, path):
        """Save the metadata to path, in original epigeec_json format.

        Writes the current datasets plus any non-dataset fields preserved
        from the loaded file (_rest).
        """
        to_save = {"datasets": list(self.datasets)}
        to_save.update(self._rest)
        with open(path, "w", encoding="utf-8") as file:
            json.dump(to_save, file)

    def apply_filter(self, meta_filter=lambda item: True):
        """Apply a filter on items (md5:dataset)."""
        self._metadata = dict(filter(meta_filter, self.items))

    def remove_missing_labels(self, label_category: str):
        """Remove datasets where the metadata category is missing."""
        filt = lambda item: label_category in item[1]
        self.apply_filter(filt)  # type: ignore

    def md5_per_class(self, label_category: str) -> Dict[str, List[str]]:
        """Return {label/class:md5 list} dict for a given metadata category.

        Can fail (KeyError) if remove_missing_labels has not been run before.
        """
        data = defaultdict(list)
        for md5 in sorted(self.md5s):  # sorted for deterministic list order
            label = self[md5][label_category]
            data[label].append(md5)
        return data

    def remove_small_classes(
        self, min_class_size: int, label_category: str, verbose=True
    ):
        """Remove classes with less than min_class_size examples
        for a given metadata category.

        Prints the remaining/total class ratio if verbose. Returns None.
        """
        data = self.md5_per_class(label_category)
        nb_class = len(data)
        nb_removed_class = 0
        for label, size in self.label_counter(label_category).most_common():
            if size < min_class_size:
                nb_removed_class += 1
                for md5 in data[label]:
                    del self[md5]
        if verbose:
            remaining = nb_class - nb_removed_class
            ratio = f"{remaining}/{nb_class}"
            print(
                f"{ratio} labels left from {label_category} "
                f"after removing classes with less than {min_class_size} signals."
            )

    def select_category_subsets(self, label_category: str, labels: Iterable[str]):
        """Select only datasets which possess the given labels
        for the given label category.
        """
        label_set = set(labels)  # build once instead of once per dataset
        filt = lambda item: item[1].get(label_category) in label_set
        self.apply_filter(filt)  # type: ignore

    def remove_category_subsets(self, label_category: str, labels: Iterable[str]):
        """Remove datasets which possess the given labels
        for the given label category.
        """
        label_set = set(labels)  # build once instead of once per dataset
        filt = lambda item: item[1].get(label_category) not in label_set
        self.apply_filter(filt)  # type: ignore

    def label_counter(self, label_category: str, verbose=True) -> Counter[str]:
        """Return a Counter() with label count from the given category.

        Ignores missing labels (datasets without the category count as None,
        which is dropped; Counter.__delitem__ is a no-op for absent keys).
        """
        counter = Counter([dset.get(label_category) for dset in self.datasets])
        if verbose:
            print(f"{counter[None]} labels missing and ignored from count")
        del counter[None]
        return counter

    def unique_classes(self, label_category: str) -> List[str]:
        """Return sorted list of unique classes currently existing for the given category."""
        uniq = {self[md5][label_category] for md5 in self.md5s}
        return sorted(uniq)

    def display_labels(self, label_category: str):
        """Print number of examples for each label in given category."""
        print(f"\nLabel breakdown for {label_category}")
        total = 0
        for label, count in self.label_counter(label_category).most_common():
            print(f"{label}: {count}")
            total += count
        print(f"For a total of {total} examples\n")

    def get_categories(self) -> list[str]:
        """Return a list of all metadata categories sorted by lowercase."""
        categories = set()
        for dset in self.datasets:
            categories.update(dset.keys())
        return sorted(categories, key=str.lower)

    def convert_classes(self, category: str, converter: Dict[str, str]):
        """Convert classes labels in the given category using the converter mapping."""
        for dataset in self.datasets:
            label = dataset.get(category, None)
            if label in converter:
                dataset[category] = converter[label]
def env_filtering(metadata: Metadata, category: str) -> List[str]:
    """Filter metadata using environment variables.

    Return the list of classes/labels to consider.

    Currently supports:
        EXCLUDE_LIST
        ASSAY_LIST
        LABEL_LIST
    """
    print("Checking environment variables.")
    # fmt: off
    assay_json = os.getenv("ASSAY_LIST")
    if assay_json is not None:
        assay_list = json.loads(assay_json)
        print(f"ASSAY_LIST: {assay_list}")
        print(f"Filtering metadata: Only keeping examples with targets/assay {assay_list}")
        metadata.select_category_subsets("assay", assay_list)

    exclude_json = os.getenv("EXCLUDE_LIST")
    if exclude_json is not None:
        exclude_list = json.loads(exclude_json)
        print(f"EXCLUDE_LIST: {exclude_list}")
        print(f"Filtering metadata: Removing labels {exclude_list} from category '{category}'.")
        metadata.remove_category_subsets(label_category=category, labels=exclude_list)

    label_json = os.getenv("LABEL_LIST")
    if label_json is None:
        # No explicit list given: keep every class still present.
        label_list = metadata.unique_classes(category)
        print(f"No label list, considering all left classes : {label_list}")
    else:
        label_list = json.loads(label_json)
        print(f"LABEL_LIST: {label_list}")
        print(f"Filtering metadata: Only keeping examples with labels {label_list} from '{category}'")
        metadata.select_category_subsets(category, label_list)
    # fmt: on
    return label_list
class HealthyCategory(object):
    """Create/Represent/manipulate the "healthy" metadata category"""

    def __init__(self):
        # Reference tsv file lives next to this module.
        self.pairs_file = Path(__file__).parent / "healthy_category.tsv"
        self.healthy_dict = self.read_healthy_pairs()

    @staticmethod
    def get_healthy_pairs(datasets):
        """Return set of (disease, donor_health_status) pairs."""
        return {
            (
                dataset.get("disease", "--empty--"),
                dataset.get("donor_health_status", "--empty--"),
            )
            for dataset in datasets
        }

    def list_healthy_pairs(self, datasets):
        """List unique (disease, donor_health_status) pairs."""
        for pair in sorted(self.get_healthy_pairs(datasets)):
            print(f"{pair[0]}\t{pair[1]}")

    def read_healthy_pairs(self):
        """Return a (disease, donor_health_status):healthy dict defined in
        a tsv file with disease|donor_health_status|healthy columns.
        """
        mapping = {}
        with open(self.pairs_file, "r", encoding="utf-8") as tsv_file:
            rows = iter(tsv_file)
            next(rows)  # skip header
            for row in rows:
                disease, donor_health_status, healthy = row.rstrip("\n").split("\t")
                mapping[(disease, donor_health_status)] = healthy
        return mapping

    def get_healthy_status(self, dataset):
        """Return "y", "n" or "?" depending of the healthy status of the dataset.

        NOTE(review): raises KeyError when the (disease, donor_health_status)
        pair is absent from the tsv file — presumably the file covers all
        pairs, with "?" marking unknown ones; confirm against the tsv.
        """
        key = (
            dataset.get("disease", "--empty--"),
            dataset.get("donor_health_status", "--empty--"),
        )
        return self.healthy_dict[key]

    @staticmethod
    def create_healthy_category(metadata: Metadata):
        """Combine "disease" and "donor_health_status" to create a "healthy" category.

        When a dataset has pairs with unknown correspondence, it does not add
        the category, and so these datasets are ignored through remove_missing_labels().
        """
        healthy_category = HealthyCategory()
        for dataset in metadata.datasets:
            status = healthy_category.get_healthy_status(dataset)
            if status != "?":
                dataset["healthy"] = status
Functions
def env_filtering(metadata: Metadata, category: str) ‑> List[str]
-
Filter metadata using environment variables. Return the list of classes/labels to consider.
Currently supports: EXCLUDE_LIST ASSAY_LIST LABEL_LIST
Expand source code
def env_filtering(metadata: Metadata, category: str) -> List[str]: """Filter metadata using environment variables. Return the list of classes/labels to consider. Currently supports: EXCLUDE_LIST ASSAY_LIST LABEL_LIST """ print("Checking environment variables.") # fmt: off name = "ASSAY_LIST" if os.getenv(name) is not None: assay_list = json.loads(os.environ[name]) print(f"{name}: {assay_list}") print(f"Filtering metadata: Only keeping examples with targets/assay {assay_list}") metadata.select_category_subsets("assay", assay_list) name = "EXCLUDE_LIST" if os.getenv(name) is not None: exclude_list = json.loads(os.environ[name]) print(f"{name}: {exclude_list}") print(f"Filtering metadata: Removing labels {exclude_list} from category '{category}'.") metadata.remove_category_subsets(label_category=category, labels=exclude_list) name = "LABEL_LIST" if os.getenv(name) is not None: label_list = json.loads(os.environ[name]) print(f"{name}: {label_list}") print(f"Filtering metadata: Only keeping examples with labels {label_list} from '{category}'") metadata.select_category_subsets(category, label_list) else: label_list = metadata.unique_classes(category) print(f"No label list, considering all left classes : {label_list}") # fmt: on return label_list
Classes
class HealthyCategory
-
Create/Represent/manipulate the "healthy" metadata category
Expand source code
class HealthyCategory(object): """Create/Represent/manipulate the "healthy" metadata category""" def __init__(self): self.pairs_file = Path(__file__).parent / "healthy_category.tsv" self.healthy_dict = self.read_healthy_pairs() @staticmethod def get_healthy_pairs(datasets): """Return set of (disease, donor_health_status) pairs.""" pairs = set([]) for dataset in datasets: disease = dataset.get("disease", "--empty--") donor_health_status = dataset.get("donor_health_status", "--empty--") pairs.add((disease, donor_health_status)) return pairs def list_healthy_pairs(self, datasets): """List unique (disease, donor_health_status) pairs.""" for x1, x2 in sorted(self.get_healthy_pairs(datasets)): print(f"{x1}\t{x2}") def read_healthy_pairs(self): """Return a (disease, donor_health_status):healthy dict defined in a tsv file with disease|donor_health_status|healthy columns. """ healthy_dict = {} with open(self.pairs_file, "r", encoding="utf-8") as tsv_file: next(tsv_file) # skip header for line in tsv_file: disease, donor_health_status, healthy = line.rstrip("\n").split("\t") healthy_dict[(disease, donor_health_status)] = healthy return healthy_dict def get_healthy_status(self, dataset): """Return "y", "n" or "?" depending of the healthy status of the dataset.""" disease = dataset.get("disease", "--empty--") donor_health_status = dataset.get("donor_health_status", "--empty--") return self.healthy_dict[(disease, donor_health_status)] @staticmethod def create_healthy_category(metadata: Metadata): """Combine "disease" and "donor_health_status" to create a "healthy" category. When a dataset has pairs with unknow correspondance, it does not add the category, and so these datasets are ignored through remove_missing_labels(). """ healthy_category = HealthyCategory() for dataset in metadata.datasets: healthy = healthy_category.get_healthy_status(dataset) if healthy == "?": continue dataset["healthy"] = healthy
Static methods
def create_healthy_category(metadata: Metadata)
-
Combine "disease" and "donor_health_status" to create a "healthy" category.
When a dataset has pairs with unknown correspondence, it does not add the category, and so these datasets are ignored through remove_missing_labels().
Expand source code
@staticmethod def create_healthy_category(metadata: Metadata): """Combine "disease" and "donor_health_status" to create a "healthy" category. When a dataset has pairs with unknow correspondance, it does not add the category, and so these datasets are ignored through remove_missing_labels(). """ healthy_category = HealthyCategory() for dataset in metadata.datasets: healthy = healthy_category.get_healthy_status(dataset) if healthy == "?": continue dataset["healthy"] = healthy
def get_healthy_pairs(datasets)
-
Return set of (disease, donor_health_status) pairs.
Expand source code
@staticmethod def get_healthy_pairs(datasets): """Return set of (disease, donor_health_status) pairs.""" pairs = set([]) for dataset in datasets: disease = dataset.get("disease", "--empty--") donor_health_status = dataset.get("donor_health_status", "--empty--") pairs.add((disease, donor_health_status)) return pairs
Methods
def get_healthy_status(self, dataset)
-
Return "y", "n" or "?" depending of the healthy status of the dataset.
Expand source code
def get_healthy_status(self, dataset): """Return "y", "n" or "?" depending of the healthy status of the dataset.""" disease = dataset.get("disease", "--empty--") donor_health_status = dataset.get("donor_health_status", "--empty--") return self.healthy_dict[(disease, donor_health_status)]
def list_healthy_pairs(self, datasets)
-
List unique (disease, donor_health_status) pairs.
Expand source code
def list_healthy_pairs(self, datasets): """List unique (disease, donor_health_status) pairs.""" for x1, x2 in sorted(self.get_healthy_pairs(datasets)): print(f"{x1}\t{x2}")
def read_healthy_pairs(self)
-
Return a (disease, donor_health_status):healthy dict defined in a tsv file with disease|donor_health_status|healthy columns.
Expand source code
def read_healthy_pairs(self): """Return a (disease, donor_health_status):healthy dict defined in a tsv file with disease|donor_health_status|healthy columns. """ healthy_dict = {} with open(self.pairs_file, "r", encoding="utf-8") as tsv_file: next(tsv_file) # skip header for line in tsv_file: disease, donor_health_status, healthy = line.rstrip("\n").split("\t") healthy_dict[(disease, donor_health_status)] = healthy return healthy_dict
class Metadata (path: Path)
-
Wrapper around metadata md5:dataset dict.
path (Path): Path to json file containing metadata for some datasets.
Expand source code
class Metadata(object): """ Wrapper around metadata md5:dataset dict. path (Path): Path to json file containing metadata for some datasets. """ def __init__(self, path: Path): self._metadata = self._load_metadata(path) self._rest = {} @classmethod def from_dict(cls, metadata: Dict[str, dict]) -> Metadata: """Creates an object from a dict conforming to {md5sum:dset} format.""" first_key = list(metadata.keys())[0] if len(first_key) != 32: raise Exception( f"Incorrect format of metadata. Key need to be md5sum (len=32). Is: {first_key}" ) obj = cls.__new__(cls) obj._metadata = copy.deepcopy(metadata) return obj def empty(self): """Remove all entries.""" self._metadata = {} def __setitem__(self, md5, value): self._metadata[md5] = value def __getitem__(self, md5): return self._metadata[md5] def __delitem__(self, md5): del self._metadata[md5] def __contains__(self, md5): return md5 in self._metadata def __len__(self): return len(self._metadata) def get(self, md5, default=None): """Dict .get""" return self._metadata.get(md5, default) def update(self, info: Metadata) -> None: """Dict .update equivalent. Info needs to respect {md5sum:dset} format.""" self._metadata.update(info.items) def save(self, path): """Save the metadata to path, in original epigeec_json format.""" self._save_metadata(path) @property def md5s(self): """Return md5s (iterator). dict.keys() equivalent.""" return self._metadata.keys() @property def datasets(self): """Return datasets (iterator). dict.values() equivalent.""" return self._metadata.values() @property def items(self): """Return pairs (iterator). 
dict.items() equivalent""" return self._metadata.items() def _load_metadata(self, path): """Return md5:dataset dict.""" with open(path, "r", encoding="utf-8") as file: meta_raw = json.load(file) self._rest = {k: v for k, v in meta_raw.items() if k != "datasets"} return {dset["md5sum"]: dset for dset in meta_raw["datasets"]} def _save_metadata(self, path): """Save the metadata to path, in original epigeec_json format. Only saves dataset information. """ to_save = {"datasets": list(self.datasets)} to_save.update(self._rest) with open(path, "w", encoding="utf-8") as file: json.dump(to_save, file) def apply_filter(self, meta_filter=lambda item: True): """Apply a filter on items (md5:dataset).""" self._metadata = dict(filter(meta_filter, self.items)) def remove_missing_labels(self, label_category: str): """Remove datasets where the metadata category is missing.""" filt = lambda item: label_category in item[1] self.apply_filter(filt) # type: ignore def md5_per_class(self, label_category: str) -> Dict[str, List[str]]: """Return {label/class:md5 list} dict for a given metadata category. Can fail if remove_missing_labels has not been ran before. """ sorted_md5 = sorted(self.md5s) data = defaultdict(list) for md5 in sorted_md5: label = self[md5][label_category] data[label].append(md5) return data def remove_small_classes( self, min_class_size: int, label_category: str, verbose=True ): """Remove classes with less than min_class_size examples for a given metatada category. Returns string of class ratio left if verbose. """ data = self.md5_per_class(label_category) nb_class = len(data) nb_removed_class = 0 for label, size in self.label_counter(label_category).most_common(): if size < min_class_size: nb_removed_class += 1 for md5 in data[label]: del self[md5] if verbose: remaining = nb_class - nb_removed_class ratio = f"{remaining}/{nb_class}" print( f"{ratio} labels left from {label_category} " f"after removing classes with less than {min_class_size} signals." 
) def select_category_subsets(self, label_category: str, labels: Iterable[str]): """Select only datasets which possess the given labels for the given label category. """ filt = lambda item: item[1].get(label_category) in set(labels) self.apply_filter(filt) # type: ignore def remove_category_subsets(self, label_category: str, labels: Iterable[str]): """Remove datasets which possess the given labels for the given label category. """ filt = lambda item: item[1].get(label_category) not in set(labels) self.apply_filter(filt) # type: ignore def label_counter(self, label_category: str, verbose=True) -> Counter[str]: """Return a Counter() with label count from the given category. Ignores missing labels. """ counter = Counter([dset.get(label_category) for dset in self.datasets]) if verbose: print(f"{counter[None]} labels missing and ignored from count") del counter[None] return counter def unique_classes(self, label_category: str) -> List[str]: """Return sorted list of unique classes currently existing for the given category.""" sorted_md5 = sorted(self.md5s) uniq = set() for md5 in sorted_md5: uniq.add(self[md5][label_category]) return sorted(list(uniq)) def display_labels(self, label_category: str): """Print number of examples for each label in given category.""" print(f"\nLabel breakdown for {label_category}") i = 0 for label, count in self.label_counter(label_category).most_common(): print(f"{label}: {count}") i += count print(f"For a total of {i} examples\n") def get_categories(self) -> list[str]: """Return a list of all metadata categories sorted by lowercase.""" categories = set() for dset in self.datasets: categories.update(dset.keys()) return sorted(categories, key=str.lower) def convert_classes(self, category: str, converter: Dict[str, str]): """Convert classes labels in the given category using the converter mapping.""" for dataset in self.datasets: label = dataset.get(category, None) if label in converter: dataset[category] = converter[label]
Static methods
def from_dict(metadata: Dict[str, dict]) ‑> Metadata
-
Creates an object from a dict conforming to {md5sum:dset} format.
Expand source code
@classmethod def from_dict(cls, metadata: Dict[str, dict]) -> Metadata: """Creates an object from a dict conforming to {md5sum:dset} format.""" first_key = list(metadata.keys())[0] if len(first_key) != 32: raise Exception( f"Incorrect format of metadata. Key need to be md5sum (len=32). Is: {first_key}" ) obj = cls.__new__(cls) obj._metadata = copy.deepcopy(metadata) return obj
Instance variables
var datasets
-
Return datasets (iterator). dict.values() equivalent.
Expand source code
@property def datasets(self): """Return datasets (iterator). dict.values() equivalent.""" return self._metadata.values()
var items
-
Return pairs (iterator). dict.items() equivalent
Expand source code
@property def items(self): """Return pairs (iterator). dict.items() equivalent""" return self._metadata.items()
var md5s
-
Return md5s (iterator). dict.keys() equivalent.
Expand source code
@property def md5s(self): """Return md5s (iterator). dict.keys() equivalent.""" return self._metadata.keys()
Methods
def apply_filter(self, meta_filter=<function Metadata.<lambda>>)
-
Apply a filter on items (md5:dataset).
Expand source code
def apply_filter(self, meta_filter=lambda item: True): """Apply a filter on items (md5:dataset).""" self._metadata = dict(filter(meta_filter, self.items))
def convert_classes(self, category: str, converter: Dict[str, str])
-
Convert classes labels in the given category using the converter mapping.
Expand source code
def convert_classes(self, category: str, converter: Dict[str, str]): """Convert classes labels in the given category using the converter mapping.""" for dataset in self.datasets: label = dataset.get(category, None) if label in converter: dataset[category] = converter[label]
def display_labels(self, label_category: str)
-
Print number of examples for each label in given category.
Expand source code
def display_labels(self, label_category: str): """Print number of examples for each label in given category.""" print(f"\nLabel breakdown for {label_category}") i = 0 for label, count in self.label_counter(label_category).most_common(): print(f"{label}: {count}") i += count print(f"For a total of {i} examples\n")
def empty(self)
-
Remove all entries.
Expand source code
def empty(self): """Remove all entries.""" self._metadata = {}
def get(self, md5, default=None)
-
Dict .get
Expand source code
def get(self, md5, default=None): """Dict .get""" return self._metadata.get(md5, default)
def get_categories(self) ‑> list[str]
-
Return a list of all metadata categories sorted by lowercase.
Expand source code
def get_categories(self) -> list[str]: """Return a list of all metadata categories sorted by lowercase.""" categories = set() for dset in self.datasets: categories.update(dset.keys()) return sorted(categories, key=str.lower)
def label_counter(self, label_category: str, verbose=True) ‑> Counter[str]
-
Return a Counter() with label count from the given category. Ignores missing labels.
Expand source code
def label_counter(self, label_category: str, verbose=True) -> Counter[str]: """Return a Counter() with label count from the given category. Ignores missing labels. """ counter = Counter([dset.get(label_category) for dset in self.datasets]) if verbose: print(f"{counter[None]} labels missing and ignored from count") del counter[None] return counter
def md5_per_class(self, label_category: str) ‑> Dict[str, List[str]]
-
Return {label/class:md5 list} dict for a given metadata category.
Can fail if remove_missing_labels has not been run before.
Expand source code
def md5_per_class(self, label_category: str) -> Dict[str, List[str]]: """Return {label/class:md5 list} dict for a given metadata category. Can fail if remove_missing_labels has not been ran before. """ sorted_md5 = sorted(self.md5s) data = defaultdict(list) for md5 in sorted_md5: label = self[md5][label_category] data[label].append(md5) return data
def remove_category_subsets(self, label_category: str, labels: Iterable[str])
-
Remove datasets which possess the given labels for the given label category.
Expand source code
def remove_category_subsets(self, label_category: str, labels: Iterable[str]): """Remove datasets which possess the given labels for the given label category. """ filt = lambda item: item[1].get(label_category) not in set(labels) self.apply_filter(filt) # type: ignore
def remove_missing_labels(self, label_category: str)
-
Remove datasets where the metadata category is missing.
Expand source code
def remove_missing_labels(self, label_category: str): """Remove datasets where the metadata category is missing.""" filt = lambda item: label_category in item[1] self.apply_filter(filt) # type: ignore
def remove_small_classes(self, min_class_size: int, label_category: str, verbose=True)
-
Remove classes with less than min_class_size examples for a given metadata category.
Returns string of class ratio left if verbose.
Expand source code
def remove_small_classes( self, min_class_size: int, label_category: str, verbose=True ): """Remove classes with less than min_class_size examples for a given metatada category. Returns string of class ratio left if verbose. """ data = self.md5_per_class(label_category) nb_class = len(data) nb_removed_class = 0 for label, size in self.label_counter(label_category).most_common(): if size < min_class_size: nb_removed_class += 1 for md5 in data[label]: del self[md5] if verbose: remaining = nb_class - nb_removed_class ratio = f"{remaining}/{nb_class}" print( f"{ratio} labels left from {label_category} " f"after removing classes with less than {min_class_size} signals." )
def save(self, path)
-
Save the metadata to path, in original epigeec_json format.
Expand source code
def save(self, path): """Save the metadata to path, in original epigeec_json format.""" self._save_metadata(path)
def select_category_subsets(self, label_category: str, labels: Iterable[str])
-
Select only datasets which possess the given labels for the given label category.
Expand source code
def select_category_subsets(self, label_category: str, labels: Iterable[str]): """Select only datasets which possess the given labels for the given label category. """ filt = lambda item: item[1].get(label_category) in set(labels) self.apply_filter(filt) # type: ignore
def unique_classes(self, label_category: str) ‑> List[str]
-
Return sorted list of unique classes currently existing for the given category.
Expand source code
def unique_classes(self, label_category: str) -> List[str]: """Return sorted list of unique classes currently existing for the given category.""" sorted_md5 = sorted(self.md5s) uniq = set() for md5 in sorted_md5: uniq.add(self[md5][label_category]) return sorted(list(uniq))
def update(self, info: Metadata) ‑> None
-
Dict .update equivalent. Info needs to respect {md5sum:dset} format.
Expand source code
def update(self, info: Metadata) -> None: """Dict .update equivalent. Info needs to respect {md5sum:dset} format.""" self._metadata.update(info.items)