Module epiclass.core.metadata

Module from Metadata class and HealthyCategory.

Functions

def env_filtering(metadata: Metadata, category: str) ‑> List[str]

Filter metadata using environment variables. Return the list of classes/labels to consider.

Currently supports: EXCLUDE_LIST ASSAY_LIST LABEL_LIST REMOVE_TRACKS

Classes

class HealthyCategory

Create/Represent/manipulate the "healthy" metadata category

Expand source code
class HealthyCategory:
    """Create/Represent/manipulate the "healthy" metadata category"""

    def __init__(self):
        self.pairs_file = Path(__file__).parent / "healthy_category.tsv"
        self.healthy_dict = self.read_healthy_pairs()

    @staticmethod
    def get_healthy_pairs(datasets: Iterable[Dict]) -> Set[Tuple[str, str]]:
        """Return set of (disease, donor_health_status) pairs."""
        pairs = set([])
        for dataset in datasets:
            disease = dataset.get("disease", "--empty--")
            donor_health_status = dataset.get("donor_health_status", "--empty--")
            pairs.add((disease, donor_health_status))
        return pairs

    def list_healthy_pairs(self, datasets: Iterable[Dict]) -> None:
        """List unique (disease, donor_health_status) pairs."""
        for x1, x2 in sorted(self.get_healthy_pairs(datasets)):
            print(f"{x1}\t{x2}")

    def read_healthy_pairs(self) -> Dict[Tuple[str, str], str]:
        """Return a (disease, donor_health_status):healthy dict defined in
        a tsv file with disease|donor_health_status|healthy columns.
        """
        healthy_dict = {}
        with open(self.pairs_file, "r", encoding="utf-8") as tsv_file:
            next(tsv_file)  # skip header
            for line in tsv_file:
                disease, donor_health_status, healthy = line.rstrip("\n").split("\t")
                healthy_dict[(disease, donor_health_status)] = healthy
        return healthy_dict

    def get_healthy_status(self, dataset: Dict) -> str:
        """Return "y", "n" or "?" depending of the healthy status of the dataset."""
        disease = dataset.get("disease", "--empty--")
        donor_health_status = dataset.get("donor_health_status", "--empty--")
        return self.healthy_dict[(disease, donor_health_status)]

    @staticmethod
    def create_healthy_category(metadata: Metadata) -> None:
        """Combine "disease" and "donor_health_status" to create a "healthy" category.

        When a dataset has pairs with unknow correspondance, it does not add
        the category, and so these datasets are ignored through remove_missing_labels().
        """
        healthy_category = HealthyCategory()
        for dataset in metadata.datasets:
            healthy = healthy_category.get_healthy_status(dataset)
            if healthy == "?":
                continue
            dataset["healthy"] = healthy

Static methods

def create_healthy_category(metadata: Metadata) ‑> None

Combine "disease" and "donor_health_status" to create a "healthy" category.

When a dataset has pairs with unknow correspondance, it does not add the category, and so these datasets are ignored through remove_missing_labels().

def get_healthy_pairs(datasets: Iterable[Dict]) ‑> Set[Tuple[str, str]]

Return set of (disease, donor_health_status) pairs.

Methods

def get_healthy_status(self, dataset: Dict) ‑> str

Return "y", "n" or "?" depending of the healthy status of the dataset.

def list_healthy_pairs(self, datasets: Iterable[Dict]) ‑> None

List unique (disease, donor_health_status) pairs.

def read_healthy_pairs(self) ‑> Dict[Tuple[str, str], str]

Return a (disease, donor_health_status):healthy dict defined in a tsv file with disease|donor_health_status|healthy columns.

class Metadata (path: Path)

Wrapper around metadata md5:dataset dict.

path (Path): Path to json file containing metadata for some datasets.

Expand source code
class Metadata:
    """
    Wrapper around metadata md5:dataset dict.

    path (Path): Path to json file containing metadata for some datasets.
    """

    def __init__(self, path: Path):
        self._metadata = self._load_metadata(path)
        self._rest = {}

    @classmethod
    def from_dict(
        cls, metadata: Dict[str, dict], allow_non_md5sum_index: bool = False
    ) -> Metadata:
        """Creates an object from a dict conforming to {md5sum:dset} format."""
        first_key = list(metadata.keys())[0]
        if len(first_key) != 32:
            message = f"Incorrect format of metadata. Key need to be md5sum (len=32). First key is: {first_key}"
            if not allow_non_md5sum_index:
                raise ValueError(
                    f"Incorrect format of metadata. Key need to be md5sum (len=32). First key is: {first_key}"
                )
            print(message, file=sys.stderr)

        obj = cls.__new__(cls)
        obj._metadata = copy.deepcopy(metadata)
        obj._rest = {}
        return obj

    @classmethod
    def from_marshal(cls, path: Path | str) -> Metadata:
        """Load a metadata dict from a marshal file format."""
        with open(path, "rb") as file:
            metadata_dict = marshal.load(file)

        first_key = list(metadata_dict.keys())[0]
        if len(first_key) != 32:
            raise ValueError(
                f"Incorrect format of metadata. Key need to be md5sum (len=32). Is: {first_key}"
            )

        obj = cls.__new__(cls)
        obj._metadata = metadata_dict
        obj._rest = {}
        return obj

    def empty(self):
        """Remove all entries."""
        self._metadata = {}

    def __setitem__(self, md5, value):
        self._metadata[md5] = value

    def __getitem__(self, md5):
        return self._metadata[md5]

    def __delitem__(self, md5):
        del self._metadata[md5]

    def __contains__(self, md5):
        return md5 in self._metadata

    def __len__(self):
        return len(self._metadata)

    def __eq__(self, other):
        if isinstance(other, Metadata):
            return self._metadata == other._metadata and self._rest == other._rest
        return False

    def get(self, md5, default=None) -> Dict | None:
        """Dict .get"""
        return self._metadata.get(md5, default)

    def update(self, info: Metadata) -> None:
        """Dict .update equivalent. Info needs to respect {md5sum:dset} format."""
        self._metadata.update(info.items)

    def save(self, path) -> None:
        """Save the metadata to path, in original epigeec_json format."""
        self._save_metadata(path)

    @property
    def md5s(self) -> KeysView:
        """Return a md5s view (like dict.keys())."""
        return self._metadata.keys()

    @property
    def datasets(self) -> ValuesView:
        """Return a datasets view (like dict.values())."""
        return self._metadata.values()

    @property
    def items(self) -> ItemsView:
        """Return a (md5,datasets) view (like dict.items())."""
        return self._metadata.items()

    def to_df(self):
        """Return a dataframe with one file per row.
        Index is md5sum.
        """
        df = pd.DataFrame.from_records(list(self.datasets))
        df.set_index("md5sum", inplace=True)
        return df

    def _load_metadata(self, path):
        """Return md5:dataset dict."""
        with open(path, "r", encoding="utf-8") as file:
            meta_raw = json.load(file)

        self._rest = {k: v for k, v in meta_raw.items() if k != "datasets"}
        return {dset["md5sum"]: dset for dset in meta_raw["datasets"]}

    def _save_metadata(self, path):
        """Save the metadata to path, in original epigeec_json format.

        Only saves dataset information.
        """
        to_save = {"datasets": list(self.datasets)}
        to_save.update(self._rest)
        with open(path, "w", encoding="utf-8") as file:
            json.dump(to_save, file)

    def apply_filter(self, meta_filter=lambda item: True):
        """Apply a filter on items (md5:dataset)."""
        self._metadata = dict(filter(meta_filter, self.items))

    def remove_missing_labels(self, label_category: str):
        """Remove datasets where the metadata category is missing."""
        filt = lambda item: label_category in item[1]
        self.apply_filter(filt)  # type: ignore

    def md5_per_class(self, label_category: str) -> Dict[str, List[str]]:
        """Return {label/class:md5 list} dict for a given metadata category.

        Can fail if remove_missing_labels has not been ran before.
        """
        sorted_md5 = sorted(self.md5s)
        data = defaultdict(list)
        for md5 in sorted_md5:
            label = self[md5][label_category]
            data[label].append(md5)
        return data

    def remove_small_classes(
        self, min_class_size: int, label_category: str, verbose=True
    ):
        """Remove classes with less than min_class_size examples
        for a given metadata category.

        Returns string of class ratio left if verbose.
        """
        data = self.md5_per_class(label_category)
        nb_class = len(data)

        nb_removed_class = 0
        for label, size in self.label_counter(label_category).most_common():
            if size < min_class_size:
                nb_removed_class += 1
                for md5 in data[label]:
                    del self[md5]

        if verbose:
            remaining = nb_class - nb_removed_class
            ratio = f"{remaining}/{nb_class}"
            print(
                f"{ratio} labels left from {label_category} "
                f"after removing classes with less than {min_class_size} signals."
            )

    def _check_label_category(self, label_category: str):
        """Raise ValueError if label_category does not exist."""
        cats = self.get_categories()
        if label_category not in cats:
            ratios = []
            s1 = label_category
            for s2 in cats:
                ratios.append(SM(None, s1, s2).ratio())

            top5 = sorted(zip(cats, ratios), key=lambda x: x[1], reverse=True)[:5]
            top5 = [(label, f"{ratio:0.4f}") for label, ratio in top5]
            raise KeyError(
                f"Label category '{label_category}' not in categories. "
                f"Did you mean: {top5}"
            )

    def select_category_subsets(self, label_category: str, labels: Iterable[str]):
        """Select only datasets which possess the given labels
        for the given label category.

        Raises ValueError if label_category does not exist
        """
        if isinstance(labels, str):
            labels = [labels]
        self._check_label_category(label_category)
        filt = lambda item: item[1].get(label_category) in set(labels)
        self.apply_filter(filt)  # type: ignore

    def remove_category_subsets(self, label_category: str, labels: Iterable[str]):
        """Remove datasets which possess the given labels
        for the given label category.

        Raises ValueError if label_category does not exist
        """
        if isinstance(labels, str):
            labels = [labels]
        self._check_label_category(label_category)
        filt = lambda item: item[1].get(label_category) not in set(labels)
        self.apply_filter(filt)  # type: ignore

    def label_counter(self, label_category: str, verbose=True) -> Counter[str]:
        """Return a Counter() with label count from the given category.
        Ignores missing labels.
        """
        counter = Counter([dset.get(label_category) for dset in self.datasets])

        if verbose:
            print(f"{counter[None]} labels missing and ignored from count")
        del counter[None]

        return counter

    def unique_classes(self, label_category: str) -> List[str]:
        """Return sorted list of unique classes currently existing for the given category."""
        sorted_md5 = sorted(self.md5s)
        uniq = set()
        for md5 in sorted_md5:
            val = self[md5].get(label_category)

            # Everything should be a string, this was added because there was a bug with a nan object treated as a float
            if not isinstance(val, str):
                print(
                    f"md5: {md5} has non-string label of type {type(val)}: {val}",
                    file=sys.stderr,
                )
                raise ValueError(f"Non-string label for {label_category} at {md5}.")

            uniq.add(self[md5].get(label_category))
        uniq.discard(None)
        return sorted(list(uniq))

    def display_labels(self, label_category: str):
        """Print number of examples for each label in given category."""
        print(f"\nLabel breakdown for {label_category}")
        i = 0
        label_counter = self.label_counter(label_category)
        for label, count in label_counter.most_common():
            print(f"{label}: {count}")
            i += count
        print(f"For a total of {i} examples in {len(label_counter)} classes\n")

    def get_categories(self) -> list[str]:
        """Return a list of all metadata categories sorted by lowercase."""
        categories = {key for dset in self.datasets for key in dset.keys()}
        return sorted(categories, key=str.lower)

    def convert_classes(self, category: str, converter: Dict[str, str]):
        """Convert classes labels in the given category using the converter mapping.

        Can be used to merge classes.
        """
        for dataset in self.datasets:
            label = dataset.get(category, None)
            if label in converter:
                dataset[category] = converter[label]

    def save_marshal(self, path: Path | str) -> None:
        """Save the metadata to path, in marshal format. Only saves dataset information."""
        with open(path, "wb") as file:
            marshal.dump(self._metadata, file)

Subclasses

Static methods

def from_dict(metadata: Dict[str, dict], allow_non_md5sum_index: bool = False) ‑> Metadata

Creates an object from a dict conforming to {md5sum:dset} format.

def from_marshal(path: Path | str)

Load a metadata dict from a marshal file format.

Instance variables

prop datasets : ValuesView

Return a datasets view (like dict.values()).

Expand source code
@property
def datasets(self) -> ValuesView:
    """Return a datasets view (like dict.values())."""
    return self._metadata.values()
prop items : ItemsView

Return a (md5,datasets) view (like dict.items()).

Expand source code
@property
def items(self) -> ItemsView:
    """Return a (md5,datasets) view (like dict.items())."""
    return self._metadata.items()
prop md5s : KeysView

Return a md5s view (like dict.keys()).

Expand source code
@property
def md5s(self) -> KeysView:
    """Return a md5s view (like dict.keys())."""
    return self._metadata.keys()

Methods

def apply_filter(self, meta_filter=<function Metadata.<lambda>>)

Apply a filter on items (md5:dataset).

def convert_classes(self, category: str, converter: Dict[str, str])

Convert classes labels in the given category using the converter mapping.

Can be used to merge classes.

def display_labels(self, label_category: str)

Print number of examples for each label in given category.

def empty(self)

Remove all entries.

def get(self, md5, default=None)

Dict .get

def get_categories(self)

Return a list of all metadata categories sorted by lowercase.

def label_counter(self, label_category: str, verbose=True)

Return a Counter() with label count from the given category. Ignores missing labels.

def md5_per_class(self, label_category: str) ‑> Dict[str, List[str]]

Return {label/class:md5 list} dict for a given metadata category.

Can fail if remove_missing_labels has not been ran before.

def remove_category_subsets(self, label_category: str, labels: Iterable[str])

Remove datasets which possess the given labels for the given label category.

Raises ValueError if label_category does not exist

def remove_missing_labels(self, label_category: str)

Remove datasets where the metadata category is missing.

def remove_small_classes(self, min_class_size: int, label_category: str, verbose=True)

Remove classes with less than min_class_size examples for a given metadata category.

Returns string of class ratio left if verbose.

def save(self, path) ‑> None

Save the metadata to path, in original epigeec_json format.

def save_marshal(self, path: Path | str)

Save the metadata to path, in marshal format. Only saves dataset information.

def select_category_subsets(self, label_category: str, labels: Iterable[str])

Select only datasets which possess the given labels for the given label category.

Raises ValueError if label_category does not exist

def to_df(self)

Return a dataframe with one file per row. Index is md5sum.

def unique_classes(self, label_category: str) ‑> List[str]

Return sorted list of unique classes currently existing for the given category.

def update(self, info: Metadata) ‑> None

Dict .update equivalent. Info needs to respect {md5sum:dset} format.

class UUIDMetadata (path: Path)

Metadata class for UUID datasets, e.g. epiatlas.

Expand source code
class UUIDMetadata(Metadata):
    """Metadata class for UUID datasets, e.g. epiatlas."""

    @classmethod
    def from_dict(
        cls, metadata: Dict[str, dict], allow_non_md5sum_index: bool = False
    ) -> UUIDMetadata:
        """Creates an object from a dict conforming to {md5sum:dset} format."""
        first_key = list(metadata.keys())[0]
        if len(first_key) != 32:
            message = f"Incorrect format of metadata. Key need to be md5sum (len=32). First key is: {first_key}"
            if not allow_non_md5sum_index:
                raise ValueError(
                    f"Incorrect format of metadata. Key need to be md5sum (len=32). First key is: {first_key}"
                )
            print(message, file=sys.stderr)

        obj = cls.__new__(cls)
        obj._metadata = copy.deepcopy(metadata)
        obj._rest = {}
        return obj

    @classmethod
    def from_metadata(cls, metadata: Metadata) -> UUIDMetadata:
        """Create UUIDMetadata from Metadata."""
        meta = dict(metadata.items)
        return cls.from_dict(meta)

    @classmethod
    def from_marshal(cls, path: Path | str) -> UUIDMetadata:
        """Load a metadata dict from a marshal file format."""
        with open(path, "rb") as file:
            metadata_dict = marshal.load(file)

        first_key = list(metadata_dict.keys())[0]
        if len(first_key) != 32:
            raise ValueError(
                f"Incorrect format of metadata. Key need to be md5sum (len=32). Is: {first_key}"
            )

        obj = cls.__new__(cls)
        obj._metadata = metadata_dict
        obj._rest = {}
        return obj

    def __eq__(self, other):
        if isinstance(other, UUIDMetadata):
            return self._metadata == other._metadata and self._rest == other._rest
        return False

    def uuid_per_class(self, label_category: str) -> Dict[str, set[str]]:
        """Return {label/class:uuid list} dict for a given metadata category.

        Can fail if remove_missing_labels has not been ran before.
        """
        uuid_dict = defaultdict(set)
        for md5 in self._metadata:
            track_type = self._metadata[md5]["track_type"]
            label = self._metadata[md5][label_category]
            uuid = self._metadata[md5]["uuid"]

            # Special case for ctl_raw, same uuid as other tracks, but counts as unique experiment
            if track_type == "ctl_raw":
                uuid += "_ctl"

            uuid_dict[label].add(uuid)
        return uuid_dict

    def uuid_counter(self, label_category: str, verbose=True) -> Counter[str]:
        """Return a Counter() with uuid count from the given category.
        Ignores missing labels.
        """
        uuid_dict = self.uuid_per_class(label_category)
        uuid_counter = Counter({label: len(uuid_dict[label]) for label in uuid_dict})

        if verbose:
            print(f"{uuid_counter[None]} uuid missing and ignored from count")  # type: ignore
        del uuid_counter[None]

        return uuid_counter

    def display_uuid_per_class(self, label_category: str) -> None:
        """Display uuid_per_class for a given metadata category."""
        uuid_dict = self.uuid_per_class(label_category)
        uuid_counter = Counter({label: len(uuid_dict[label]) for label in uuid_dict})
        print(f"{label_category} label breakdown for unique experiments (uuid):")

        for label, c in uuid_counter.most_common():
            print(f"{label}: {c}")

        print(
            f"For {sum(uuid_counter.values())} unique experiments in {len(uuid_dict)} classes\n"
        )

    def uuid_to_md5(self) -> Dict[str, Dict[str, str]]:
        """Return uuid to {track_type:md5} mapping { uuid : {track_type1:md5sum, track_type2:md5sum, ...} }"""
        uuid_to_md5s = defaultdict(dict)
        for dset in self.datasets:
            uuid = dset["uuid"]
            uuid_to_md5s[uuid].update({dset["track_type"]: dset["md5sum"]})
        return uuid_to_md5s

    def remove_small_classes(
        self,
        min_class_size: int,
        label_category: str,
        verbose=True,
        using_uuid: bool = True,
    ):
        """Remove classes with less than min_class_size examples
        for a given metadata category.

        Counts unique uuids if using_uuid=True, else counts md5s.

        Returns string of class ratio left if verbose.
        """
        nb_class_init = len(self.unique_classes(label_category))

        if not using_uuid:
            md5_per_class = self.md5_per_class(label_category)
            for label, size in self.label_counter(label_category).most_common():
                if size < min_class_size:
                    for md5 in md5_per_class[label]:
                        del self[md5]
        else:
            uuid_to_md5s = self.uuid_to_md5()
            for label, uuids in self.uuid_per_class(label_category).items():
                if len(uuids) < min_class_size:
                    for uuid in uuids:
                        for md5 in uuid_to_md5s[uuid].values():
                            del self[md5]

        if verbose:
            remaining = len(self.unique_classes(label_category))
            ratio = f"{remaining}/{nb_class_init}"
            print(
                f"{ratio} labels left from {label_category} "
                f"after removing classes with less than {min_class_size} signals."
            )

Ancestors

Static methods

def from_metadata(metadata: Metadata) ‑> UUIDMetadata

Create UUIDMetadata from Metadata.

Methods

def display_uuid_per_class(self, label_category: str) ‑> None

Display uuid_per_class for a given metadata category.

def remove_small_classes(self, min_class_size: int, label_category: str, verbose=True, using_uuid: bool = True)

Remove classes with less than min_class_size examples for a given metadata category.

Counts unique uuids if using_uuid=True, else counts md5s.

Returns string of class ratio left if verbose.

def uuid_counter(self, label_category: str, verbose=True)

Return a Counter() with uuid count from the given category. Ignores missing labels.

def uuid_per_class(self, label_category: str)

Return {label/class:uuid list} dict for a given metadata category.

Can fail if remove_missing_labels has not been ran before.

def uuid_to_md5(self) ‑> Dict[str, Dict[str, str]]

Return uuid to {track_type:md5} mapping { uuid : {track_type1:md5sum, track_type2:md5sum, …} }

Inherited members