Module epiclass.core.epiatlas_treatment
Functions to split epiatlas datasets properly, keeping track types together in the different sets.
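A minimal end-to-end sketch of the intended workflow, assuming an EpiDataSource has already been constructed elsewhere and that "assay" is a valid label category in the metadata (both are assumptions for illustration):

```python
from epiclass.core.epiatlas_treatment import EpiAtlasFoldFactory

# 'datasource' is an assumed, already-built EpiDataSource (hdf5 list,
# chromosome sizes and metadata files); its construction is not part
# of this module.
ea_handler = EpiAtlasFoldFactory.from_datasource(
    datasource,
    label_category="assay",  # assumed label category
    n_fold=10,
    test_ratio=0.1,  # hold out ~10% of samples for final testing
)

for fold in ea_handler.yield_split(oversample=True):
    # Attribute names assumed from the data.DataSet constructor keywords.
    train, valid = fold.training, fold.validation
    # ... fit and evaluate a model on this fold ...
```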
Classes
class EpiAtlasDataset (datasource: EpiDataSource, label_category: str, label_list: List[str] | None = None, min_class_size: int = 10, md5_list: List[str] | None = None, force_filter: bool = True, metadata: UUIDMetadata | None = None)
Class that handles how epiatlas data signals are linked together.
Parameters

datasource : EpiDataSource
    Where everything is read from.
label_category : str
    The target category of labels to use.
label_list : List[str], optional
    List of labels/classes to include from the given category.
min_class_size : int, optional
    Minimum number of samples per class.
md5_list : List[str], optional
    List of datasource md5s to include in the dataset. If None, everything is
    used and the usual filters (min_class_size and label_list) are applied.
force_filter : bool, optional
    If True, filter the metadata even when md5_list is given. If False, skip
    the filtering when md5_list is given.
metadata : UUIDMetadata, optional
    If given, use this metadata instead of loading it from the datasource.
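A short sketch of how md5_list and force_filter interact, assuming datasource is a valid EpiDataSource; the md5 strings are placeholders:

```python
# Default behaviour: keep everything, then apply the usual
# label_list / min_class_size filters.
full_dataset = EpiAtlasDataset(datasource, label_category="assay")

# Explicit md5 subset with force_filter=False: the subset is taken
# as-is and the usual filters are skipped. A KeyError is raised if an
# md5 is absent from the metadata.
subset = EpiAtlasDataset(
    datasource,
    label_category="assay",
    md5_list=["<md5-placeholder-1>", "<md5-placeholder-2>"],
    force_filter=False,
)
```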
```python
class EpiAtlasDataset:
    """Class that handles how epiatlas data signals are linked together.

    Parameters
    ----------
    datasource : EpiDataSource
        Where everything is read from.
    label_category : str
        The target category of labels to use.
    label_list : List[str], optional
        List of labels/classes to include from given category
    min_class_size : int, optional
        Minimum number of samples per class.
    md5_list : List[str], optional
        List of datasource md5s to include in the dataset. If None, everything
        is used and usual filter methods are used. (using min_class_size and label_list)
    force_filter : bool, optional
        If True, will filter the metadata even if md5_list is given.
        If False, will not filter the metadata if md5_list.
    metadata : UUIDMetadata, optional
        If given, will use this metadata instead of loading it from the datasource.
    """

    def __init__(
        self,
        datasource: EpiDataSource,
        label_category: str,
        label_list: List[str] | None = None,
        min_class_size: int = 10,
        md5_list: List[str] | None = None,
        force_filter: bool = True,
        metadata: UUIDMetadata | None = None,
    ):
        self._datasource = datasource
        self._label_category = label_category
        self._label_list = label_list

        # Load metadata
        meta = metadata
        if meta is None:
            meta = UUIDMetadata(self._datasource.metadata_file)

        if md5_list:
            try:
                meta = UUIDMetadata.from_dict({md5: meta[md5] for md5 in md5_list})
            except KeyError as e:
                raise KeyError(f"md5 {e} from md5 list not found in metadata") from e

        if force_filter or not md5_list:
            meta = self._filter_metadata(min_class_size, meta, verbose=True)

        self._metadata = meta

        # Classes info
        self._classes = self._metadata.unique_classes(self._label_category)
        self._classes_mapping = {label: i for i, label in enumerate(self._classes)}

        # UUID info
        self._metadata.display_uuid_per_class(self._label_category)
        self._uuid_mapping = self._metadata.uuid_to_md5()

        # Load signals and create proper dataset
        self._signals = self._load_signals()
        md5s = list(self._signals.keys())
        labels = [self._metadata[md5][self._label_category] for md5 in md5s]
        self._dataset: data.KnownData = data.KnownData(
            ids=md5s,
            x=list(self._signals.values()),
            y_str=labels,
            y=[self._classes_mapping[label] for label in labels],
            metadata=self._metadata,
        )

    @property
    def datasource(self) -> EpiDataSource:
        """Return given datasource."""
        return self._datasource

    @property
    def target_category(self) -> str:
        """Return given label category (e.g. assay)"""
        return self._label_category

    @property
    def label_list(self) -> List[str] | None:
        """Return given target labels inclusion list."""
        return self._label_list

    @property
    def classes(self) -> List[str]:
        """Return target classes"""
        return self._classes

    @property
    def metadata(self) -> UUIDMetadata:
        """Return a copy of current metadata held"""
        return copy.deepcopy(self._metadata)

    @property
    def signals(self) -> Dict[str, np.ndarray]:
        """Return loaded signals."""
        return self._signals

    @property
    def dataset(self) -> data.KnownData:
        """Return dataset."""
        return self._dataset

    def _load_signals(self) -> Dict[str, np.ndarray]:
        """Load signals from given datasource."""
        loader = Hdf5Loader(chrom_file=self.datasource.chromsize_file, normalization=True)
        loader = loader.load_hdf5s(
            data_file=self.datasource.hdf5_file,
            md5s=self.metadata.md5s,
            strict=True,
            verbose=True,
        )
        return loader.signals

    def _filter_metadata(
        self, min_class_size: int, metadata: UUIDMetadata, verbose: bool
    ) -> UUIDMetadata:
        """Filter entry metadata for given files, assay list and label_category."""
        files = Hdf5Loader.read_list(self.datasource.hdf5_file)

        # Remove metadata not associated with files
        metadata.apply_filter(lambda item: item[0] in files)

        metadata.remove_missing_labels(self.target_category)
        if self.label_list is not None:
            metadata.select_category_subsets(self.target_category, self.label_list)
        metadata.remove_small_classes(
            min_class_size, self.target_category, verbose, using_uuid=True
        )
        return metadata
```
Subclasses

EpiAtlasMetadata
Instance variables
prop classes : List[str]
    Return target classes.

prop dataset : data.KnownData
    Return dataset.

prop datasource : EpiDataSource
    Return given datasource.

prop label_list : List[str] | None
    Return given target labels inclusion list.

prop metadata : UUIDMetadata
    Return a deep copy of the currently held metadata.

prop signals : Dict[str, np.ndarray]
    Return loaded signals.

prop target_category : str
    Return given label category (e.g. assay).
class EpiAtlasFoldFactory (epiatlas_dataset: EpiAtlasDataset, n_fold: int = 10, test_ratio: float = 0)
Class that handles how epiatlas data is split into training, validation, and testing sets.
Parameters

epiatlas_dataset : EpiAtlasDataset
    Source container for epiatlas data.
n_fold : int, optional
    Number of folds for cross-validation.
test_ratio : float, optional
    Ratio of data kept for the test set (not used for training or validation).
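Note how test_ratio is consumed internally (see _reserve_test in the source below): the test set is one fold of a stratified split into `int(1 / test_ratio)` folds, so ratios not of the form 1/k are effectively rounded. A quick illustration:

```python
# Mapping from test_ratio to the number of internal splits; the held-out
# test set is the validation part of the first split.
for test_ratio in (0.1, 0.2, 0.25, 0.3):
    print(test_ratio, "->", int(1 / test_ratio), "splits")
# 0.1 -> 10, 0.2 -> 5, 0.25 -> 4, 0.3 -> 3 (i.e. ~33% held out, not 30%)
```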
```python
class EpiAtlasFoldFactory:
    """Class that handles how epiatlas data is split into training,
    validation, and testing sets.

    Parameters
    ----------
    epiatlas_dataset : EpiAtlasDataset
        Source container for epiatlas data.
    n_fold : int, optional
        Number of folds for cross-validation.
    test_ratio : float, optional
        Ratio of data kept for test (not used for training or validation)
    """

    def __init__(
        self,
        epiatlas_dataset: EpiAtlasDataset,
        n_fold: int = 10,
        test_ratio: float = 0,
    ):
        self.k = n_fold
        if n_fold < 2:
            raise ValueError(
                f"Need at least two folds for cross-validation. Got {n_fold}."
            )

        self.test_ratio = test_ratio
        if test_ratio < 0 or test_ratio > 1:
            raise ValueError(f"test_ratio must be between 0 and 1. Got {test_ratio}.")

        self._epiatlas_dataset = epiatlas_dataset
        self._classes = self._epiatlas_dataset.classes

        self._train_val, self._test = self._reserve_test()
        if len(self._train_val) == 0:
            raise ValueError("No data in training and validation.")

    @classmethod
    def from_datasource(
        cls,
        datasource: EpiDataSource,
        label_category: str,
        label_list: List[str] | None = None,
        min_class_size: int = 10,
        test_ratio: float = 0,
        n_fold: int = 10,
        md5_list: List[str] | None = None,
        force_filter: bool = True,
        metadata: UUIDMetadata | None = None,
    ):
        """Create EpiAtlasFoldFactory from a given EpiDataSource, directly
        create the intermediary EpiAtlasDataset.

        See EpiAtlasDataset init parameters for more details.
        """
        epiatlas_dataset = EpiAtlasDataset(
            datasource,
            label_category,
            label_list,
            min_class_size,
            md5_list,
            force_filter,
            metadata,
        )
        return cls(epiatlas_dataset, n_fold, test_ratio)

    @property
    def n_fold(self) -> int:
        """Returns expected number of folds."""
        return self.k

    @property
    def epiatlas_dataset(self) -> EpiAtlasDataset:
        """Returns source EpiAtlasDataset."""
        return self._epiatlas_dataset

    @property
    def classes(self) -> List[str]:
        """Returns classes."""
        return self._classes

    @property
    def train_val_dset(self) -> data.KnownData:
        """Returns training dataset for cross-validation."""
        return self._train_val

    @property
    def test_dset(self) -> data.KnownData:
        """Returns test dataset, not used in cross-validation."""
        return self._test

    @staticmethod
    def _label_uuid(dset: data.KnownData) -> Tuple[NDArray, NDArray, NDArrayInt]:
        """Return uuids, unique uuids and uuid to int mapping
        (for stratified group k-fold)

        Args:
            dset (data.KnownData): The dataset from which the UUIDs are to be extracted.

        Returns:
            Tuple[np.ndarray, np.ndarray, np.ndarray]:
                - uuids (np.ndarray): All the UUIDs for the dataset's samples. Length n.
                - unique_uuids (np.ndarray): Unique UUIDs present in the dataset.
                - uuid_to_int (np.ndarray): The indices to reconstruct the original
                  array from the unique array. Length n.
        """
        uuids = [dset.metadata[md5]["uuid"] for md5 in dset.ids]
        unique_uuids, uuid_to_int = np.unique(uuids, return_inverse=True)  # type: ignore
        return np.array(uuids), unique_uuids, uuid_to_int

    def _reserve_test(self) -> Tuple[data.KnownData, data.KnownData]:
        """Return training data from cross-validation and test data for final evaluation."""
        dset = self._epiatlas_dataset.dataset
        if self.test_ratio == 0:
            return dset, data.KnownData.empty_collection()

        n_splits = int(1 / self.test_ratio)
        if self.epiatlas_dataset.target_category == "track_type":
            train_val, test = next(self._split_by_track_type(dset, n_splits))
        else:
            train_val, test = next(self._split_dataset(dset, n_splits, oversample=False))

        return train_val, test

    def _split_by_track_type(
        self, dset: data.KnownData, n_splits: int
    ) -> Generator[Tuple[data.KnownData, data.KnownData], None, None]:
        """Split dataset by track_type. Oversampling not implemented."""
        _, _, uuids_inverse = self._label_uuid(dset)

        # forcing track type as the class label
        labels = [dset.metadata[md5]["track_type"] for md5 in dset.ids]

        skf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
        for train_idxs, valid_idxs in skf.split(
            X=dset.signals, y=labels, groups=uuids_inverse
        ):
            train_set = dset.subsample(list(train_idxs))
            valid_set = dset.subsample(list(valid_idxs))
            yield train_set, valid_set

    def _split_dataset(
        self, dset: data.KnownData, n_splits: int, oversample: bool = False
    ) -> Generator[Tuple[data.KnownData, data.KnownData], None, None]:
        # Convert the labels and groups (uuids) into numpy arrays
        uuids, uuids_unique, uuids_inverse = self._label_uuid(dset)
        labels_unique = [
            dset.encoded_labels[uuids == uuid][0] for uuid in uuids_unique
        ]  # assuming all samples from the same UUID share the same label --> not true for track_type

        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
        for train_idxs_unique, valid_idxs_unique in skf.split(
            X=np.empty(shape=(len(uuids_unique), dset.signals.shape[1])),
            y=labels_unique,
        ):
            train_idxs: NDArrayInt = np.concatenate(
                [np.where(uuids_inverse == idx)[0] for idx in train_idxs_unique]
            )
            valid_idxs: NDArrayInt = np.concatenate(
                [np.where(uuids_inverse == idx)[0] for idx in valid_idxs_unique]
            )

            if oversample:
                # Oversample in the UUID space, not the sample space
                ros = RandomOverSampler(random_state=42)
                train_uuids_resampled, _ = ros.fit_resample(  # type: ignore
                    np.array(uuids_unique[train_idxs_unique]).reshape(-1, 1),
                    np.array(labels_unique)[train_idxs_unique],
                )
                # map back to the sample space
                train_idxs: NDArrayInt = np.concatenate(
                    [
                        np.where(uuids == uuid)[0]
                        for uuid in train_uuids_resampled.flatten()  # type: ignore
                    ]
                )

            train_set = dset.subsample(list(train_idxs))
            valid_set = dset.subsample(list(valid_idxs))
            yield train_set, valid_set

    def yield_split(self, oversample: bool = True) -> Generator[data.DataSet, None, None]:
        """Yield train and valid tensor datasets for one split.

        Depends on given init parameters.
        """
        dset = self._train_val
        if self.epiatlas_dataset.target_category == "track_type":
            generator = self._split_by_track_type(dset, self.k)
        else:
            generator = self._split_dataset(dset, self.k, oversample=oversample)

        for train_set, valid_set in generator:
            yield data.DataSet(
                training=train_set,
                validation=valid_set,
                test=data.KnownData.empty_collection(),
                sorted_classes=self.classes,
            )

    def create_total_data(self, oversample: bool = True) -> data.KnownData:
        """Create a single dataset from the training and validation data.

        Will not oversample properly if all samples from the same UUID do not share target label.
        Used for final training, with no validation.
        """
        train_set = self._train_val

        # Convert the labels and groups (uuids) into numpy arrays
        uuids, uuids_unique, uuids_inverse = self._label_uuid(train_set)
        labels_unique = [
            train_set.encoded_labels[uuids == uuid][0] for uuid in uuids_unique
        ]  # assuming all samples from the same UUID share the same label --> not true for track_type

        if oversample:
            # Oversample in the UUID space, not the sample space
            ros = RandomOverSampler(random_state=42)
            resampled_uuid_idxs, _ = ros.fit_resample(  # type: ignore
                np.array(range(len(uuids_unique))).reshape(-1, 1),
                np.array(labels_unique),
            )
            resampled_uuid_idxs = resampled_uuid_idxs.flatten()  # type: ignore

            # Map back to the sample space
            train_idxs = np.concatenate(
                [np.where(uuids_inverse == idx)[0] for idx in resampled_uuid_idxs]
            )
            train_set = train_set.subsample(list(train_idxs))

        return train_set

    # TODO: needed for tune_estimator
    # def split(
    #     self,
    #     total_data: data.KnownData,
    #     X=None,
    #     y=None,
    #     groups=None,
    # ) -> Generator[tuple[List, List], None, None]:
    #     """Generate indices to split total data into training and validation set.
    #     Indexes match positions in output of create_total_data()
    #     X, y and groups :
    #         Always ignored, exist for compatibility.
    #     """
    #     md5_mapping = {md5: i for i, md5 in enumerate(total_data.ids)}
    #     raw_dset = self.epiatlas_dataset.raw_dataset
    #     skf = StratifiedKFold(n_splits=self.k, shuffle=False)
    #     for train_idxs, valid_idxs in skf.split(
    #         np.zeros((raw_dset.train.num_examples, len(self.classes))),
    #         list(raw_dset.train.encoded_labels),
    #     ):
    #         # The "complete" refers to the fact that the indexes are sampling over total data.
    #         complete_train_idxs = self._find_other_tracks(
    #             train_idxs, self._raw_dset.train, resample=True, md5_mapping=md5_mapping  # type: ignore
    #         )
    #         complete_valid_idxs = self._find_other_tracks(
    #             valid_idxs, self._raw_dset.train, resample=False, md5_mapping=md5_mapping  # type: ignore
    #         )
    #         yield complete_train_idxs, complete_valid_idxs
```
Static methods
def from_datasource(datasource: EpiDataSource, label_category: str, label_list: List[str] | None = None, min_class_size: int = 10, test_ratio: float = 0, n_fold: int = 10, md5_list: List[str] | None = None, force_filter: bool = True, metadata: UUIDMetadata | None = None)
    Create EpiAtlasFoldFactory from a given EpiDataSource, directly creating the intermediary EpiAtlasDataset. See EpiAtlasDataset init parameters for more details.
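For illustration, the two construction paths below should be equivalent, assuming `ds` is a valid EpiDataSource; from_datasource only saves the intermediary step:

```python
ea1 = EpiAtlasFoldFactory.from_datasource(ds, label_category="assay", n_fold=5)
ea2 = EpiAtlasFoldFactory(
    EpiAtlasDataset(ds, label_category="assay"), n_fold=5
)
```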
Instance variables
prop classes : List[str]
    Returns classes.

prop epiatlas_dataset : EpiAtlasDataset
    Returns source EpiAtlasDataset.

prop n_fold : int
    Returns expected number of folds.

prop test_dset : data.KnownData
    Returns test dataset, not used in cross-validation.

prop train_val_dset : data.KnownData
    Returns training dataset for cross-validation.
Methods
def create_total_data(self, oversample: bool = True) -> KnownData
    Create a single dataset from the training and validation data.
    Will not oversample properly if samples sharing a UUID do not all have the same target label.
    Used for final training, with no validation.
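A hedged usage sketch, continuing the module-level example (`ea_handler`); the model object is hypothetical:

```python
# Merge all non-test data for one final fit, once hyperparameters are
# fixed by cross-validation. Oversampling balances classes in UUID space.
total_data = ea_handler.create_total_data(oversample=True)
# final_model.fit(total_data.signals, total_data.encoded_labels)  # hypothetical model
```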
def yield_split(self, oversample: bool = True) -> Generator[DataSet, None, None]
    Yield train and valid tensor datasets for one split. Depends on the given init parameters.
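A short iteration sketch, again using the assumed `ea_handler`; samples sharing a UUID never straddle the train/validation boundary of a fold:

```python
for i, fold in enumerate(ea_handler.yield_split(oversample=True)):
    # Attribute names assumed from the data.DataSet constructor keywords.
    print(f"fold {i}: {len(fold.training)} train / {len(fold.validation)} valid")
```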
class EpiAtlasMetadata (datasource: EpiDataSource, label_category: str, label_list: List[str] | None = None, min_class_size: int = 10, md5_list: List[str] | None = None, force_filter: bool = True, metadata: UUIDMetadata | None = None)
Class that handles how epiatlas data ids are linked together.
Parameters

datasource : EpiDataSource
    Where everything is read from.
label_category : str
    The target category of labels to use.
label_list : List[str], optional
    List of labels/classes to include from the given category.
min_class_size : int, optional
    Minimum number of samples per class.
md5_list : List[str], optional
    List of datasource md5s to include in the dataset. If None, everything is
    used and the usual filters (min_class_size and label_list) are applied.
force_filter : bool, optional
    If True, filter the metadata even when md5_list is given. If False, skip
    the filtering when md5_list is given.
metadata : UUIDMetadata, optional
    If given, use this metadata instead of loading it from the datasource.
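Because _load_signals is overridden to return empty arrays (see the source below), this subclass supports metadata-only workflows without reading any hdf5 signal. A small sketch, with the same assumptions as the earlier examples:

```python
meta_only = EpiAtlasMetadata(datasource, label_category="assay")
print(meta_only.classes)       # class labels are fully available
print(len(meta_only.dataset))  # ids and labels are available
# meta_only.signals maps each md5 to an empty np.ndarray(0) placeholder
```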
```python
class EpiAtlasMetadata(EpiAtlasDataset):
    """Class that handles how epiatlas data ids are linked together.

    Parameters
    ----------
    datasource : EpiDataSource
        Where everything is read from.
    label_category : str
        The target category of labels to use.
    label_list : List[str], optional
        List of labels/classes to include from given category
    min_class_size : int, optional
        Minimum number of samples per class.
    md5_list : List[str], optional
        List of datasource md5s to include in the dataset. If None, everything
        is used and usual filter methods are used. (using min_class_size and label_list)
    force_filter : bool, optional
        If True, will filter the metadata even if md5_list is given.
        If False, will not filter the metadata if md5_list.
    metadata : UUIDMetadata, optional
        If given, will use this metadata instead of loading it from the datasource.
    """

    def _load_signals(self) -> Dict[str, np.ndarray]:
        """Load empty signals as no signals are needed for metadata."""
        return {md5: np.ndarray(0) for md5 in self._metadata.md5s}
```
Ancestors

EpiAtlasDataset
Inherited members

EpiAtlasDataset: classes, dataset, datasource, label_list, metadata, signals, target_category