Module epiclass.core.hdf5_loader
Module for handling HDF5 file loading.
Classes
class Hdf5Loader (chrom_file: Path | str, normalization: bool)
-
Handles loading/creating signals from hdf5 files
Source code
class Hdf5Loader:
    """Handles loading/creating signals from hdf5 files"""

    def __init__(self, chrom_file: Path | str, normalization: bool):
        self._normalization = normalization
        self._chroms = Hdf5Loader.load_chroms(chrom_file)
        self._files = {}
        self._signals = {}

    @property
    def loaded_files(self) -> Dict[str, Path]:
        """Return a {md5:path} dict with last loaded files."""
        return self._files

    @property
    def signals(self) -> Dict[str, np.ndarray]:
        """Return a {md5:signal} dict with the last loaded signals, where each
        signal has concatenated chromosomes and is normalized if normalization
        was requested.
        """
        return self._signals

    @staticmethod
    def load_chroms(chrom_file: Path | str) -> List[str]:
        """Return sorted chromosome names list."""
        with open(chrom_file, "r", encoding="utf-8") as file:
            chroms = []
            for line in file:
                line = line.rstrip()
                if line:
                    chroms.append(line.split()[0])
        chroms.sort()
        return chroms

    @staticmethod
    def read_list(data_file: Path, adapt: bool = False) -> Dict[str, Path]:
        """Return a {md5:file} dict from a file listing hdf5 paths."""
        with open(data_file, "r", encoding="utf-8") as file_of_paths:
            files = {}
            for path in file_of_paths:
                path = Path(path.rstrip())
                files[Hdf5Loader.extract_md5(path)] = path
        if adapt:
            files = Hdf5Loader.adapt_to_environment(files)
        return files

    def load_hdf5s(
        self,
        data_file: Path,
        md5s: List[str] | None = None,
        verbose=True,
        strict=False,
        hdf5_dir: Path | None = None,
    ) -> Hdf5Loader:
        """Load hdf5s from the path list file into self.signals.

        If a list of md5s is given, load only the corresponding files.
        Normalize if the internal flag is set.

        See adapt_to_environment for details on dynamic path changes.

        hdf5_dir is a directory in which to look for the hdf5s; it overrides
        the full paths given in data_file.

        If strict, raise OSError when an hdf5 cannot be opened.

        Signals are loaded as float32.
        """
        files = self.read_list(data_file)
        files = Hdf5Loader.adapt_to_environment(files)
        if hdf5_dir is not None:
            files = {md5: hdf5_dir / path.name for md5, path in files.items()}
        self._files = files

        # Remove undesired files
        if md5s is not None:
            chosen_md5s = set(md5s)
            # fmt: off
            files = {
                md5: path
                for md5, path in files.items()
                if md5 in chosen_md5s
            }
            # fmt: on

            absent_md5s = chosen_md5s - set(files.keys())
            if absent_md5s and verbose:
                print("Following given md5s are absent from the hdf5 list")
                for md5 in absent_md5s:
                    print(md5)

        # Load hdf5s and concatenate chroms into signals
        signals = {}
        for md5, file in files.items():
            # Trying to open hdf5 file.
            try:
                with h5py.File(file, "r") as f:
                    signals[md5] = self._normalize(self._read_hdf5(f, md5))
            except (OSError, FloatingPointError) as err:
                print(f"Error occurred with {md5}: {file}. {err}", file=sys.stderr)
                if strict:
                    print(
                        "Strict hdf5 loading policy true, raising original error.",
                        file=sys.stderr,
                    )
                    raise err from None
                continue

        self._signals = signals
        return self

    def _read_hdf5(self, file: h5py.File, md5: str) -> np.ndarray:
        """Read and return concatenated genome signal for open hdf5 file."""
        try:
            header = list(file.keys())[0]
        except IndexError as e:
            raise OSError(f"Header not found in {md5}") from e

        hdf5_data = file[header]
        chrom_signals = [hdf5_data[chrom][...] for chrom in self._chroms]  # type: ignore
        return np.concatenate(chrom_signals, dtype=np.float32)  # type: ignore

    def _normalize(self, array: np.ndarray) -> np.ndarray:
        """Normalize array if internal flag set so.

        If normalization is not set, return array as is.

        Raises:
            FloatingPointError: if operation fails
        """
        if self._normalization:
            with np.errstate(all="raise"):
                return (array - array.mean()) / array.std()
        return array

    @staticmethod
    def extract_md5(file_name: Path, verbose: bool = False) -> str:
        """Extract the md5 string from a file path with a specific naming convention.

        Expects the md5 to be the first part of the file name, separated by
        an underscore.

        If there is no md5sum, return the filename without extension.
        """
        md5 = file_name.name.split("_")[0]
        if len(md5) != 32:
            if verbose:
                print(
                    f"Warning: '{file_name}' does not begin with a md5sum.",
                    file=sys.stderr,
                )
            return file_name.stem
        return md5

    @staticmethod
    def adapt_to_environment(files: Dict[str, Path]) -> Dict[str, Path]:
        """Change file paths to point at copies on cluster scratch, if those exist.

        Checks for $SLURM_TMPDIR/$HDF5_PARENT, default is $SLURM_TMPDIR/hdf5s.

        files : {md5:path} dict.
        """
        local_tmp = Path(os.getenv("SLURM_TMPDIR", "./bleh"))
        local_tmp = local_tmp / os.getenv("HDF5_PARENT", "hdf5s")
        if local_tmp.exists():
            print(f"Using files in {local_tmp}")
            for md5, path in list(files.items()):
                files[md5] = local_tmp / Path(path).name
        return files
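A minimal usage sketch of the class, assuming normalization is wanted; the chromosome file and hdf5 list file names below are hypothetical placeholders.

from pathlib import Path
from epiclass.core.hdf5_loader import Hdf5Loader

# Hypothetical inputs: a chromosome name file and a file listing hdf5 paths.
loader = Hdf5Loader(chrom_file=Path("my_chroms.txt"), normalization=True)
loader.load_hdf5s(data_file=Path("hdf5_list.txt"))

# One concatenated, normalized float32 signal per successfully loaded file.
for md5, signal in loader.signals.items():
    print(md5, signal.shape, signal.dtype)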
Static methods
def adapt_to_environment(files: Dict[str, Path]) -> Dict[str, Path]
-
Change file paths to point at copies on cluster scratch, if those exist. Checks for $SLURM_TMPDIR/$HDF5_PARENT; the default is $SLURM_TMPDIR/hdf5s.
files : a {md5:path} dict.
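A hedged sketch of the scratch redirection behaviour; the environment values, md5 and file names are hypothetical.

import os
from pathlib import Path
from epiclass.core.hdf5_loader import Hdf5Loader

# Hypothetical setup: hdf5s were already copied to node-local scratch.
os.environ["SLURM_TMPDIR"] = "/localscratch/job_1234"  # hypothetical scratch dir
os.environ["HDF5_PARENT"] = "hdf5s"                    # default subdirectory name

files = {
    "d41d8cd98f00b204e9800998ecf8427e":  # hypothetical md5 key
    Path("/project/data/d41d8cd98f00b204e9800998ecf8427e_experiment.hdf5"),
}
files = Hdf5Loader.adapt_to_environment(files)
# If /localscratch/job_1234/hdf5s exists, each path now points inside it,
# keeping only the original file name; otherwise paths are left untouched.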
def extract_md5(file_name: Path, verbose: bool = False) -> str
-
Extract the md5 string from a file path with a specific naming convention.
Expects the md5 to be the first part of the file name, separated by an underscore.
If there is no md5sum, return the filename without extension.
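A short illustration of the naming convention; the file names are made up.

from pathlib import Path
from epiclass.core.hdf5_loader import Hdf5Loader

# 32-character md5 prefix followed by an underscore: the prefix is returned.
Hdf5Loader.extract_md5(Path("d41d8cd98f00b204e9800998ecf8427e_h3k4me3_100kb.hdf5"))
# "d41d8cd98f00b204e9800998ecf8427e"

# No md5 prefix: the stem (file name without extension) is returned instead.
Hdf5Loader.extract_md5(Path("some_signal.hdf5"))
# "some_signal"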
def load_chroms(chrom_file: Path | str) -> List[str]
-
Return sorted chromosome names list.
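The chromosome file is read line by line and only the first whitespace-separated field of each non-empty line is kept, so a chrom.sizes-style file works. A sketch with hypothetical content:

from epiclass.core.hdf5_loader import Hdf5Loader

# Hypothetical content of "my_chroms.txt":
# chrX    156040895
# chr1    248956422
# chr2    242193529
chroms = Hdf5Loader.load_chroms("my_chroms.txt")
# ["chr1", "chr2", "chrX"]  (sorted lexicographically, so e.g. chr10 sorts before chr2)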
def read_list(data_file: Path, adapt: bool = False) -> Dict[str, Path]
-
Return a {md5:file} dict from a file listing hdf5 paths.
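A sketch of the expected list file and the resulting dict; the paths and md5s are hypothetical.

from pathlib import Path
from epiclass.core.hdf5_loader import Hdf5Loader

# Hypothetical content of "hdf5_list.txt", one path per line:
# /project/data/d41d8cd98f00b204e9800998ecf8427e_h3k4me3_100kb.hdf5
# /project/data/9e107d9d372bb6826bd81d3542a419d6_h3k27ac_100kb.hdf5
files = Hdf5Loader.read_list(Path("hdf5_list.txt"))
# Keys are the md5 prefixes extracted with extract_md5, values the listed paths.
# With adapt=True, the paths are additionally passed through adapt_to_environment.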
Instance variables
prop loaded_files : Dict[str, Path]
-
Return a {md5:path} dict with last loaded files.
Source code
@property
def loaded_files(self) -> Dict[str, Path]:
    """Return a {md5:path} dict with last loaded files."""
    return self._files
prop signals : Dict[str, np.ndarray]
-
Return a {md5:signal} dict with the last loaded signals, where each signal has concatenated chromosomes and is normalized if normalization was requested.
Source code
@property
def signals(self) -> Dict[str, np.ndarray]:
    """Return a {md5:signal} dict with the last loaded signals, where each
    signal has concatenated chromosomes and is normalized if normalization
    was requested.
    """
    return self._signals
Methods
def load_hdf5s(self, data_file: Path, md5s: List[str] | None = None, verbose=True, strict=False, hdf5_dir: Path | None = None) -> Hdf5Loader
-
Load hdf5s from the path list file into self.signals. If a list of md5s is given, load only the corresponding files. Normalize if the internal flag is set.
See adapt_to_environment for details on dynamic path changes.
hdf5_dir is a directory in which to look for the hdf5s; it overrides the full paths given in data_file.
If strict, raise OSError when an hdf5 cannot be opened.
Signals are loaded as float32.
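An end-to-end sketch combining the md5 filter, the hdf5_dir override and strict loading; all paths and md5s below are hypothetical placeholders.

from pathlib import Path
from epiclass.core.hdf5_loader import Hdf5Loader

loader = Hdf5Loader(chrom_file="my_chroms.txt", normalization=True)  # hypothetical chrom file
loader.load_hdf5s(
    data_file=Path("hdf5_list.txt"),                    # hypothetical list of hdf5 paths
    md5s=["d41d8cd98f00b204e9800998ecf8427e"],          # keep only these entries
    strict=True,                                        # re-raise if an hdf5 cannot be opened
    hdf5_dir=Path("/fast/local/hdf5s"),                 # hypothetical directory overriding listed paths
)
signal = loader.signals["d41d8cd98f00b204e9800998ecf8427e"]  # concatenated float32 array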