Module epiclass.utils.extract_output_info
Extract specific information from what epiML prints.
Useful to get several results for the Excel sheet at once.
Functions
def main()
-
Miaw
def parse_arguments() -> argparse.Namespace
-
Argument parser for the command line.
Classes
class EpiMLOutputReader
-
Read epiML output files and extract their information.
Expand source code
class EpiMLOutputReader:
    """Read epiML output files and extract their information.

    Scans an epiML text log line by line; when a line's first word matches
    a known section token, the matching section parser extracts values into
    a flat dict (string -> string), retrievable via print_info().
    """

    # Tokens that mark the start of each log section.
    SIZE_TOKENS = frozenset(["training", "validation", "test"])
    EXAMPLES_TOKEN = "Examples"
    TRAINING_TOKEN = "epoch"
    METRICS_TOKENS = frozenset(["Training", "Validation", "Test"])
    HYPERPARAMS_TOKENS = frozenset(
        [
            "Nb",
            "Layers",
            "batch_size:",
            "early_stop_limit:",
            "is_training:",
            "keep_prob:",
            "l1_scale:",
            "l2_scale:",
            "learning_rate:",
            "measure_frequency:",
            "training_epochs:",
        ]
    )
    METRICS = frozenset(["Accuracy", "Precision", "Recall", "f1_score", "MCC"])

    def __init__(self):
        self._file = None  # file object currently being read
        self._info = {}  # field_name -> extracted string value
        self._hyperparams_fields = self._init_hyperparams_fields()
        self._tokens = self._init_tokens()
        self._current_line = ""  # line currently under inspection

    def _init_tokens(self):
        """Return set of all tokens."""
        return set([self.EXAMPLES_TOKEN, self.TRAINING_TOKEN]).union(
            self.SIZE_TOKENS, self.METRICS_TOKENS, self.HYPERPARAMS_TOKENS
        )

    def _init_hyperparams_fields(self):
        """Return token:field_name dict for hyperparameters."""
        fields = {token: token.strip(":") for token in self.HYPERPARAMS_TOKENS}
        # "Nb" and "Layers" tokens need explicit, descriptive field names.
        fields.update({"Nb": "nb_layers", "Layers": "layers_size"})
        return fields

    def print_info(self, fields=None):
        """Print info in order of given fields.

        Print all info with keys if no field is given. Fields absent from
        the extracted info are printed as "--".
        """
        if fields is None:
            for key, val in sorted(self._info.items()):
                print(f"{key} : {val}")
        else:
            infos = [self._info.get(field, "--") for field in fields]
            print("\t".join(infos))

    def read_file(self, file):
        """Read file and extract important information.

        Fix: the handle is now closed in a ``finally`` clause, so it no
        longer leaks if a section parser raises mid-file.
        """
        self._file = open(file, "r", encoding="utf-8")
        self._info = {}  # empty if another file was read before
        try:
            while True:
                try:
                    self._next_line()
                except StopIteration:
                    self._current_line = ""
                    break
                first_word = self._get_current_first_word()
                if first_word in self._tokens:
                    self._read_section(first_word)
        finally:
            self._file.close()

    def _get_current_first_word(self):
        """Return string before first split on a space."""
        return self._current_line.rstrip("\n").split(" ", 1)[0]

    def _next_line(self):
        """Advance to next file line. Raises StopIteration at EOF."""
        self._current_line = next(self._file)

    def _read_section(self, token):
        """Choose section reading method.

        Raises InvalidTokenError for a token that matches no handler
        (defensive: read_file only passes tokens from self._tokens).
        """
        if token in self.HYPERPARAMS_TOKENS:
            self._read_hyperparams()
        elif token in self.SIZE_TOKENS:
            self._read_set_size()
        elif token in self.METRICS_TOKENS:
            self._read_metrics()
        elif token == self.TRAINING_TOKEN:
            self._read_training()
        elif token == self.EXAMPLES_TOKEN:
            self._read_examples()
        else:
            raise InvalidTokenError(f"Invalid token: {token}")

    def _read_hyperparams(self):
        """Extract hyperparameters from multiple lines.

        Each "token: value" line stores the line's last word under the
        mapped field name. NOTE(review): the first non-matching line has
        already been consumed when this returns, so read_file's next
        iteration skips past it — the log format appears to rely on a
        separator line after this section; confirm against real logs.
        """
        while True:
            first_word = self._get_current_first_word()
            if first_word in self._hyperparams_fields:
                field_name = self._hyperparams_fields[first_word]
                field_info = self._current_line.strip("\n").split(" ")[-1]
                self._info[field_name] = field_info
                self._next_line()
            else:
                break

    def _read_set_size(self):
        """Extract set size from "[SetName] size [SetSize]" line."""
        dataset, word, size = self._current_line.strip("\n").split(" ")
        if word == "size":
            self._info[f"{dataset}_size"] = size
        else:
            raise InvalidTokenError(
                f"Not a set size section. Problematic token:{dataset}"
            )

    def _read_examples(self):
        """Extract the total number of examples from
        "For a total of [Nb] examples" line."""
        while True:
            self._next_line()
            if self._get_current_first_word() == "For":
                # Word index 4 is [Nb] in "For a total of [Nb] examples".
                self._info["nb_examples"] = self._current_line.strip("\n").split(" ")[4]
                break

    def _read_training(self):
        """Extract last epoch number and date from
        "epoch [Nb], batch training accuracy [float],
        validation accuracy [float] [timestamp]" lines.

        Also extract training time if present just after.
        """
        # Second-to-last space-separated token of the first epoch line is
        # the date part of the trailing "[date] [time]" timestamp.
        self._info["date"] = self._current_line.split(" ")[-2]
        last_epoch = ""
        while True:
            self._next_line()
            # Assumes every line here has at least three space-separated
            # words; a malformed line raises ValueError (format contract).
            first_word, epoch, _ = self._current_line.split(" ", 2)
            if first_word == "epoch":
                last_epoch = epoch
            else:
                self._info["last_epoch"] = last_epoch.strip(",")
                break
        if self._is_training_time_line():
            self._read_training_time()
        else:
            print("Training time not present after training. Continuing")

    def _is_training_time_line(self):
        """Return boolean based on if the line gives the training time.

        Fix: no longer raises ValueError on lines with fewer than two
        space-separated words (e.g. a blank line after the epoch block).
        """
        words = self._current_line.split(" ")
        return len(words) >= 2 and words[0] == "training" and words[1] == "time:"

    def _read_training_time(self):
        """Extract training time from "training time: [timedelta]" line.

        Fix: the pattern now requires digits (the original ``\\w`` also
        matched letters) and drops the fragile trailing ``.`` which made
        the match silently fail for times without fractional seconds.
        """
        match = re.search(r"(\d{1,2}):(\d{1,2}):(\d{2})", self._current_line)
        if match:
            self._info["training_time"] = "{}h{}m{}s".format(*match.groups())

    def _read_metrics(self):
        """Extract metrics from multiple lines.

        Stores "[dataset]_[metric]" (lowercased) for each "Metric: value"
        line. NOTE(review): when another metrics section follows directly,
        this recurses, then the outer loop still advances a line — relies
        on the recursive call leaving a non-metric line current; confirm
        against real logs.
        """
        dataset = self._get_current_first_word()
        while True:
            self._next_line()
            line = self._current_line.strip("\n").split(":")
            first_word = line[0]
            if first_word in self.METRICS:
                field_name = f"{dataset}_{first_word}".lower()
                self._info[field_name] = line[1].strip(" ")
            # another metrics section
            elif self._get_current_first_word() in self.METRICS_TOKENS:
                self._read_metrics()
            else:
                break
Class variables
var EXAMPLES_TOKEN
var HYPERPARAMS_TOKENS
var METRICS
var METRICS_TOKENS
var SIZE_TOKENS
var TRAINING_TOKEN
Methods
def print_info(self, fields=None)
-
Print info in order of given fields. Print all info with keys if no field is given.
def read_file(self, file)
-
Read file and extract important information.
class InvalidTokenError (*args, **kwargs)
-
Raised when the token is not valid.
Expand source code
class InvalidTokenError(Exception):
    """Signals that a parsed token does not match any known section."""
Ancestors
- builtins.Exception
- builtins.BaseException