Module epiclass.core.estimators

Module for wrappers around simple sklearn machine learning estimators.

Functions

def best_params_cb(result)

BayesSearchCV callback

def get_model_name(filepath: str) ‑> str

Extract the model name from a filepath (the part of the filename before the first '_').
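
For example (the filename below is hypothetical):

from epiclass.core.estimators import get_model_name

get_model_name("LinearSVC_split0.pickle")  # -> "LinearSVC"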

def init_lock(l)

Define a global lock
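
init_lock presumably follows the standard multiprocessing initializer pattern (assign the argument to a module-level global so each worker process can use it); a self-contained sketch of how such an initializer is wired into a pool:

import multiprocessing

def init_lock(l):
    # Assumed standard pattern: store the lock in a module-level global.
    global lock
    lock = l

def worker(i):
    # Hypothetical worker: the global lock guards shared output.
    with lock:
        print(f"task {i}")

if __name__ == "__main__":
    main_lock = multiprocessing.Lock()
    with multiprocessing.Pool(2, initializer=init_lock, initargs=(main_lock,)) as pool:
        pool.map(worker, range(4))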

def log_tune_results(logdir: Path, name: str, opt: BayesSearchCV)

Save the results of a parameter optimization run to a CSV file.

Args

logdir : Path
The directory where the results will be saved.
name : str
The name of the model.
opt : BayesSearchCV
Optimizer after tuning.
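
A minimal sketch of what this logging amounts to, assuming pandas; the helper name and filename format are hypothetical:

from pathlib import Path
import pandas as pd

def log_tune_results_sketch(logdir: Path, name: str, opt) -> None:
    # After fitting, BayesSearchCV exposes scikit-learn's cv_results_ dict:
    # one row per sampled parameter setting, with per-fold scores.
    pd.DataFrame(opt.cv_results_).to_csv(logdir / f"{name}_tune_results.csv", index=False)
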
def optimize_estimator(ea_handler: EpiAtlasFoldFactory, logdir: Path, n_iter: int, name: str, concurrent_cv: int = 1)

Optimize the model with the given name, using the hyperparameter search space of the same name and splits from the given dataset.

Args

ea_handler : EpiAtlasFoldFactory
Dataset splits creator.
logdir : Path
The directory where the results will be saved.
n_iter : int
Number of parameter settings to sample from the search space.
name : str
The name of the model to tune.
concurrent_cv : int
Number of full cross-validation processes (k folds each) to run in parallel. Defaults to 1.
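
A usage sketch; the model name, log directory, and ea_handler construction are placeholders:

from pathlib import Path

# `ea_handler` is assumed to be an already-built EpiAtlasFoldFactory.
optimize_estimator(
    ea_handler=ea_handler,
    logdir=Path("logs/tuning"),
    n_iter=50,
    name="LinearSVC",  # hypothetical model/search-space name
    concurrent_cv=2,
)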

def run_prediction(i: int, my_data: DataSet, estimator: Pipeline, name: str, logdir: Path, verbose=True, save_model=True)

Fit the model on the training data of one split, then predict on the validation data.

Args

i : int
The index of the split.
my_data : DataSet
The dataset split to fit and evaluate on.
estimator : Pipeline
The model to use.
name : str
The name of the model.
logdir : Path
The directory where the results will be saved.
verbose
Whether to print out the metrics. Defaults to True.
save_model
Whether to save the fitted model to logdir. Defaults to True.

def run_predictions(ea_handler: EpiAtlasFoldFactory, estimator: Pipeline, name: str, logdir: Path)

Fit and run a prediction for each of the k folds in the EpiAtlasFoldFactory object, using the provided estimator. Uses all available CPUs.

Args

ea_handler : EpiAtlasFoldFactory
Dataset splits creator.
estimator : Pipeline
The model to use.
name : str
The name of the model.
logdir : Path
The directory where the results will be saved.
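
A usage sketch; the pipeline below is illustrative, though the "model" step name matches what EstimatorAnalyzer._get_name expects (see the class source further down):

from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

estimator = Pipeline([("scaler", StandardScaler()), ("model", LinearSVC())])
run_predictions(ea_handler, estimator, name="LinearSVC", logdir=Path("logs/preds"))
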
def tune_estimator(model: Pipeline, ea_handler: EpiAtlasFoldFactory, params: dict, n_iter: int, concurrent_cv: int, n_jobs: int | None = None)

Apply Bayesian optimization on model, over hyperparameters search space.

Args

model : Pipeline
The model to tune.
ea_handler : EpiAtlasFoldFactory
Dataset splits creator.
params : dict
Hyperparameters search space.
n_iter : int
Total number of parameter settings to sample.
concurrent_cv : int
Number of full cross-validation processes (k folds each) to run in parallel.
n_jobs : int | None
Number of jobs to run in parallel. At most NFOLD_TUNE * concurrent_cv.

Returns

A fitted BayesSearchCV object.
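
For orientation, a sketch of the kind of scikit-optimize call this wraps; the cv split list and the n_points mapping for concurrent_cv are assumptions, not the module's confirmed implementation:

from skopt import BayesSearchCV

opt = BayesSearchCV(
    estimator=model,
    search_spaces=params,
    n_iter=n_iter,
    cv=cv_splits,            # assumed: (train_idx, valid_idx) pairs from ea_handler
    n_jobs=n_jobs,
    n_points=concurrent_cv,  # assumed mapping for concurrent evaluations
)
opt.fit(X, y, callback=[best_params_cb])  # best_params_cb defined in this module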

Classes

class EstimatorAnalyzer (classes: Collection[str], estimator)

Generic class to analyze results given by an estimator.

class EstimatorAnalyzer:
    """Generic class to analyze results given by an estimator."""

    def __init__(self, classes: Collection[str], estimator):
        self.classes = sorted(classes)
        self.mapping = dict(enumerate(self.classes))
        self.encoder = LabelBinarizer().fit(list(self.mapping.keys()))

        self._clf = estimator
        self._name = self._get_name(estimator)

    @staticmethod
    def _get_name(estimator) -> str:
        """Return estimator model name."""
        name = type(estimator).__name__
        if name == "Pipeline":
            name = type(estimator.named_steps["model"]).__name__
        return name

    @property
    def name(self) -> str:
        """Return classifier name."""
        return self._get_name(self._clf)

    @property
    def classifier(self):
        """Return classifier instance."""
        return self._clf

    def predict(self, X):
        """Return class predictions."""
        if self.name == "LGBMClassifier":
            pred_results = self._clf.predict(
                X, raw_score=False, pred_leaf=False, pred_contrib=False
            )
        else:
            pred_results = self._clf.predict(X)
        return pred_results

    def predict_proba(self, X):
        """Return class prediction probabilities."""
        try:
            if self.name == "LGBMClassifier":
                pred_results = self._clf.predict_proba(
                    X, raw_score=False, pred_leaf=False, pred_contrib=False
                )
            else:
                pred_results = self._clf.predict_proba(X)
        except AttributeError:  # SVM
            int_results = self._clf.predict(X)
            pred_results = self.encoder.transform(int_results)
            if pred_results.shape[1] == 1:  # 2 classes, e.g. sex
                pred_results = [[1, 0] if i == 0 else [0, 1] for i in int_results]

        return pred_results

    def metrics(self, X, y, verbose=True):
        """Return a dict of metrics over given set"""
        y_pred = self.predict(X)
        y_true = y

        val_acc = sklearn.metrics.accuracy_score(y_true, y_pred)
        val_precision = sklearn.metrics.precision_score(y_true, y_pred, average="macro")
        val_recall = sklearn.metrics.recall_score(y_true, y_pred, average="macro")
        val_f1 = sklearn.metrics.f1_score(y_true, y_pred, average="macro")
        val_mcc = sklearn.metrics.matthews_corrcoef(y_true, y_pred)

        metrics_dict = {
            "val_acc": val_acc,
            "val_precision": val_precision,
            "val_recall": val_recall,
            "val_f1": val_f1,
            "val_mcc": val_mcc,
        }

        if verbose:
            EstimatorAnalyzer.print_metrics(metrics_dict)

        return metrics_dict

    @staticmethod
    def print_metrics(metrics_dict: dict):
        """Print metrics"""
        print(f"Validation Accuracy: {metrics_dict['val_acc']}")
        print(f"Validation Precision: {metrics_dict['val_precision']}")
        print(f"Validation Recall: {metrics_dict['val_recall']}")
        print(f"Validation F1_score: {metrics_dict['val_f1']}")
        print(f"Validation MCC: {metrics_dict['val_mcc']}")

    def predict_file(self, ids, X, y, log):
        """Write predictions table for validation set.

        ids: Sample identifiers.
        X: Sample features.
        y: Sample labels.
        log: Path where to save predictions.
        """

        pred_results = self.predict_proba(X)

        str_preds = [
            self.mapping[encoded_label] for encoded_label in np.argmax(pred_results, axis=1)  # type: ignore
        ]

        str_y = [self.mapping[encoded_label] for encoded_label in y]

        write_pred_table(
            predictions=pred_results,
            str_preds=str_preds,
            str_targets=str_y,
            classes=self.classes,
            md5s=ids,
            path=log,
        )

    def save_model(self, logdir: Path, name=None):
        """Save model to pickle file. If a filename is given, it will be appended to model name."""
        save_name = f"{self._name}"
        if name is not None:
            save_name += f"_{name}"

        time = str(time_now()).replace(" ", "_")
        save_name = logdir / f"{save_name}_{time}.pickle"

        print(f"Saving model to {save_name}")
        with open(save_name, "wb") as f:
            pickle.dump(self, f)

    @classmethod
    def restore_model_from_name(cls, logdir: str, auto_name: str) -> EstimatorAnalyzer:
        """Restore most recent EstimatorAnalyzer instance from a previous save.

        auto_name is the CLI name of the model.
        """
        if auto_name not in save_mapping:
            raise ValueError(f"Expected a cli model name (restricted). Gave: {auto_name}")

        name = save_mapping[auto_name]
        path = Path(logdir) / f"{name}*.pickle"
        list_of_files = glob.glob(str(path))
        try:
            filepath = max(list_of_files, key=os.path.getctime)
        except ValueError as err:
            print(
                f"Did not find any model file following pattern {path}",
                file=sys.stderr,
            )
            raise err

        return EstimatorAnalyzer.restore_model_from_path(filepath)

    @classmethod
    def restore_model_from_path(cls, full_path: str) -> EstimatorAnalyzer:
        """Restore EstimatorAnalyzer instance from a previous pickle save."""
        print(f"Loading model {full_path}")
        with open(full_path, "rb") as f:
            return pickle.load(f)
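
A quick usage sketch with toy data (the classes and pipeline are placeholders):

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

X = np.random.rand(20, 5)
y = np.random.randint(0, 2, size=20)  # integer-encoded labels

pipe = Pipeline([("scaler", StandardScaler()), ("model", LinearSVC())]).fit(X, y)
analyzer = EstimatorAnalyzer(classes=["female", "male"], estimator=pipe)

analyzer.metrics(X, y)  # prints validation accuracy, precision, recall, F1, MCC
proba = analyzer.predict_proba(X)  # SVMs lack predict_proba; falls back to one-hot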

Static methods

def print_metrics(metrics_dict: dict)

Print metrics

def restore_model_from_name(logdir: str, auto_name: str) ‑> EstimatorAnalyzer

Restore most recent EstimatorAnalyzer instance from a previous save.

auto_name is the CLI name of the model.

def restore_model_from_path(full_path: str) ‑> EstimatorAnalyzer

Restore EstimatorAnalyzer instance from a previous pickle save.

Instance variables

prop classifier

Return classifier instance.

prop name : str

Return classifier name.


Methods

def metrics(self, X, y, verbose=True)

Return a dict of metrics over the given set.

def predict(self, X)

Return class predictions.

def predict_file(self, ids, X, y, log)

Write predictions table for validation set.

ids: Sample identifiers.
X: Sample features.
y: Sample labels.
log: Path where to save predictions.

def predict_proba(self, X)

Return class prediction probabilities.

def save_model(self, logdir: Path, name=None)

Save model to pickle file. If a filename is given, it will be appended to the model name.
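
Save/restore round trip, continuing the sketch above; the pickle path is hypothetical (save_model prints the actual one, which includes a timestamp):

from pathlib import Path

analyzer.save_model(Path("logs"), name="split0")
# -> e.g. logs/LinearSVC_split0_<timestamp>.pickle (printed on save)

restored = EstimatorAnalyzer.restore_model_from_path("logs/LinearSVC_split0_2024.pickle")
restored.metrics(X, y)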