Module epiclass.core.estimators
Module for wrappers around simple sklearn machine learning estimators.
Functions
def best_params_cb(result)
-
BayesSearchCV callback
def get_model_name(filepath: str) ‑> str
-
Extract the model name from a filepath (the string before the first '_').
def init_lock(l)
-
Define a global lock
def log_tune_results(logdir: Path, name: str, opt: BayesSearchCV)
-
It takes the results of a parameter optimization run and saves them to a CSV file.
Args
logdir
:Path
- The directory where the results will be saved.
name
:str
- The name of the model.
opt
:BayesSearchCV
- Optimizer after tuning.
def optimize_estimator(ea_handler: EpiAtlasFoldFactory, logdir: Path, n_iter: int, name: str, concurrent_cv: int = 1)
-
Optimize the hyperparameters of the model identified by name on the given dataset, using the search space registered under the same name.
Args
ea_handler
:EpiAtlasFoldFactory
- Dataset splits creator.
logdir
:Path
- The directory where the results will be saved.
n_iter
:int
- Number of parameter settings to sample from the search space.
name
:str
- The name of the model we're tuning.
concurrent_cv
:int
- Number of full cross-validation processes (X folds) to run
in parallel. Defaults to 1.
def run_prediction(i: int, my_data: DataSet, estimator: Pipeline, name: str, logdir: Path, verbose=True, save_model=True)
-
It takes a dataset, fits the model on the training data, and then predicts on the validation data
Args
i
:int
- the index of the split
my_data
:DataSet
- DataSet
estimator
:Pipeline
- The model to use.
name
:str
- The name of the model.
logdir
:Path
- The directory where the results will be saved.
verbose
- Whether to print out the metrics. Defaults to True
def run_predictions(ea_handler: EpiAtlasFoldFactory, estimator: Pipeline, name: str, logdir: Path)
-
It will fit and run a prediction for each of the k-folds in the EpiAtlasFoldFactory object, using the estimator provided. Will use all available cpus.
Args
ea_handler
:EpiAtlasFoldFactory
- Dataset splits creator.
estimator
:Pipeline
- The model to use.
name
:str
- The name of the model.
logdir
:Path
- The directory where the results will be saved.
def tune_estimator(model: Pipeline, ea_handler: EpiAtlasFoldFactory, params: dict, n_iter: int, concurrent_cv: int, n_jobs: int | None = None)
-
Apply Bayesian optimization on model, over hyperparameters search space.
Args
model
:Pipeline
- The model to tune.
ea_handler
:EpiAtlasFoldFactory
- Dataset splits creator.
params
:dict
- Hyperparameters search space.
n_iter
:int
- Total number of parameter settings to sample.
concurrent_cv
:int
- Number of full cross-validation processes (X folds) to run
in parallel.
n_jobs
:int | None
- Number of jobs to run in parallel. Max NFOLD_TUNE * concurrent_cv.
Returns
A BayesSearchCV object
Classes
class EstimatorAnalyzer (classes: Collection[str], estimator)
-
Generic class to analyze results given by an estimator.
Expand source code
class EstimatorAnalyzer:
    """Generic class to analyze results given by an estimator.

    Wraps an estimator together with the sorted list of class labels, and
    offers helpers to predict, compute validation metrics, write prediction
    tables and save/restore the whole analyzer with pickle.
    """

    # Keyword arguments forwarded to LightGBM prediction calls.
    _LGBM_PREDICT_KWARGS = {
        "raw_score": False,
        "pred_leaf": False,
        "pred_contrib": False,
    }

    def __init__(self, classes: Collection[str], estimator):
        """Store the estimator and build the label mappings.

        Args:
            classes: Class labels; they are sorted, and the position in the
                sorted list is the integer label of each class.
            estimator: The estimator (or pipeline) to wrap.
        """
        self.classes = sorted(classes)
        # integer label -> class name
        self.mapping = dict(enumerate(self.classes))
        # Used to one-hot encode hard predictions when the estimator
        # exposes no predict_proba.
        self.encoder = LabelBinarizer().fit(list(self.mapping.keys()))
        self._clf = estimator
        self._name = self._get_name(estimator)

    @staticmethod
    def _get_name(estimator) -> str:
        """Return estimator model name.

        For an object whose type is named "Pipeline", the type name of its
        "model" step is returned instead.
        """
        name = type(estimator).__name__
        if name == "Pipeline":
            name = type(estimator.named_steps["model"]).__name__
        return name

    @property
    def name(self) -> str:
        """Return classifier name."""
        return self._get_name(self._clf)

    @property
    def classifier(self):
        """Return classifier instance."""
        return self._clf

    def _extra_predict_kwargs(self) -> dict:
        """Return the extra keyword arguments needed by the wrapped model."""
        if self.name == "LGBMClassifier":
            return dict(self._LGBM_PREDICT_KWARGS)
        return {}

    def _one_hot_predictions(self, X):
        """Return hard predictions encoded as a (n_samples, n_classes) array."""
        int_results = self._clf.predict(X)
        pred_results = self.encoder.transform(int_results)
        if pred_results.shape[1] == 1:  # 2 classes, e.g. sex
            # The encoder yields a single column for binary problems;
            # expand it to two columns. Returned as an array so that every
            # code path of predict_proba gives the same container type.
            pred_results = np.array(
                [[1, 0] if i == 0 else [0, 1] for i in int_results]
            )
        return pred_results

    def predict(self, X):
        """Return class predictions."""
        return self._clf.predict(X, **self._extra_predict_kwargs())

    def predict_proba(self, X):
        """Return class prediction probabilities.

        When the estimator exposes no ``predict_proba`` (e.g. SVM), hard
        predictions are one-hot encoded instead.

        Only the attribute lookup is guarded: an AttributeError raised while
        ``predict_proba`` is actually running is a real error and propagates,
        rather than being silently replaced by one-hot output.
        """
        try:
            proba_fn = self._clf.predict_proba
        except AttributeError:  # SVM
            return self._one_hot_predictions(X)
        return proba_fn(X, **self._extra_predict_kwargs())

    def metrics(self, X, y, verbose=True):
        """Return a dict of metrics over given set.

        Keys are "val_acc", "val_precision", "val_recall", "val_f1" and
        "val_mcc"; precision, recall and F1 are macro-averaged. The metrics
        are also printed when ``verbose`` is true.
        """
        y_pred = self.predict(X)
        y_true = y

        metrics_dict = {
            "val_acc": sklearn.metrics.accuracy_score(y_true, y_pred),
            "val_precision": sklearn.metrics.precision_score(
                y_true, y_pred, average="macro"
            ),
            "val_recall": sklearn.metrics.recall_score(
                y_true, y_pred, average="macro"
            ),
            "val_f1": sklearn.metrics.f1_score(y_true, y_pred, average="macro"),
            "val_mcc": sklearn.metrics.matthews_corrcoef(y_true, y_pred),
        }

        if verbose:
            EstimatorAnalyzer.print_metrics(metrics_dict)

        return metrics_dict

    @staticmethod
    def print_metrics(metrics_dict: dict):
        """Print metrics"""
        print(f"Validation Accuracy: {metrics_dict['val_acc']}")
        print(f"Validation Precision: {metrics_dict['val_precision']}")
        print(f"Validation Recall: {metrics_dict['val_recall']}")
        print(f"Validation F1_score: {metrics_dict['val_f1']}")
        print(f"Validation MCC: {metrics_dict['val_mcc']}")

    def predict_file(self, ids, X, y, log):
        """Write predictions table for validation set.

        ids: Sample identifier.
        X: Sample features
        y: Sample labels
        log: path where to save predictions
        """
        pred_results = self.predict_proba(X)

        # Most probable class per sample, converted back to its class name.
        str_preds = [
            self.mapping[encoded_label]
            for encoded_label in np.argmax(pred_results, axis=1)  # type: ignore
        ]
        str_y = [self.mapping[encoded_label] for encoded_label in y]

        write_pred_table(
            predictions=pred_results,
            str_preds=str_preds,
            str_targets=str_y,
            classes=self.classes,
            md5s=ids,
            path=log,
        )

    def save_model(self, logdir: Path, name=None):
        """Save model to pickle file.

        If a filename is given, it will be appended to model name.
        The current time is appended as well, so saves do not overwrite
        each other.
        """
        save_name = f"{self._name}"
        if name is not None:
            save_name += f"_{name}"
        time = str(time_now()).replace(" ", "_")
        save_name = logdir / f"{save_name}_{time}.pickle"

        print(f"Saving model to {save_name}")
        with open(save_name, "wb") as f:
            pickle.dump(self, f)

    @classmethod
    def restore_model_from_name(
        cls, logdir: str, auto_name: str
    ) -> "EstimatorAnalyzer":
        """Restore most recent EstimatorAnalyzer instance from a previous save.

        auto_name is the cli name of the model.

        Raises:
            ValueError: If auto_name is not a known cli model name, or if no
                saved model file matches in logdir.
        """
        if auto_name not in save_mapping:
            raise ValueError(f"Expected a cli model name (restricted). Gave: {auto_name}")

        name = save_mapping[auto_name]
        path = Path(logdir) / f"{name}*.pickle"
        list_of_files = glob.glob(str(path))
        try:
            # Most recent file; max() raises ValueError on an empty list.
            filepath = max(list_of_files, key=os.path.getctime)
        except ValueError:
            print(
                f"Did not find any model file following pattern {path}",
                file=sys.stderr,
            )
            raise

        # Go through cls so subclasses restore through their own loader.
        return cls.restore_model_from_path(filepath)

    @classmethod
    def restore_model_from_path(cls, full_path: str) -> "EstimatorAnalyzer":
        """Restore EstimatorAnalyzer instance from a previous pickle save."""
        print(f"Loading model {full_path}")
        # SECURITY: unpickling can execute arbitrary code; only load files
        # produced by save_model from a trusted location.
        with open(full_path, "rb") as f:
            return pickle.load(f)
Static methods
def print_metrics(metrics_dict: dict)
-
Print metrics
def restore_model_from_name(logdir: str, auto_name: str) ‑> EstimatorAnalyzer
-
Restore most recent EstimatorAnalyzer instance from a previous save.
auto_name is the cli name of the model.
def restore_model_from_path(full_path: str) ‑> EstimatorAnalyzer
-
Restore EstimatorAnalyzer instance from a previous pickle save.
Instance variables
prop classifier
-
Return classifier instance.
Expand source code
@property
def classifier(self):
    """Return classifier instance.

    Read-only accessor for the estimator stored in ``self._clf``.
    """
    return self._clf
prop name : str
-
Return classifier name.
Expand source code
@property
def name(self) -> str:
    """Return classifier name.

    Computed on each access by passing the wrapped estimator
    (``self._clf``) to ``self._get_name``.
    """
    return self._get_name(self._clf)
Methods
def metrics(self, X, y, verbose=True)
-
Return a dict of metrics over given set
def predict(self, X)
-
Return class predictions.
def predict_file(self, ids, X, y, log)
-
Write predictions table for validation set.
ids: Sample identifiers. X: Sample features. y: Sample labels. log: Path where to save predictions.
def predict_proba(self, X)
-
Return class prediction probabilities.
def save_model(self, logdir: Path, name=None)
-
Save model to pickle file. If a filename is given, it will be appended to model name.