From 737241ac7d3252da5cd2bbd3608a0eca71d2b79a Mon Sep 17 00:00:00 2001
From: Alcoholrithm
Date: Sun, 31 Mar 2024 04:15:19 +0900
Subject: [PATCH] Fix a bug in XGBPipeLine of the benchmark code and refactor the tests to use the benchmark files

---
 .gitignore                          |   2 +-
 benchmark/pipelines/xgb_pipeline.py |  15 +-
 test/abalone.py                     |  33 --
 test/diabetes.py                    |  24 --
 test/misc.py                        |  21 ++
 test/test_scarf.py                  | 470 ++------------------------
 test/test_subtab.py                 | 492 ++--------------------------
 test/test_vime.py                   | 482 ++------------------------
 8 files changed, 105 insertions(+), 1434 deletions(-)
 delete mode 100644 test/abalone.py
 delete mode 100644 test/diabetes.py
 create mode 100644 test/misc.py

diff --git a/.gitignore b/.gitignore
index ef58079..c2689cf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,4 @@ temporary_ckpt_data/
 build/
 dist/
 .pytest_cache/
-benchmark/benchmark_ckpt/
+*benchmark_ckpt/
diff --git a/benchmark/pipelines/xgb_pipeline.py b/benchmark/pipelines/xgb_pipeline.py
index 13bb253..a7ef3c5 100644
--- a/benchmark/pipelines/xgb_pipeline.py
+++ b/benchmark/pipelines/xgb_pipeline.py
@@ -21,14 +21,6 @@ class XGBConfig:
     scale_pos_weight: int
     early_stopping_rounds: int
-
-    # # task: str = field(default=None)
-
-    # def __post_init__(self):
-    #     if self.task is None:
-    #         raise ValueError("The task of the problem must be specified in the 'task' attribute.")
-    #     elif (type(self.task) is not str or (self.task != "regression" and self.task != "classification")):
-    #         raise ValueError(f"{self.task} is not a valid task. Choices are: ['regression', 'classification']")


 class XGBModule(object):
     def __init__(self, model_class: Union[XGBClassifier, XGBRegressor]):
@@ -52,11 +44,9 @@ def initialize(self):
         self.hparams_range = hparams_range

     def _get_config(self, hparams: Dict[str, Any]):
-        # hparams["task"] = "regression" if self.output_dim == 1 else "classification"
         hparams["early_stopping_rounds"] = self.args.second_phase_patience

         return self.config_class(**hparams)
-        # return asdict(self.config_class(**hparams))


     def fit_model(self, pl_module: XGBModule, config: XGBConfig):
@@ -66,8 +56,9 @@ def fit_model(self, pl_module: XGBModule, config: XGBConfig):


     def evaluate(self, pl_module: XGBModule, config: XGBConfig, X: pd.DataFrame, y: pd.Series):
-        preds = pl_module.predict(X)
-
+        preds = pl_module.predict_proba(X)
+
+        # print(preds.shape, y.shape)
         score = self.metric(preds, y)

         return score
diff --git a/test/abalone.py b/test/abalone.py
deleted file mode 100644
index f1fc59b..0000000
--- a/test/abalone.py
+++ /dev/null
@@ -1,33 +0,0 @@
-from sklearn.datasets import fetch_openml
-import numpy as np
-from types import SimpleNamespace
-from typing import Tuple, List
-import pandas as pd
-from sklearn.preprocessing import LabelEncoder
-from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
-
-def load_abalone():
-
-    abalone = fetch_openml(data_id = 44956, data_home='./data_cache')
-
-    data = abalone.data
-
-    label = abalone.target
-
-
-    category_cols = ["sex"]
-    continuous_cols = []
-    for col in data.columns:
-        if not col in category_cols:
-            continuous_cols.append(col)
-
-    le = LabelEncoder()
-    for col in category_cols:
-        data[col] = le.fit_transform(data[col])
-
-    scaler = MinMaxScaler()
-    data[continuous_cols] = scaler.fit_transform(data[continuous_cols])
-
-
-
-    return data, label, continuous_cols, category_cols
\ No newline at end of file
diff --git a/test/diabetes.py b/test/diabetes.py
deleted file mode 100644
index 295d5e5..0000000
--- a/test/diabetes.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from
sklearn.datasets import fetch_openml -import numpy as np -from types import SimpleNamespace -from typing import Tuple, List -import pandas as pd -from sklearn.preprocessing import LabelEncoder -from sklearn.preprocessing import OneHotEncoder, MinMaxScaler - -def load_diabetes(): - - diabetes = fetch_openml(data_id = 37, data_home='./data_cache') - - data = diabetes.data - - le = LabelEncoder() - label = pd.Series(le.fit_transform(diabetes.target)) - - category_cols = [] - continuous_cols = list(map(str, data.columns)) - - scaler = MinMaxScaler() - data[continuous_cols] = scaler.fit_transform(data[continuous_cols]) - - return data, label, continuous_cols, category_cols \ No newline at end of file diff --git a/test/misc.py b/test/misc.py new file mode 100644 index 0000000..c0698dd --- /dev/null +++ b/test/misc.py @@ -0,0 +1,21 @@ +from types import SimpleNamespace + +def get_args(): + args = SimpleNamespace() + + args.max_epochs = 1 + args.first_phase_patience = 1 + args.second_phase_patience = 1 + args.n_trials = 1 + + args.labeled_sample_ratio = 1 + args.valid_size = 0.2 + args.test_size = 0.2 + args.random_seed = 0 + args.batch_size = 128 + + args.n_jobs = 4 + args.accelerator = "cpu" + args.devices = "auto" + + return args \ No newline at end of file diff --git a/test/test_scarf.py b/test/test_scarf.py index c16fad5..445ce65 100644 --- a/test/test_scarf.py +++ b/test/test_scarf.py @@ -1,448 +1,40 @@ - -def test_scarf_classification(): - from ts3l.pl_modules import SCARFLightning - from ts3l.models import SCARF - from ts3l.utils.scarf_utils import SCARFDataset - from ts3l.utils import TS3LDataModule - - import torch.nn as nn - - import sys - sys.path.append('.') - - from diabetes import load_diabetes - - data, label, continuous_cols, category_cols = load_diabetes() - num_categoricals = len(category_cols) - num_continuous = len(continuous_cols) - loss_fn = "CrossEntropyLoss" - metric = "accuracy_score" - metric_params = {} - random_seed = 0 - - - from sklearn.model_selection import train_test_split - - X_train, X_valid, y_train, y_valid = train_test_split(data, label, train_size = 0.7, random_state=random_seed, stratify=label) - - X_train, X_unlabeled, y_train, _ = train_test_split(X_train, y_train, train_size = 0.1, random_state=random_seed, stratify=y_train) - - - from pytorch_lightning import Trainer - from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint - import pandas as pd - - accelerator = 'cpu' - n_jobs = 4 - max_epochs = 3 - batch_size = 128 - - pretraining_patience = 3 - early_stopping_patience = 3 - - batch_size = 64 - - def fit_model( - model, - config - ): - - train_ds = SCARFDataset(pd.concat([X_train, X_unlabeled]), config=config) - test_ds = SCARFDataset(X_valid, config=config) - - pl_datamodule = TS3LDataModule(train_ds, test_ds, batch_size=batch_size, train_sampler="random") - - model.set_first_phase() - - callbacks = [ - EarlyStopping( - monitor= 'val_loss', - mode = 'min', - patience = pretraining_patience, - verbose = False - ) - ] - pretraining_path = f'temporary_ckpt_data/pretraining' - checkpoint_callback = ModelCheckpoint( - monitor='val_loss', - dirpath=pretraining_path, - filename='pretraining-{epoch:02d}-{val_f1:.4f}', - save_top_k=1, - mode = 'min' - ) - - callbacks.append(checkpoint_callback) - - trainer = Trainer( - accelerator = accelerator, - max_epochs = max_epochs, - num_sanity_val_steps = 2, - callbacks = callbacks, - ) - - trainer.fit(model, pl_datamodule) - - pretraining_path = checkpoint_callback.best_model_path - - model = 
SCARFLightning.load_from_checkpoint(pretraining_path) - - model.set_second_phase() - - - train_ds = SCARFDataset(X_train, y_train.values, is_second_phase=True) - test_ds = SCARFDataset(X_valid, y_valid.values, is_second_phase=True) - - pl_datamodule = TS3LDataModule(train_ds, test_ds, batch_size = batch_size, train_sampler="weighted") - - callbacks = [ - EarlyStopping( - monitor= 'val_' + metric, - mode = 'max', - patience = early_stopping_patience, - verbose = False - ) - ] - - checkpoint_path = None +from misc import get_args - checkpoint_path = f'temporary_ckpt_data/' - checkpoint_callback = ModelCheckpoint( - monitor='val_' + metric, - dirpath=checkpoint_path, - filename='{epoch:02d}-{val_f1:.4f}', - save_top_k=1, - mode = 'max' - ) +import sys +import os +here = os.path.dirname(__file__) - callbacks.append(checkpoint_callback) +sys.path.append(os.path.join(here, '..')) - trainer = Trainer( - accelerator = accelerator, - max_epochs = max_epochs, - num_sanity_val_steps = 2, - callbacks = callbacks, - ) - - trainer.fit(model, pl_datamodule) - - model = SCARFLightning.load_from_checkpoint(checkpoint_callback.best_model_path) - model.set_second_phase() - return model - - hparams_range = { - - 'hidden_dim' : ['suggest_int', ['hidden_dim', 16, 512]], - 'encoder_depth' : ['suggest_int', ['encoder_depth', 2, 6]], - 'head_depth' : ['suggest_int', ['head_depth', 1, 3]], - 'corruption_rate' : ['suggest_float', ['corruption_rate', 0.1, 0.7]], - 'dropout_rate' : ['suggest_float', ['dropout_rate', 0.05, 0.3]], - - 'lr' : ['suggest_float', ['lr', 0.0001, 0.05]], - } - - import optuna - import torch.nn.functional as F - from sklearn.metrics import accuracy_score - - def objective( trial: optuna.trial.Trial, - ) -> float: - """Objective function for optuna - - Args: - trial: A object which returns hyperparameters of a model of hyperparameter search trial. - train_idx: Indices of training data in self.data and self.label. - test_idx: Indices of test data in self.data and self.label. - fold_idx: A fold index that denotes which fold under the given k-fold cross validation. - - Returns: - A score of given hyperparameters. 
- """ - config = { - "input_dim" : data.shape[1], - "hidden_dim" : None, - "encoder_depth" : None, - "head_depth" : None, - 'dropout_rate' : None, - "output_dim" : 2, - "corruption_rate" : None - } - optim_hparams = { - "lr" : None - } - scheduler_hparams = { - } - - for k, v in hparams_range.items(): - if k in config.keys(): - config[k] = getattr(trial, v[0])(*v[1]) - if k in optim_hparams.keys(): - optim_hparams[k] = getattr(trial, v[0])(*v[1]) - if k in scheduler_hparams.keys(): - scheduler_hparams[k] = getattr(trial, v[0])(*v[1]) - - from ts3l.utils.scarf_utils import SCARFConfig - config = SCARFConfig( - task="classification", - loss_fn=loss_fn, metric=metric, metric_hparams={}, - **config - ) - pl_scarf = SCARFLightning(config) - - pl_scarf = fit_model(pl_scarf, config) - - - trainer = Trainer( - accelerator = accelerator, - max_epochs = max_epochs, - num_sanity_val_steps = 2, - callbacks = None, - ) - - test_ds = SCARFDataset(X_valid, is_second_phase=True) - from torch.utils.data import SequentialSampler, DataLoader - import torch - test_dl = DataLoader(test_ds, batch_size, shuffle=False, sampler = SequentialSampler(test_ds), num_workers=n_jobs) - - preds = trainer.predict(pl_scarf, test_dl) - - preds = F.softmax(torch.concat([out.cpu() for out in preds]).squeeze(),dim=1) - - accuracy = accuracy_score(y_valid, preds.argmax(1)) - - return accuracy - - study = optuna.create_study(direction="maximize",sampler=optuna.samplers.TPESampler(seed=random_seed)) - study.optimize(objective, n_trials=2, show_progress_bar=False) - - print("Number of finished trials: ", len(study.trials)) - print("Best trial:") - - - trial = study.best_trial - - print(" Accuracy: {}".format(trial.value)) - print(" Best hyperparameters: ", trial) +def test_scarf_classification(): + + from benchmark.datasets import load_diabetes + data, label, continuous_cols, category_cols, output_dim, metric, metric_hparams = load_diabetes() + + sys.path.append(os.path.join(os.path.join(here, '..'), "benchmark")) + + from benchmark.pipelines import SCARFPipeLine + + args = get_args() + + pipeline = SCARFPipeLine(args, data, label, continuous_cols, category_cols, output_dim, metric, metric_hparams) + + pipeline.benchmark() def test_scarf_regression(): - from ts3l.pl_modules import SCARFLightning - from ts3l.models import SCARF - from ts3l.utils.scarf_utils import SCARFDataset - from ts3l.utils import TS3LDataModule - - import torch.nn as nn - - import sys - sys.path.append('.') - from abalone import load_abalone - - data, label, continuous_cols, category_cols = load_abalone() - num_categoricals = len(category_cols) - num_continuous = len(continuous_cols) - loss_fn = "MSELoss" - metric = "mean_squared_error" - metric_params = {} - random_seed = 0 - - - from sklearn.model_selection import train_test_split - - X_train, X_valid, y_train, y_valid = train_test_split(data, label, train_size = 0.7, random_state=random_seed) - - X_train, X_unlabeled, y_train, _ = train_test_split(X_train, y_train, train_size = 0.1, random_state=random_seed) - - - from pytorch_lightning import Trainer - from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint - import pandas as pd - - accelerator = 'cpu' - n_jobs = 4 - max_epochs = 3 - batch_size = 128 - - pretraining_patience = 3 - early_stopping_patience = 3 - - batch_size = 64 - - def fit_model( - model, - config - ): - - train_ds = SCARFDataset(pd.concat([X_train, X_unlabeled]), config=config) - test_ds = SCARFDataset(X_valid, config=config) - - pl_datamodule = TS3LDataModule(train_ds, 
test_ds, batch_size=batch_size, train_sampler="random") - - model.set_first_phase() - - callbacks = [ - EarlyStopping( - monitor= 'val_loss', - mode = 'min', - patience = pretraining_patience, - verbose = False - ) - ] - pretraining_path = f'temporary_ckpt_data/pretraining' - checkpoint_callback = ModelCheckpoint( - monitor='val_loss', - dirpath=pretraining_path, - filename='pretraining-{epoch:02d}-{val_f1:.4f}', - save_top_k=1, - mode = 'min' - ) - - callbacks.append(checkpoint_callback) - - trainer = Trainer( - accelerator = accelerator, - max_epochs = max_epochs, - num_sanity_val_steps = 2, - callbacks = callbacks, - ) - - trainer.fit(model, pl_datamodule) - - pretraining_path = checkpoint_callback.best_model_path - - model = SCARFLightning.load_from_checkpoint(pretraining_path) - - model.set_second_phase() - - train_ds = SCARFDataset(X_train, y_train.values, is_regression=True, is_second_phase=True) - test_ds = SCARFDataset(X_valid, y_valid.values, is_regression=True, is_second_phase=True) - - pl_datamodule = TS3LDataModule(train_ds, test_ds, batch_size = batch_size, train_sampler="random") - - callbacks = [ - EarlyStopping( - monitor= 'val_' + metric, - mode = 'max', - patience = early_stopping_patience, - verbose = False - ) - ] - - checkpoint_path = None - - checkpoint_path = f'temporary_ckpt_data/' - checkpoint_callback = ModelCheckpoint( - monitor='val_' + metric, - dirpath=checkpoint_path, - filename='{epoch:02d}-{val_f1:.4f}', - save_top_k=1, - mode = 'max' - ) - - callbacks.append(checkpoint_callback) - - trainer = Trainer( - accelerator = accelerator, - max_epochs = max_epochs, - num_sanity_val_steps = 2, - callbacks = callbacks, - ) - - trainer.fit(model, pl_datamodule) - - model = SCARFLightning.load_from_checkpoint(checkpoint_callback.best_model_path) - model.set_second_phase() - return model - - hparams_range = { - - 'hidden_dim' : ['suggest_int', ['hidden_dim', 16, 512]], - 'encoder_depth' : ['suggest_int', ['encoder_depth', 2, 6]], - 'head_depth' : ['suggest_int', ['head_depth', 1, 3]], - 'corruption_rate' : ['suggest_float', ['corruption_rate', 0.1, 0.7]], - 'dropout_rate' : ['suggest_float', ['dropout_rate', 0.05, 0.3]], - - 'lr' : ['suggest_float', ['lr', 0.0001, 0.05]], - } - - import optuna - from sklearn.metrics import mean_squared_error - - def objective( trial: optuna.trial.Trial, - ) -> float: - """Objective function for optuna - - Args: - trial: A object which returns hyperparameters of a model of hyperparameter search trial. - train_idx: Indices of training data in self.data and self.label. - test_idx: Indices of test data in self.data and self.label. - fold_idx: A fold index that denotes which fold under the given k-fold cross validation. - - Returns: - A score of given hyperparameters. 
- """ - config = { - "input_dim" : data.shape[1], - "hidden_dim" : None, - "encoder_depth" : None, - "head_depth" : None, - 'dropout_rate' : None, - "output_dim" : 1, - "corruption_rate" : None - } - optim_hparams = { - "lr" : None - } - scheduler_hparams = { - } - - for k, v in hparams_range.items(): - if k in config.keys(): - config[k] = getattr(trial, v[0])(*v[1]) - if k in optim_hparams.keys(): - optim_hparams[k] = getattr(trial, v[0])(*v[1]) - if k in scheduler_hparams.keys(): - scheduler_hparams[k] = getattr(trial, v[0])(*v[1]) - - from ts3l.utils.scarf_utils import SCARFConfig - config = SCARFConfig( - task="regression", - loss_fn=loss_fn, metric=metric, metric_hparams={}, - **config - ) - pl_scarf = SCARFLightning(config) - pl_scarf = fit_model(pl_scarf, config) - - - trainer = Trainer( - accelerator = accelerator, - max_epochs = max_epochs, - num_sanity_val_steps = 2, - callbacks = None, - ) - - test_ds = SCARFDataset(X_valid, is_second_phase=True) - from torch.utils.data import SequentialSampler, DataLoader - import torch - test_dl = DataLoader(test_ds, batch_size, shuffle=False, sampler = SequentialSampler(test_ds), num_workers=n_jobs) - - preds = trainer.predict(pl_scarf, test_dl) - - preds = torch.concat([out.cpu() for out in preds]).squeeze() - - mse = mean_squared_error(y_valid, preds) - - return mse - - study = optuna.create_study(direction="minimize",sampler=optuna.samplers.TPESampler(seed=random_seed)) - study.optimize(objective, n_trials=2, show_progress_bar=False) - - print("Number of finished trials: ", len(study.trials)) - print("Best trial:") - - - trial = study.best_trial - - print(" MSE: {}".format(trial.value)) - print(" Best hyperparameters: ", trial) + from benchmark.datasets import load_abalone + data, label, continuous_cols, category_cols, output_dim, metric, metric_hparams = load_abalone() + + sys.path.append(os.path.join(os.path.join(here, '..'), "benchmark")) + + from benchmark.pipelines import SCARFPipeLine + + args = get_args() + + pipeline = SCARFPipeLine(args, data, label, continuous_cols, category_cols, output_dim, metric, metric_hparams) + + pipeline.benchmark() if __name__ == "__main__": test_scarf_classification() diff --git a/test/test_subtab.py b/test/test_subtab.py index 077c959..f2aa326 100644 --- a/test/test_subtab.py +++ b/test/test_subtab.py @@ -1,484 +1,40 @@ - -def test_subtab_classification(): - from ts3l.pl_modules import SubTabLightning - from ts3l.utils.subtab_utils import SubTabDataset, SubTabCollateFN - from ts3l.utils import TS3LDataModule - - import torch.nn as nn - - import sys - sys.path.append('.') - - from diabetes import load_diabetes - - data, label, continuous_cols, category_cols = load_diabetes() - num_categoricals = len(category_cols) - num_continuous = len(continuous_cols) - loss_fn = "CrossEntropyLoss" - metric = "accuracy_score" - metric_params = {} - random_seed = 0 - - - from sklearn.model_selection import train_test_split - - X_train, X_valid, y_train, y_valid = train_test_split(data, label, train_size = 0.7, random_state=random_seed, stratify=label) - - X_train, X_unlabeled, y_train, _ = train_test_split(X_train, y_train, train_size = 0.1, random_state=random_seed, stratify=y_train) - - - from pytorch_lightning import Trainer - from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint - import pandas as pd - - accelerator = 'cpu' - n_jobs = 4 - max_epochs = 3 - batch_size = 128 - - pretraining_patience = 3 - early_stopping_patience = 3 - - batch_size = 64 - - def fit_model( - model, - config - ): - - 
train_ds = SubTabDataset(X_train, unlabeled_data=X_unlabeled) - test_ds = SubTabDataset(X_valid) - - pl_datamodule = TS3LDataModule(train_ds, test_ds, batch_size, train_sampler='random', train_collate_fn=SubTabCollateFN(config), valid_collate_fn=SubTabCollateFN(config), n_jobs = n_jobs) - - model.set_first_phase() - - callbacks = [ - EarlyStopping( - monitor= 'val_loss', - mode = 'min', - patience = pretraining_patience, - verbose = False - ) - ] - pretraining_path = f'temporary_ckpt_data/pretraining' - checkpoint_callback = ModelCheckpoint( - monitor='val_loss', - dirpath=pretraining_path, - filename='pretraining-{epoch:02d}-{val_f1:.4f}', - save_top_k=1, - mode = 'min' - ) - - callbacks.append(checkpoint_callback) - - trainer = Trainer( - accelerator = accelerator, - max_epochs = max_epochs, - num_sanity_val_steps = 2, - callbacks = callbacks, - ) - - trainer.fit(model, pl_datamodule) - - pretraining_path = checkpoint_callback.best_model_path - - model = SubTabLightning.load_from_checkpoint(pretraining_path) - - model.set_second_phase() - - train_ds = SubTabDataset(X_train, y_train.values) - test_ds = SubTabDataset(X_valid, y_valid.values) - - pl_datamodule = TS3LDataModule(train_ds, test_ds, batch_size = batch_size, train_sampler="weighted", train_collate_fn=SubTabCollateFN(config), valid_collate_fn=SubTabCollateFN(config)) - - callbacks = [ - EarlyStopping( - monitor= 'val_' + metric, - mode = 'max', - patience = early_stopping_patience, - verbose = False - ) - ] +from misc import get_args - checkpoint_path = None +import sys +import os +here = os.path.dirname(__file__) - checkpoint_path = f'temporary_ckpt_data/' - checkpoint_callback = ModelCheckpoint( - monitor='val_' + metric, - dirpath=checkpoint_path, - filename='{epoch:02d}-{val_f1:.4f}', - save_top_k=1, - mode = 'max' - ) +sys.path.append(os.path.join(here, '..')) - callbacks.append(checkpoint_callback) - - trainer = Trainer( - accelerator = accelerator, - max_epochs = max_epochs, - num_sanity_val_steps = 2, - callbacks = callbacks, - ) - - trainer.fit(model, pl_datamodule) - - model = SubTabLightning.load_from_checkpoint(checkpoint_callback.best_model_path) - model.set_second_phase() - - return model - - hparams_range = { - - 'hidden_dim' : ['suggest_int', ['hidden_dim', 4, 1024]], +def test_subtab_classification(): - 'tau' : ["suggest_float", ["tau", 0.05, 0.15]], - "use_cosine_similarity" : ["suggest_categorical", ["use_cosine_similarity", [True, False]]], - "use_contrastive" : ["suggest_categorical", ["use_contrastive", [True, False]]], - "use_distance" : ["suggest_categorical", ["use_distance", [True, False]]], + from benchmark.datasets import load_diabetes + data, label, continuous_cols, category_cols, output_dim, metric, metric_hparams = load_diabetes() - "n_subsets" : ["suggest_int", ["n_subsets", 2, 7]], - "overlap_ratio" : ["suggest_float", ["overlap_ratio", 0., 1]], + sys.path.append(os.path.join(os.path.join(here, '..'), "benchmark")) - "mask_ratio" : ["suggest_float", ["mask_ratio", 0.1, 0.3]], - "noise_level" : ["suggest_float", ["noise_level", 0.5, 2]], - "noise_type" : ["suggest_categorical", ["noise_type", ["Swap", "Gaussian", "Zero_Out"]]], - - 'lr' : ['suggest_float', ['lr', 0.0001, 0.05]], - } - - import optuna - import torch.nn.functional as F - from sklearn.metrics import accuracy_score - - def objective( trial: optuna.trial.Trial, - ) -> float: - """Objective function for optuna - - Args: - trial: A object which returns hyperparameters of a model of hyperparameter search trial. 
- train_idx: Indices of training data in self.data and self.label. - test_idx: Indices of test data in self.data and self.label. - fold_idx: A fold index that denotes which fold under the given k-fold cross validation. - - Returns: - A score of given hyperparameters. - """ - - config = { - "input_dim" : data.shape[1], - "output_dim" : 2, - 'hidden_dim' : None, - "tau" : None, - "use_cosine_similarity" : None, - "use_contrastive" : None, - "use_distance" : None, - "n_subsets" : None, - "overlap_ratio" : None, - "mask_ratio" : None, - "noise_type" : None, - "noise_level" : None, - } - - optim_hparams = { - "lr" : None - } - scheduler_hparams = { - } - - for k, v in hparams_range.items(): - if k in config.keys(): - config[k] = getattr(trial, v[0])(*v[1]) - if k in optim_hparams.keys(): - optim_hparams[k] = getattr(trial, v[0])(*v[1]) - if k in scheduler_hparams.keys(): - scheduler_hparams[k] = getattr(trial, v[0])(*v[1]) - - from ts3l.utils.subtab_utils import SubTabConfig - config = SubTabConfig( - task="classification", - loss_fn=loss_fn, metric=metric, metric_hparams={}, - **config - ) - pl_subtab = SubTabLightning(config) - - pl_subtab = fit_model(pl_subtab, config) - pl_subtab.set_second_phase() - - trainer = Trainer( - accelerator = accelerator, - max_epochs = max_epochs, - num_sanity_val_steps = 2, - callbacks = None, - ) - - test_ds = SubTabDataset(X_valid) - from torch.utils.data import SequentialSampler, DataLoader - import torch - test_dl = DataLoader(test_ds, batch_size, shuffle=False, sampler = SequentialSampler(test_ds), num_workers=n_jobs, collate_fn=SubTabCollateFN(config)) - - preds = trainer.predict(pl_subtab, test_dl) - - preds = F.softmax(torch.concat([out.cpu() for out in preds]).squeeze(),dim=1) - - accuracy = accuracy_score(y_valid, preds.argmax(1)) - - return accuracy - - study = optuna.create_study(direction="maximize",sampler=optuna.samplers.TPESampler(seed=random_seed)) - study.optimize(objective, n_trials=2, show_progress_bar=False) - - print("Number of finished trials: ", len(study.trials)) - print("Best trial:") - - - trial = study.best_trial - - print(" Accuracy: {}".format(trial.value)) - print(" Best hyperparameters: ", trial) + from benchmark.pipelines import SubTabPipeLine + + args = get_args() + pipeline = SubTabPipeLine(args, data, label, continuous_cols, category_cols, output_dim, metric, metric_hparams) + + pipeline.benchmark() def test_subtab_regression(): - from ts3l.pl_modules import SubTabLightning - from ts3l.utils.subtab_utils import SubTabDataset, SubTabCollateFN - from ts3l.utils import TS3LDataModule - - import torch.nn as nn - - import sys - sys.path.append('.') - from abalone import load_abalone - - data, label, continuous_cols, category_cols = load_abalone() - num_categoricals = len(category_cols) - num_continuous = len(continuous_cols) - loss_fn = "MSELoss" - metric = "mean_squared_error" - random_seed = 0 - - - from sklearn.model_selection import train_test_split - - X_train, X_valid, y_train, y_valid = train_test_split(data, label, train_size = 0.7, random_state=random_seed) - - X_train, X_unlabeled, y_train, _ = train_test_split(X_train, y_train, train_size = 0.1, random_state=random_seed) - - - from pytorch_lightning import Trainer - from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint - import pandas as pd - - accelerator = 'cpu' - n_jobs = 4 - max_epochs = 3 - batch_size = 128 - - pretraining_patience = 3 - early_stopping_patience = 3 - - batch_size = 64 - - def fit_model( - model, - config - ): - - train_ds = 
SubTabDataset(X_train, unlabeled_data=X_unlabeled) - test_ds = SubTabDataset(X_valid) - - pl_datamodule = TS3LDataModule(train_ds, test_ds, batch_size, train_sampler='random', train_collate_fn=SubTabCollateFN(config), valid_collate_fn=SubTabCollateFN(config), n_jobs = n_jobs) - - model.set_first_phase() - - callbacks = [ - EarlyStopping( - monitor= 'val_loss', - mode = 'min', - patience = pretraining_patience, - verbose = False - ) - ] - pretraining_path = f'temporary_ckpt_data/pretraining' - checkpoint_callback = ModelCheckpoint( - monitor='val_loss', - dirpath=pretraining_path, - filename='pretraining-{epoch:02d}-{val_f1:.4f}', - save_top_k=1, - mode = 'min' - ) - - callbacks.append(checkpoint_callback) - - trainer = Trainer( - accelerator = accelerator, - max_epochs = max_epochs, - num_sanity_val_steps = 2, - callbacks = callbacks, - ) - - trainer.fit(model, pl_datamodule) - - pretraining_path = checkpoint_callback.best_model_path - - model = SubTabLightning.load_from_checkpoint(pretraining_path) - - model.set_second_phase() - - train_ds = SubTabDataset(X_train, y_train.values, is_regression=True) - test_ds = SubTabDataset(X_valid, y_valid.values, is_regression=True) - - pl_datamodule = TS3LDataModule(train_ds, test_ds, batch_size = batch_size, train_sampler="random", train_collate_fn=SubTabCollateFN(config), valid_collate_fn=SubTabCollateFN(config)) - - callbacks = [ - EarlyStopping( - monitor= 'val_' + metric, - mode = 'max', - patience = early_stopping_patience, - verbose = False - ) - ] - - checkpoint_path = None - - checkpoint_path = f'temporary_ckpt_data/' - checkpoint_callback = ModelCheckpoint( - monitor='val_' + metric, - dirpath=checkpoint_path, - filename='{epoch:02d}-{val_f1:.4f}', - save_top_k=1, - mode = 'max' - ) - - callbacks.append(checkpoint_callback) - - trainer = Trainer( - accelerator = accelerator, - max_epochs = max_epochs, - num_sanity_val_steps = 2, - callbacks = callbacks, - ) - - trainer.fit(model, pl_datamodule) - - model = SubTabLightning.load_from_checkpoint(checkpoint_callback.best_model_path) - model.set_second_phase() - - return model - - hparams_range = { - - 'hidden_dim' : ['suggest_int', ['hidden_dim', 4, 1024]], + from benchmark.datasets import load_abalone + data, label, continuous_cols, category_cols, output_dim, metric, metric_hparams = load_abalone() - 'tau' : ["suggest_float", ["tau", 0.05, 0.15]], - "use_cosine_similarity" : ["suggest_categorical", ["use_cosine_similarity", [True, False]]], - "use_contrastive" : ["suggest_categorical", ["use_contrastive", [True, False]]], - "use_distance" : ["suggest_categorical", ["use_distance", [True, False]]], + sys.path.append(os.path.join(os.path.join(here, '..'), "benchmark")) - "n_subsets" : ["suggest_int", ["n_subsets", 2, 7]], - "overlap_ratio" : ["suggest_float", ["overlap_ratio", 0., 1]], + from benchmark.pipelines import SubTabPipeLine - "mask_ratio" : ["suggest_float", ["mask_ratio", 0.1, 0.3]], - "noise_level" : ["suggest_float", ["noise_level", 0.5, 2]], - "noise_type" : ["suggest_categorical", ["noise_type", ["Swap", "Gaussian", "Zero_Out"]]], - - 'lr' : ['suggest_float', ['lr', 0.0001, 0.05]], - } - - import optuna - from sklearn.metrics import mean_squared_error - - def objective( trial: optuna.trial.Trial, - ) -> float: - """Objective function for optuna - - Args: - trial: A object which returns hyperparameters of a model of hyperparameter search trial. - train_idx: Indices of training data in self.data and self.label. - test_idx: Indices of test data in self.data and self.label. 
- fold_idx: A fold index that denotes which fold under the given k-fold cross validation. - - Returns: - A score of given hyperparameters. - """ - - config = { - "input_dim" : data.shape[1], - "output_dim" : 1, - 'hidden_dim' : None, - "tau" : None, - "use_cosine_similarity" : None, - "use_contrastive" : None, - "use_distance" : None, - "n_subsets" : None, - "overlap_ratio" : None, - "mask_ratio" : None, - "noise_type" : None, - "noise_level" : None, - } - optim_hparams = { - "lr" : None - } - scheduler_hparams = { - } - - for k, v in hparams_range.items(): - if k in config.keys(): - config[k] = getattr(trial, v[0])(*v[1]) - if k in optim_hparams.keys(): - optim_hparams[k] = getattr(trial, v[0])(*v[1]) - if k in scheduler_hparams.keys(): - scheduler_hparams[k] = getattr(trial, v[0])(*v[1]) - from ts3l.utils.subtab_utils import SubTabConfig - config = SubTabConfig( - task="regression", - loss_fn=loss_fn, metric=metric, metric_hparams={}, - **config - ) - pl_subtab = SubTabLightning(config) - - # pl_subtab = SubTabLightning( - # model_hparams, - # "Adam", optim_hparams, None, scheduler_hparams, - # loss_fn, - # {}, - # MSEScorer("mean_squared_error"), - # random_seed) - - pl_subtab = fit_model(pl_subtab, config) - pl_subtab.set_second_phase() - - trainer = Trainer( - accelerator = accelerator, - max_epochs = max_epochs, - num_sanity_val_steps = 2, - callbacks = None, - ) - - test_ds = SubTabDataset(X_valid) - from torch.utils.data import SequentialSampler, DataLoader - import torch - test_dl = DataLoader(test_ds, batch_size, shuffle=False, sampler = SequentialSampler(test_ds), num_workers=n_jobs, collate_fn=SubTabCollateFN(config)) - - preds = trainer.predict(pl_subtab, test_dl) - - preds = torch.concat([out.cpu() for out in preds]).squeeze() - - mse = mean_squared_error(y_valid, preds) - - return mse - - study = optuna.create_study(direction="minimize",sampler=optuna.samplers.TPESampler(seed=random_seed)) - study.optimize(objective, n_trials=2, show_progress_bar=False) - - print("Number of finished trials: ", len(study.trials)) - print("Best trial:") - - - trial = study.best_trial - - print(" MSE: {}".format(trial.value)) - print(" Best hyperparameters: ", trial) + args = get_args() + + pipeline = SubTabPipeLine(args, data, label, continuous_cols, category_cols, output_dim, metric, metric_hparams) + + pipeline.benchmark() if __name__ == "__main__": test_subtab_classification() diff --git a/test/test_vime.py b/test/test_vime.py index 0e59a83..492c390 100644 --- a/test/test_vime.py +++ b/test/test_vime.py @@ -1,472 +1,40 @@ +from misc import get_args -def test_vime_classification(): - from ts3l.pl_modules import VIMELightning - from ts3l.utils.vime_utils import VIMESemiSLCollateFN - from ts3l.utils.vime_utils import VIMEDataset - from ts3l.utils import TS3LDataModule +import sys +import os +here = os.path.dirname(__file__) - import torch.nn as nn +sys.path.append(os.path.join(here, '..')) - import sys - sys.path.append('.') - - from diabetes import load_diabetes +def test_vime_classification(): - data, label, continuous_cols, category_cols = load_diabetes() - num_categoricals = len(category_cols) - num_continuous = len(continuous_cols) - loss_fn = "CrossEntropyLoss" - metric = "accuracy_score" - random_seed = 0 + from benchmark.datasets import load_diabetes + data, label, continuous_cols, category_cols, output_dim, metric, metric_hparams = load_diabetes() - from sklearn.model_selection import train_test_split - - X_train, X_valid, y_train, y_valid = train_test_split(data, label, train_size = 
0.7, random_state=random_seed, stratify=label) - - X_train, X_unlabeled, y_train, _ = train_test_split(X_train, y_train, train_size = 0.1, random_state=random_seed, stratify=y_train) - - - from pytorch_lightning import Trainer - from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint - import pandas as pd - - accelerator = 'cpu' - n_jobs = 4 - max_epochs = 3 - batch_size = 128 - - pretraining_patience = 3 - early_stopping_patience = 3 - - batch_size = 64 - - def fit_model( - model, - config - ): - - train_ds = VIMEDataset(X = X_train, unlabeled_data = X_unlabeled, config=config, continuous_cols = continuous_cols, category_cols = category_cols) - test_ds = VIMEDataset(X = X_valid, config=config, continuous_cols = continuous_cols, category_cols = category_cols) - - pl_datamodule = TS3LDataModule(train_ds, test_ds, batch_size, train_sampler='random', n_jobs = n_jobs) - - model.set_first_phase() - - callbacks = [ - EarlyStopping( - monitor= 'val_loss', - mode = 'min', - patience = pretraining_patience, - verbose = False - ) - ] - pretraining_path = f'temporary_ckpt_data/pretraining' - checkpoint_callback = ModelCheckpoint( - monitor='val_loss', - dirpath=pretraining_path, - filename='pretraining-{epoch:02d}-{val_f1:.4f}', - save_top_k=1, - mode = 'min' - ) - - callbacks.append(checkpoint_callback) - - trainer = Trainer( - accelerator = accelerator, - max_epochs = max_epochs, - num_sanity_val_steps = 2, - callbacks = callbacks, - ) - - trainer.fit(model, pl_datamodule) - - pretraining_path = checkpoint_callback.best_model_path - - model = VIMELightning.load_from_checkpoint(pretraining_path) - - model.set_second_phase() - - train_ds = VIMEDataset(X_train, y_train.values, config, unlabeled_data=X_unlabeled, continuous_cols=continuous_cols, category_cols=category_cols, is_second_phase=True) - test_ds = VIMEDataset(X_valid, y_valid.values, config, continuous_cols=continuous_cols, category_cols=category_cols, is_second_phase=True) - - pl_datamodule = TS3LDataModule(train_ds, test_ds, batch_size = batch_size, train_sampler="weighted", train_collate_fn=VIMESemiSLCollateFN()) - - callbacks = [ - EarlyStopping( - monitor= 'val_' + metric, - mode = 'max', - patience = early_stopping_patience, - verbose = False - ) - ] - - checkpoint_path = None - - checkpoint_path = f'temporary_ckpt_data/' - checkpoint_callback = ModelCheckpoint( - monitor='val_' + metric, - dirpath=checkpoint_path, - filename='{epoch:02d}-{val_f1:.4f}', - save_top_k=1, - mode = 'max' - ) - - callbacks.append(checkpoint_callback) - - trainer = Trainer( - accelerator = accelerator, - max_epochs = max_epochs, - num_sanity_val_steps = 2, - callbacks = callbacks, - ) - - trainer.fit(model, pl_datamodule) - - model = VIMELightning.load_from_checkpoint(checkpoint_callback.best_model_path) - model.set_second_phase() - - return model - - hparams_range = { - - 'hidden_dim' : ['suggest_int', ['hidden_dim', 16, 512]], + sys.path.append(os.path.join(os.path.join(here, '..'), "benchmark")) - 'p_m' : ["suggest_float", ["p_m", 0.1, 0.9]], - 'alpha1' : ["suggest_float", ["alpha1", 0.1, 5]], - 'alpha2' : ["suggest_float", ["alpha2", 0.1, 5]], - 'beta' : ["suggest_float", ["beta", 0.1, 10]], - 'K' : ["suggest_int", ["K", 2, 20]], - - - 'lr' : ['suggest_float', ['lr', 0.0001, 0.05]], - } - - import optuna - import torch.nn.functional as F - from sklearn.metrics import accuracy_score - - def objective( trial: optuna.trial.Trial, - ) -> float: - """Objective function for optuna - - Args: - trial: A object which returns hyperparameters of a 
model of hyperparameter search trial. - train_idx: Indices of training data in self.data and self.label. - test_idx: Indices of test data in self.data and self.label. - fold_idx: A fold index that denotes which fold under the given k-fold cross validation. - - Returns: - A score of given hyperparameters. - """ - - config = { - "input_dim" : data.shape[1], - "hidden_dim" : None, - "output_dim" : 2, - 'alpha1' : None, - 'alpha2' : None, - 'beta' : None, - 'K' : None, - "num_categoricals" : num_categoricals, - "num_continuous": num_continuous, - "u_label" : -1, - "p_m" : None, - } - - optim_hparams = { - "lr" : None - } - scheduler_hparams = { - } - - for k, v in hparams_range.items(): - if k in config.keys(): - config[k] = getattr(trial, v[0])(*v[1]) - if k in optim_hparams.keys(): - optim_hparams[k] = getattr(trial, v[0])(*v[1]) - if k in scheduler_hparams.keys(): - scheduler_hparams[k] = getattr(trial, v[0])(*v[1]) - - from ts3l.utils.vime_utils import VIMEConfig - config = VIMEConfig( - task="classification", - loss_fn="CrossEntropyLoss", metric=metric, metric_hparams={}, - input_dim=config["input_dim"], hidden_dim=config["hidden_dim"], - output_dim=config["output_dim"], - alpha1=config["alpha1"], alpha2=config["alpha2"], beta=config["beta"], K=config["K"], p_m=config["p_m"], - num_categoricals=num_categoricals, num_continuous=num_continuous - ) - pl_vime = VIMELightning(config) - - pl_vime = fit_model(pl_vime, config) - pl_vime.set_second_phase() - - trainer = Trainer( - accelerator = accelerator, - max_epochs = max_epochs, - num_sanity_val_steps = 2, - callbacks = None, - ) - - test_ds = VIMEDataset(X_valid, category_cols=category_cols, continuous_cols=continuous_cols, is_second_phase=True) - - from torch.utils.data import SequentialSampler, DataLoader - import torch - test_dl = DataLoader(test_ds, batch_size, shuffle=False, sampler = SequentialSampler(test_ds), num_workers=n_jobs) - - preds = trainer.predict(pl_vime, test_dl) - - preds = F.softmax(torch.concat([out.cpu() for out in preds]).squeeze(),dim=1) - - accuracy = accuracy_score(y_valid, preds.argmax(1)) - - return accuracy - - study = optuna.create_study(direction="maximize",sampler=optuna.samplers.TPESampler(seed=random_seed)) - study.optimize(objective, n_trials=2, show_progress_bar=False) - - print("Number of finished trials: ", len(study.trials)) - print("Best trial:") - - - trial = study.best_trial - - print(" Accuracy: {}".format(trial.value)) - print(" Best hyperparameters: ", trial) + from benchmark.pipelines import VIMEPipeLine + + args = get_args() + + pipeline = VIMEPipeLine(args, data, label, continuous_cols, category_cols, output_dim, metric, metric_hparams) + pipeline.benchmark() def test_vime_regression(): - from ts3l.pl_modules import VIMELightning - from ts3l.utils.vime_utils import VIMESemiSLCollateFN - from ts3l.utils.vime_utils import VIMEDataset - from ts3l.utils import TS3LDataModule - import torch.nn as nn - - import sys - sys.path.append('.') + from benchmark.datasets import load_abalone + data, label, continuous_cols, category_cols, output_dim, metric, metric_hparams = load_abalone() - from abalone import load_abalone - - data, label, continuous_cols, category_cols = load_abalone() - num_categoricals = len(category_cols) - num_continuous = len(continuous_cols) - loss_fn = nn.MSELoss - metric = "mean_squared_error" - random_seed = 0 - - - from sklearn.model_selection import train_test_split - - X_train, X_valid, y_train, y_valid = train_test_split(data, label, train_size = 0.7, random_state=random_seed) 
- - X_train, X_unlabeled, y_train, _ = train_test_split(X_train, y_train, train_size = 0.1, random_state=random_seed) - - - from pytorch_lightning import Trainer - from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint - import pandas as pd - - accelerator = 'cpu' - n_jobs = 4 - max_epochs = 3 - batch_size = 128 - - pretraining_patience = 3 - early_stopping_patience = 3 - - batch_size = 64 - - def fit_model( - model, - config - ): - - train_ds = VIMEDataset(X = X_train, unlabeled_data = X_unlabeled, config=config, continuous_cols = continuous_cols, category_cols = category_cols) - test_ds = VIMEDataset(X = X_valid, config=config, continuous_cols = continuous_cols, category_cols = category_cols) - - pl_datamodule = TS3LDataModule(train_ds, test_ds, batch_size, train_sampler='random', n_jobs = n_jobs) - - model.set_first_phase() - - callbacks = [ - EarlyStopping( - monitor= 'val_loss', - mode = 'min', - patience = pretraining_patience, - verbose = False - ) - ] - pretraining_path = f'temporary_ckpt_data/pretraining' - checkpoint_callback = ModelCheckpoint( - monitor='val_loss', - dirpath=pretraining_path, - filename='pretraining-{epoch:02d}-{val_f1:.4f}', - save_top_k=1, - mode = 'min' - ) - - callbacks.append(checkpoint_callback) - - trainer = Trainer( - accelerator = accelerator, - max_epochs = max_epochs, - num_sanity_val_steps = 2, - callbacks = callbacks, - ) - - trainer.fit(model, pl_datamodule) - - pretraining_path = checkpoint_callback.best_model_path - - model = VIMELightning.load_from_checkpoint(pretraining_path) - - model.set_second_phase() - - train_ds = VIMEDataset(X_train, y_train.values, config, unlabeled_data=X_unlabeled, continuous_cols=continuous_cols, category_cols=category_cols, is_second_phase=True, is_regression=True) - test_ds = VIMEDataset(X_valid, y_valid.values, config, continuous_cols=continuous_cols, category_cols=category_cols, is_second_phase=True, is_regression=True) - - pl_datamodule = TS3LDataModule(train_ds, test_ds, batch_size = batch_size, train_sampler="weighted", train_collate_fn=VIMESemiSLCollateFN()) - - callbacks = [ - EarlyStopping( - monitor= 'val_' + metric, - mode = 'max', - patience = early_stopping_patience, - verbose = False - ) - ] - - checkpoint_path = None - - checkpoint_path = f'temporary_ckpt_data/' - checkpoint_callback = ModelCheckpoint( - monitor='val_' + metric, - dirpath=checkpoint_path, - filename='{epoch:02d}-{val_f1:.4f}', - save_top_k=1, - mode = 'max' - ) - - callbacks.append(checkpoint_callback) - - trainer = Trainer( - accelerator = accelerator, - max_epochs = max_epochs, - num_sanity_val_steps = 2, - callbacks = callbacks, - ) - - trainer.fit(model, pl_datamodule) - - model = VIMELightning.load_from_checkpoint(checkpoint_callback.best_model_path) - model.set_second_phase() - - return model - - hparams_range = { - - 'hidden_dim' : ['suggest_int', ['hidden_dim', 16, 512]], + sys.path.append(os.path.join(os.path.join(here, '..'), "benchmark")) - 'p_m' : ["suggest_float", ["p_m", 0.1, 0.9]], - 'alpha1' : ["suggest_float", ["alpha1", 0.1, 5]], - 'alpha2' : ["suggest_float", ["alpha2", 0.1, 5]], - 'beta' : ["suggest_float", ["beta", 0.1, 10]], - 'K' : ["suggest_int", ["K", 2, 20]], - - - 'lr' : ['suggest_float', ['lr', 0.0001, 0.05]], - } - - import optuna - from sklearn.metrics import mean_squared_error - - def objective( trial: optuna.trial.Trial, - ) -> float: - """Objective function for optuna - - Args: - trial: A object which returns hyperparameters of a model of hyperparameter search trial. 
- train_idx: Indices of training data in self.data and self.label. - test_idx: Indices of test data in self.data and self.label. - fold_idx: A fold index that denotes which fold under the given k-fold cross validation. - - Returns: - A score of given hyperparameters. - """ - - config = { - "input_dim" : data.shape[1], - "hidden_dim" : None, - "output_dim" : 1, - 'alpha1' : None, - 'alpha2' : None, - 'beta' : None, - 'K' : None, - "num_categoricals" : num_categoricals, - "num_continuous": num_continuous, - "u_label" : -1, - "p_m" : None, - } - - optim_hparams = { - "lr" : None - } - scheduler_hparams = { - } - - for k, v in hparams_range.items(): - if k in config.keys(): - config[k] = getattr(trial, v[0])(*v[1]) - if k in optim_hparams.keys(): - optim_hparams[k] = getattr(trial, v[0])(*v[1]) - if k in scheduler_hparams.keys(): - scheduler_hparams[k] = getattr(trial, v[0])(*v[1]) - from ts3l.utils.vime_utils import VIMEConfig - config = VIMEConfig( - task="regression", - loss_fn="MSELoss", metric=metric, metric_hparams={}, - input_dim=config["input_dim"], hidden_dim=config["hidden_dim"], - output_dim=config["output_dim"], - alpha1=config["alpha1"], alpha2=config["alpha2"], beta=config["beta"], K=config["K"], p_m = config["p_m"], - num_categoricals=num_categoricals, num_continuous=num_continuous - ) - pl_vime = VIMELightning(config) - - pl_vime = fit_model(pl_vime, config) - pl_vime.set_second_phase() - - trainer = Trainer( - accelerator = accelerator, - max_epochs = max_epochs, - num_sanity_val_steps = 2, - callbacks = None, - ) - - test_ds = VIMEDataset(X_valid, category_cols=category_cols, continuous_cols=continuous_cols, is_second_phase=True, is_regression=True) - from torch.utils.data import SequentialSampler, DataLoader - import torch - test_dl = DataLoader(test_ds, batch_size, shuffle=False, sampler = SequentialSampler(test_ds), num_workers=n_jobs) - - preds = trainer.predict(pl_vime, test_dl) - - preds = torch.concat([out.cpu() for out in preds]).squeeze() - - mse = mean_squared_error(y_valid, preds) - - return mse - - study = optuna.create_study(direction="minimize",sampler=optuna.samplers.TPESampler(seed=random_seed)) - study.optimize(objective, n_trials=2, show_progress_bar=False) - - print("Number of finished trials: ", len(study.trials)) - print("Best trial:") - - - trial = study.best_trial - - print(" MSE: {}".format(trial.value)) - print(" Best hyperparameters: ", trial) + from benchmark.pipelines import VIMEPipeLine + + args = get_args() + + pipeline = VIMEPipeLine(args, data, label, continuous_cols, category_cols, output_dim, metric, metric_hparams) + + pipeline.benchmark() if __name__ == "__main__": test_vime_classification()
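Note on the XGBPipeLine fix: evaluate() now calls pl_module.predict_proba(X) instead of pl_module.predict(X). With xgboost's scikit-learn API, XGBClassifier.predict() returns hard class labels while predict_proba() returns an (n_samples, n_classes) array of class probabilities, so the change hands the benchmark's metric the full probability matrix rather than already-thresholded labels. The snippet below is only a self-contained illustration of that difference, not the benchmark's own evaluation code; how the pipeline's metric object consumes the probabilities is not shown here.

from xgboost import XGBClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score

# Small synthetic binary classification problem.
X, y = make_classification(n_samples=200, n_features=8, random_state=0)
clf = XGBClassifier(n_estimators=10).fit(X, y)

labels = clf.predict(X)        # shape (200,): hard 0/1 predictions
proba = clf.predict_proba(X)   # shape (200, 2): per-class probabilities

# A label-based metric such as accuracy needs the argmax of the probabilities;
# for a binary problem this matches the thresholded predict() output.
print(accuracy_score(y, proba.argmax(axis=1)) == accuracy_score(y, labels))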
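Note on the refactored tests: each test now follows one pattern: load a benchmark dataset from benchmark.datasets, build the shared run arguments with misc.get_args(), and delegate to the matching pipeline's benchmark() method. The sketch below shows that pattern for a hypothetical new method; NewMethodPipeLine is an assumed class name, everything else mirrors the refactored test files above. Since .pytest_cache/ is listed in .gitignore, these tests are presumably collected and run with pytest from the repository root.

import os
import sys

from misc import get_args

# Make the repository root and the benchmark package importable,
# as the refactored tests above do.
here = os.path.dirname(__file__)
sys.path.append(os.path.join(here, '..'))
sys.path.append(os.path.join(here, '..', 'benchmark'))

def test_newmethod_classification():
    # load_diabetes comes from benchmark.datasets, as in test_scarf.py.
    from benchmark.datasets import load_diabetes
    data, label, continuous_cols, category_cols, output_dim, metric, metric_hparams = load_diabetes()

    # NewMethodPipeLine is hypothetical; the patch wires up SCARFPipeLine,
    # SubTabPipeLine and VIMEPipeLine in exactly the same way.
    from benchmark.pipelines import NewMethodPipeLine

    args = get_args()
    pipeline = NewMethodPipeLine(args, data, label, continuous_cols, category_cols,
                                 output_dim, metric, metric_hparams)
    pipeline.benchmark()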
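misc.get_args() replaces the run settings that each test previously hard-coded with a single shared SimpleNamespace. Because it is a plain namespace, a test can override an individual field without touching the shared defaults; the values below are illustrative assumptions, not part of the patch.

from misc import get_args

args = get_args()
args.max_epochs = 3        # slightly longer smoke test than the default of 1
args.accelerator = "gpu"   # assumes a CUDA-capable machine and a GPU-enabled PyTorch build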