Commit 94e977db authored by Orel Gueta's avatar Orel Gueta
Browse files

Add option to compare various regressors with different features.

parent 381aceb3
......@@ -16,20 +16,40 @@ if __name__ == '__main__':
labels, train_features = event_classes.nominal_labels_train_features()
#FIXME
vars_to_remove = ['meanPedvar_Image', 'av_cross', 'av_crossO']
for this_var in vars_to_remove:
train_features.remove(this_var)
# models_to_compare = [
# # 'linear_regression',
# # 'random_forest',
# # 'MLP',
# # 'MLP_relu',
# # 'MLP_logistic',
# # 'MLP_uniform',
# # 'MLP_small',
# # 'MLP_lbfgs',
# # 'BDT',
# # 'ridge',
# # 'SVR',
# # 'linear_SVR',
# # 'SGD',
# # 'MLP_small_less_vars',
# 'MLP_meanPedvar_av_cross_O',
# ]
models_to_compare = [
'linear_regression',
'random_forest',
'MLP',
'MLP_relu',
'MLP_logistic',
'MLP_uniform',
'MLP_small',
# 'MLP_lbfgs',
# 'BDT',
'ridge',
# 'SVR',
'linear_SVR',
# 'SGD',
'MLP_loss_sum',
'MLP_NTrig',
'MLP_meanPedvar_Image',
'MLP_av_fui',
'MLP_av_cross',
'MLP_av_crossO',
'MLP_av_R',
'MLP_av_ES',
'MLP_MWR',
'MLP_MLR',
]
trained_models = event_classes.load_models(models_to_compare)
......@@ -42,14 +62,14 @@ if __name__ == '__main__':
dtf_e_test,
this_trained_model,
this_trained_model_name,
train_features,
labels
# train_features,
# labels
)
plt.savefig('plots/{}.pdf'.format(this_trained_model_name))
plt.clf()
plt = event_classes.plot_score_comparison(dtf_e_test, trained_models, train_features, labels)
plt = event_classes.plot_score_comparison(dtf_e_test, trained_models)
plt.savefig('plots/compare_scores.pdf')
plt.clf()
......@@ -570,7 +570,7 @@ def define_regressors():
return regressors
def train_models(dtf_e_train, train_features, labels, regressors):
def train_models(dtf_e_train, regressors):
'''
Train all the models in regressors, using the data in dtf_e_train.
The models are trained per energy range in dtf_e_train.
......@@ -581,21 +581,26 @@ def train_models(dtf_e_train, train_features, labels, regressors):
Each entry in the dict is a DataFrame containing the data to train with.
The keys of the dict are the energy ranges of the data.
Each DataFrame is assumed to contain all 'train_features' and 'labels'.
train_features: list
List of variable names to train with.
labels: str
Name of the variable used as the labels in the training.
regressors: dict of sklearn regressors
A dictionary of regressors to train as returned from define_regressors().
regressors: a nested dict of regressors:
1st dict:
keys=model names, values=2nd dict
2nd dict:
'model':dict of sklearn regressors (as returned from define_regressors());
'train_features': list of variable names to train with.
'labels': Name of the variable used as the labels in the training.
Returns
-------
A nested dictionary trained models:
A nested dictionary of trained models, train_features and labels:
1st dict:
keys=model names, values=2nd dict
2nd dict:
keys=energy ranges, values=trained models
keys=energy ranges, values=3rd dict
3rd dict:
'model': trained model for this energy range
'train_features': list of variable names to train with.
'labels': Name of the variable used as the labels in the training.
'''
models = dict()
......@@ -604,11 +609,14 @@ def train_models(dtf_e_train, train_features, labels, regressors):
for this_e_range in dtf_e_train.keys():
print('Training {} in the energy range - {}'.format(this_model, this_e_range))
X_train = dtf_e_train[this_e_range][train_features].values
y_train = dtf_e_train[this_e_range][labels].values
models[this_model][this_e_range] = copy.deepcopy(
this_regressor.fit(X_train, y_train)
X_train = dtf_e_train[this_e_range][this_regressor['train_features']].values
y_train = dtf_e_train[this_e_range][this_regressor['labels']].values
models[this_model][this_e_range] = dict()
models[this_model][this_e_range]['train_features'] = this_regressor['train_features']
models[this_model][this_e_range]['labels'] = this_regressor['labels']
models[this_model][this_e_range]['model'] = copy.deepcopy(
this_regressor['model'].fit(X_train, y_train)
)
return models
......@@ -623,10 +631,14 @@ def save_models(trained_models):
Parameters
----------
trained_models: a nested dict of trained sklearn regressor per energy range.
1st dict:
keys=model names, values=2nd dict
2nd dict:
keys=energy ranges, values=trained models
1st dict:
keys=model names, values=2nd dict
2nd dict:
keys=energy ranges, values=3rd dict
3rd dict:
'model': trained model for this energy range
'train_features': list of variable names trained with.
'labels': Name of the variable used as the labels in the training.
'''
for regressor_name, this_regressor in trained_models.items():
......@@ -697,15 +709,18 @@ def load_models(regressor_names=list()):
----------
regressor_names: list of str
A list of regressor names to load from disk
# TODO: take the default list from define_regressors()?
Returns
-------
trained_models: a nested dict of trained sklearn regressor per energy range.
1st dict:
keys=model names, values=2nd dict
2nd dict:
keys=energy ranges, values=trained models
1st dict:
keys=model names, values=2nd dict
2nd dict:
keys=energy ranges, values=3rd dict
3rd dict:
'model': trained model for this energy range
'train_features': list of variable names trained with.
'labels': Name of the variable used as the labels in the training.
'''
trained_models = defaultdict(dict)
......@@ -759,7 +774,7 @@ def plot_pearson_correlation(dtf, title):
return plt
def plot_test_vs_predict(dtf_e_test, trained_models, trained_model_name, train_features, labels):
def plot_test_vs_predict(dtf_e_test, trained_models, trained_model_name):
'''
Plot true values vs. the predictions of the model for all energy bins.
......@@ -769,15 +784,15 @@ def plot_test_vs_predict(dtf_e_test, trained_models, trained_model_name, train_f
Each entry in the dict is a DataFrame containing the data to test with.
The keys of the dict are the energy ranges of the data.
Each DataFrame is assumed to contain all 'train_features' and 'labels'.
trained_models: dict of a trained sklearn regressor per energy range
(keys=energy ranges, values=trained models).
trained_models: a nested dict of one trained sklearn regressor per energy range.
1st dict:
keys=energy ranges, values=2nd dict
2nd dict:
'model': trained model for this energy range
'train_features': list of variable names trained with.
'labels': Name of the variable used as the labels in the training.
trained_model_name: str
Name of the regressor trained.
train_features: list
List of variable names trained with.
labels: str
Name of the variable used as the labels in the training.
Returns
-------
......@@ -791,10 +806,10 @@ def plot_test_vs_predict(dtf_e_test, trained_models, trained_model_name, train_f
for i_plot, (this_e_range, this_model) in enumerate(trained_models.items()):
X_test = dtf_e_test[this_e_range][train_features].values
y_test = dtf_e_test[this_e_range][labels].values
X_test = dtf_e_test[this_e_range][this_model['train_features']].values
y_test = dtf_e_test[this_e_range][this_model['labels']].values
y_pred = this_model.predict(X_test)
y_pred = this_model['model'].predict(X_test)
ax = axs[int(np.floor((i_plot)/ncols)), (i_plot) % 4]
......@@ -882,9 +897,10 @@ def plot_matrix(dtf, train_features, labels, n_types=2):
return grid_plots
def plot_score_comparison(dtf_e_test, trained_models, train_features, labels):
def plot_score_comparison(dtf_e_test, trained_models):
'''
Plot the score of the model as a function of energy.
#TODO add a similar function that plots from saved scores instead of calculating every time.
Parameters
----------
......@@ -896,12 +912,11 @@ def plot_score_comparison(dtf_e_test, trained_models, train_features, labels):
1st dict:
keys=model names, values=2nd dict
2nd dict:
keys=energy ranges, values=trained models
train_features: list
List of variable names trained with.
labels: str
Name of the variable used as the labels in the training.
keys=energy ranges, values=3rd dict
3rd dict:
'model': trained model for this energy range
'train_features': list of variable names trained with.
'labels': Name of the variable used as the labels in the training.
Returns
-------
......@@ -922,12 +937,12 @@ def plot_score_comparison(dtf_e_test, trained_models, train_features, labels):
for this_e_range, this_model in trained_model.items():
X_test = dtf_e_test[this_e_range][train_features].values
y_test = dtf_e_test[this_e_range][labels].values
X_test = dtf_e_test[this_e_range][this_model['train_features']].values
y_test = dtf_e_test[this_e_range][this_model['labels']].values
y_pred = this_model.predict(X_test)
y_pred = this_model['model'].predict(X_test)
scores[this_regressor_name].append(this_model.score(X_test, y_test))
scores[this_regressor_name].append(this_model['model'].score(X_test, y_test))
# rms_scores[this_regressor_name].append(metrics.mean_squared_error(y_test, y_pred))
ax.plot(energy_bins, scores[this_regressor_name], label=this_regressor_name)
......
import argparse
from pathlib import Path
import copy
import event_classes
if __name__ == '__main__':
    # Script entry point: for each variable in vars_to_remove, train an
    # 'MLP_small' regressor on the nominal feature list with that single
    # variable left out, then save the trained models and the test data.

    parser = argparse.ArgumentParser(
        description=(
            # The trailing space matters: adjacent string literals are
            # concatenated verbatim, so without it the help text would read
            # "features.Results".
            'Train event classes model with various training features. '
            'Results are saved in the models directory.'
        )
    )
    # No options are defined; parsing still provides -h/--help.
    args = parser.parse_args()

    # Alternative input file, kept for reference:
    # dl2_file_name = '/lustre/fs21/group/cta/users/maierg/analysis/AnalysisData/uploadDL2/Paranal_20deg/gamma_cone.S.3HB9-FD_ID0.eff-0.root'
    dl2_file_name = '/lustre/fs21/group/cta/users/maierg/analysis/AnalysisData/uploadDL2/Paranal_20deg/gamma_onSource.S.3HB9-FD_ID0.eff-0.root'

    # Load the data, bin it in energy and split each bin into train/test.
    dtf = event_classes.extract_df_from_dl2(dl2_file_name)
    dtf_e = event_classes.bin_data_in_energy(dtf)
    dtf_e_train, dtf_e_test = event_classes.split_data_train_test(dtf_e)

    labels, train_features = event_classes.nominal_labels_train_features()
    all_models = event_classes.define_regressors()

    # Each variable listed here yields one model, named 'MLP_<variable>',
    # trained without that variable.
    vars_to_remove = [
        'loss_sum',
        'NTrig',
        'meanPedvar_Image',
        'av_fui',
        'av_cross',
        'av_crossO',
        'av_R',
        'av_ES',
        'MWR',
        'MLR',
    ]

    models_to_train = dict()
    for this_var in vars_to_remove:
        # Work on a copy so the nominal feature list stays intact for the
        # next iteration. list.remove raises ValueError if this_var is not
        # one of the nominal features.
        _vars = copy.copy(train_features)
        _vars.remove(this_var)

        model_name = 'MLP_{}'.format(this_var)
        # The same 'MLP_small' entry is reused for every variant; only the
        # list of training features differs between them.
        models_to_train[model_name] = {
            'train_features': _vars,
            'labels': labels,
            'model': all_models['MLP_small'],
        }

    trained_models = event_classes.train_models(
        dtf_e_train,
        models_to_train
    )
    event_classes.save_models(trained_models)
    event_classes.save_test_dtf(dtf_e_test)
......@@ -25,7 +25,7 @@ if __name__ == '__main__':
all_models = event_classes.define_regressors()
models_to_train = {
# 'linear_regression': all_models['linear_regression'],
'random_forest': all_models['random_forest'], # Do not use, performs bad and takes lots of disk space
# 'random_forest': all_models['random_forest'], # Do not use, performs bad and takes lots of disk space
# 'MLP': all_models['MLP'],
# 'MLP_relu': all_models['MLP_relu'],
# 'MLP_logistic': all_models['MLP_logistic'],
......@@ -38,24 +38,14 @@ if __name__ == '__main__':
# 'linear_SVR': all_models['linear_SVR'],
# 'SGD': all_models['SGD'],
}
for this_model in models_to_train.values():
this_model['train_features'] = train_features
this_model['labels'] = labels
trained_models = event_classes.train_models(
dtf_e_train,
train_features,
labels,
models_to_train
)
event_classes.save_models(trained_models)
event_classes.save_test_dtf(dtf_e_test)
# Path('plots').mkdir(parents=True, exist_ok=True)
# for this_trained_model_name, this_trained_model in trained_models.items():
# plt = event_classes.plot_test_vs_predict(
# dtf_e_test,
# this_trained_model,
# this_trained_model_name,
# train_features,
# labels
# )
# plt.savefig('plots/{}.pdf'.format(this_trained_model_name))
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment