Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Tarek Hassan
iact_event_types
Commits
94e977db
Commit
94e977db
authored
Jan 08, 2021
by
Orel Gueta
Browse files
Add option to compare various regressors with different features.
parent
381aceb3
Changes
4
Hide whitespace changes
Inline
Side-by-side
compare_models.py
View file @
94e977db
...
...
@@ -16,20 +16,40 @@ if __name__ == '__main__':
labels
,
train_features
=
event_classes
.
nominal_labels_train_features
()
#FIXME
vars_to_remove
=
[
'meanPedvar_Image'
,
'av_cross'
,
'av_crossO'
]
for
this_var
in
vars_to_remove
:
train_features
.
remove
(
this_var
)
# models_to_compare = [
# # 'linear_regression',
# # 'random_forest',
# # 'MLP',
# # 'MLP_relu',
# # 'MLP_logistic',
# # 'MLP_uniform',
# # 'MLP_small',
# # 'MLP_lbfgs',
# # 'BDT',
# # 'ridge',
# # 'SVR',
# # 'linear_SVR',
# # 'SGD',
# # 'MLP_small_less_vars',
# 'MLP_meanPedvar_av_cross_O',
# ]
models_to_compare
=
[
'linear_regression'
,
'random_forest'
,
'MLP'
,
'MLP_relu'
,
'MLP_logistic'
,
'MLP_uniform'
,
'MLP_small'
,
# 'MLP_lbfgs',
# 'BDT',
'ridge'
,
# 'SVR',
'linear_SVR'
,
# 'SGD',
'MLP_loss_sum'
,
'MLP_NTrig'
,
'MLP_meanPedvar_Image'
,
'MLP_av_fui'
,
'MLP_av_cross'
,
'MLP_av_crossO'
,
'MLP_av_R'
,
'MLP_av_ES'
,
'MLP_MWR'
,
'MLP_MLR'
,
]
trained_models
=
event_classes
.
load_models
(
models_to_compare
)
...
...
@@ -42,14 +62,14 @@ if __name__ == '__main__':
dtf_e_test
,
this_trained_model
,
this_trained_model_name
,
train_features
,
labels
#
train_features,
#
labels
)
plt
.
savefig
(
'plots/{}.pdf'
.
format
(
this_trained_model_name
))
plt
.
clf
()
plt
=
event_classes
.
plot_score_comparison
(
dtf_e_test
,
trained_models
,
train_features
,
labels
)
plt
=
event_classes
.
plot_score_comparison
(
dtf_e_test
,
trained_models
)
plt
.
savefig
(
'plots/compare_scores.pdf'
)
plt
.
clf
()
event_classes.py
View file @
94e977db
...
...
@@ -570,7 +570,7 @@ def define_regressors():
return
regressors
def
train_models
(
dtf_e_train
,
train_features
,
labels
,
regressors
):
def
train_models
(
dtf_e_train
,
regressors
):
'''
Train all the models in regressors, using the data in dtf_e_train.
The models are trained per energy range in dtf_e_train.
...
...
@@ -581,21 +581,26 @@ def train_models(dtf_e_train, train_features, labels, regressors):
Each entry in the dict is a DataFrame containing the data to train with.
The keys of the dict are the energy ranges of the data.
Each DataFrame is assumed to contain all 'train_features' and 'labels'.
train_features: list
List of variable names to train with.
labels: str
Name of the variable used as the labels in the training.
regressors: dict of sklearn regressors
A dictionary of regressors to train as returned from define_regressors().
regressors: a nested dict of regressors:
1st dict:
keys=model names, values=2nd dict
2nd dict:
'model':dict of sklearn regressors (as returned from define_regressors());
'train_features': list of variable names to train with.
'labels': Name of the variable used as the labels in the training.
Returns
-------
A nested dictionary trained models:
A nested dictionary of trained models
, train_features and labels
:
1st dict:
keys=model names, values=2nd dict
2nd dict:
keys=energy ranges, values=trained models
keys=energy ranges, values 3rd dict
3rd dict:
'model': trained model for this energy range
'train_features': list of variable names to train with.
'labels': Name of the variable used as the labels in the training.
'''
models
=
dict
()
...
...
@@ -604,11 +609,14 @@ def train_models(dtf_e_train, train_features, labels, regressors):
for
this_e_range
in
dtf_e_train
.
keys
():
print
(
'Training {} in the energy range - {}'
.
format
(
this_model
,
this_e_range
))
X_train
=
dtf_e_train
[
this_e_range
][
train_features
].
values
y_train
=
dtf_e_train
[
this_e_range
][
labels
].
values
models
[
this_model
][
this_e_range
]
=
copy
.
deepcopy
(
this_regressor
.
fit
(
X_train
,
y_train
)
X_train
=
dtf_e_train
[
this_e_range
][
this_regressor
[
'train_features'
]].
values
y_train
=
dtf_e_train
[
this_e_range
][
this_regressor
[
'labels'
]].
values
models
[
this_model
][
this_e_range
]
=
dict
()
models
[
this_model
][
this_e_range
][
'train_features'
]
=
this_regressor
[
'train_features'
]
models
[
this_model
][
this_e_range
][
'labels'
]
=
this_regressor
[
'labels'
]
models
[
this_model
][
this_e_range
][
'model'
]
=
copy
.
deepcopy
(
this_regressor
[
'model'
].
fit
(
X_train
,
y_train
)
)
return
models
...
...
@@ -623,10 +631,14 @@ def save_models(trained_models):
Parameters
----------
trained_models: a nested dict of trained sklearn regressor per energy range.
1st dict:
keys=model names, values=2nd dict
2nd dict:
keys=energy ranges, values=trained models
1st dict:
keys=model names, values=2nd dict
2nd dict:
keys=energy ranges, values 3rd dict
3rd dict:
'model': trained model for this energy range
'train_features': list of variable names trained with.
'labels': Name of the variable used as the labels in the training.
'''
for
regressor_name
,
this_regressor
in
trained_models
.
items
():
...
...
@@ -697,15 +709,18 @@ def load_models(regressor_names=list()):
----------
regressor_names: list of str
A list of regressor names to load from disk
# TODO: take the default list from define_regressors()?
Returns
-------
trained_models: a nested dict of trained sklearn regressor per energy range.
1st dict:
keys=model names, values=2nd dict
2nd dict:
keys=energy ranges, values=trained models
1st dict:
keys=model names, values=2nd dict
2nd dict:
keys=energy ranges, values 3rd dict
3rd dict:
'model': trained model for this energy range
'train_features': list of variable names trained with.
'labels': Name of the variable used as the labels in the training.
'''
trained_models
=
defaultdict
(
dict
)
...
...
@@ -759,7 +774,7 @@ def plot_pearson_correlation(dtf, title):
return
plt
def
plot_test_vs_predict
(
dtf_e_test
,
trained_models
,
trained_model_name
,
train_features
,
labels
):
def
plot_test_vs_predict
(
dtf_e_test
,
trained_models
,
trained_model_name
):
'''
Plot true values vs. the predictions of the model for all energy bins.
...
...
@@ -769,15 +784,15 @@ def plot_test_vs_predict(dtf_e_test, trained_models, trained_model_name, train_f
Each entry in the dict is a DataFrame containing the data to test with.
The keys of the dict are the energy ranges of the data.
Each DataFrame is assumed to contain all 'train_features' and 'labels'.
trained_models: dict of a trained sklearn regressor per energy range
(keys=energy ranges, values=trained models).
trained_models: a nested dict of one trained sklearn regressor per energy range.
1st dict:
keys=energy ranges, values 2nd dict
2nd dict:
'model': trained model for this energy range
'train_features': list of variable names trained with.
'labels': Name of the variable used as the labels in the training.
trained_model_name: str
Name of the regressor trained.
train_features: list
List of variable names trained with.
labels: str
Name of the variable used as the labels in the training.
Returns
-------
...
...
@@ -791,10 +806,10 @@ def plot_test_vs_predict(dtf_e_test, trained_models, trained_model_name, train_f
for
i_plot
,
(
this_e_range
,
this_model
)
in
enumerate
(
trained_models
.
items
()):
X_test
=
dtf_e_test
[
this_e_range
][
train_features
].
values
y_test
=
dtf_e_test
[
this_e_range
][
labels
].
values
X_test
=
dtf_e_test
[
this_e_range
][
this_model
[
'
train_features
'
]
].
values
y_test
=
dtf_e_test
[
this_e_range
][
this_model
[
'
labels
'
]
].
values
y_pred
=
this_model
.
predict
(
X_test
)
y_pred
=
this_model
[
'model'
]
.
predict
(
X_test
)
ax
=
axs
[
int
(
np
.
floor
((
i_plot
)
/
ncols
)),
(
i_plot
)
%
4
]
...
...
@@ -882,9 +897,10 @@ def plot_matrix(dtf, train_features, labels, n_types=2):
return
grid_plots
def
plot_score_comparison
(
dtf_e_test
,
trained_models
,
train_features
,
labels
):
def
plot_score_comparison
(
dtf_e_test
,
trained_models
):
'''
Plot the score of the model as a function of energy.
#TODO add a similar function that plots from saved scores instead of calculating every time.
Parameters
----------
...
...
@@ -896,12 +912,11 @@ def plot_score_comparison(dtf_e_test, trained_models, train_features, labels):
1st dict:
keys=model names, values=2nd dict
2nd dict:
keys=energy ranges, values=trained models
train_features: list
List of variable names trained with.
labels: str
Name of the variable used as the labels in the training.
keys=energy ranges, values 3rd dict
3rd dict:
'model': trained model for this energy range
'train_features': list of variable names trained with.
'labels': Name of the variable used as the labels in the training.
Returns
-------
...
...
@@ -922,12 +937,12 @@ def plot_score_comparison(dtf_e_test, trained_models, train_features, labels):
for
this_e_range
,
this_model
in
trained_model
.
items
():
X_test
=
dtf_e_test
[
this_e_range
][
train_features
].
values
y_test
=
dtf_e_test
[
this_e_range
][
labels
].
values
X_test
=
dtf_e_test
[
this_e_range
][
this_model
[
'
train_features
'
]
].
values
y_test
=
dtf_e_test
[
this_e_range
][
this_model
[
'
labels
'
]
].
values
y_pred
=
this_model
.
predict
(
X_test
)
y_pred
=
this_model
[
'model'
]
.
predict
(
X_test
)
scores
[
this_regressor_name
].
append
(
this_model
.
score
(
X_test
,
y_test
))
scores
[
this_regressor_name
].
append
(
this_model
[
'model'
]
.
score
(
X_test
,
y_test
))
# rms_scores[this_regressor_name].append(metrics.mean_squared_error(y_test, y_pred))
ax
.
plot
(
energy_bins
,
scores
[
this_regressor_name
],
label
=
this_regressor_name
)
...
...
study_features.py
0 → 100644
View file @
94e977db
import
argparse
from
pathlib
import
Path
import
copy
import
event_classes
if __name__ == '__main__':
    # Feature study: for every variable in vars_to_remove, set up one
    # 'MLP_small' regressor to be trained on the nominal training features
    # minus that single variable. The trained models and the test sample are
    # then handed to event_classes for saving.

    parser = argparse.ArgumentParser(
        description=(
            # Adjacent string literals are concatenated verbatim, so the
            # separating space has to be part of the first literal.
            'Train event classes model with various training features. '
            'Results are saved in the models directory.'
        )
    )

    # No options are defined; parsing still provides --help and rejects
    # unexpected command-line arguments.
    args = parser.parse_args()

    # Alternative input sample, kept for reference:
    # dl2_file_name = '/lustre/fs21/group/cta/users/maierg/analysis/AnalysisData/uploadDL2/Paranal_20deg/gamma_cone.S.3HB9-FD_ID0.eff-0.root'
    dl2_file_name = '/lustre/fs21/group/cta/users/maierg/analysis/AnalysisData/uploadDL2/Paranal_20deg/gamma_onSource.S.3HB9-FD_ID0.eff-0.root'

    # Load the DL2 file, bin it in energy and split it into train/test samples.
    dtf = event_classes.extract_df_from_dl2(dl2_file_name)
    dtf_e = event_classes.bin_data_in_energy(dtf)
    dtf_e_train, dtf_e_test = event_classes.split_data_train_test(dtf_e)

    labels, train_features = event_classes.nominal_labels_train_features()

    all_models = event_classes.define_regressors()

    # Each variable listed here is left out of the training of exactly one model.
    vars_to_remove = [
        'loss_sum',
        'NTrig',
        'meanPedvar_Image',
        'av_fui',
        'av_cross',
        'av_crossO',
        'av_R',
        'av_ES',
        'MWR',
        'MLR',
    ]

    models_to_train = dict()
    for this_var in vars_to_remove:
        # Work on a copy so the nominal feature list stays intact for the
        # next iteration. Assuming train_features is a list, remove() raises
        # ValueError when this_var is not one of the nominal features.
        reduced_features = copy.copy(train_features)
        reduced_features.remove(this_var)

        # The model is named after the variable that was left out.
        # NOTE(review): every entry references the same all_models['MLP_small']
        # object, and this assumes that object is the regressor itself -
        # confirm against define_regressors() and train_models().
        models_to_train['MLP_{}'.format(this_var)] = {
            'train_features': reduced_features,
            'labels': labels,
            'model': all_models['MLP_small'],
        }

    trained_models = event_classes.train_models(dtf_e_train, models_to_train)
    event_classes.save_models(trained_models)
    event_classes.save_test_dtf(dtf_e_test)
train_models.py
View file @
94e977db
...
...
@@ -25,7 +25,7 @@ if __name__ == '__main__':
all_models
=
event_classes
.
define_regressors
()
models_to_train
=
{
# 'linear_regression': all_models['linear_regression'],
'random_forest'
:
all_models
[
'random_forest'
],
# Do not use, performs bad and takes lots of disk space
#
'random_forest': all_models['random_forest'], # Do not use, performs bad and takes lots of disk space
# 'MLP': all_models['MLP'],
# 'MLP_relu': all_models['MLP_relu'],
# 'MLP_logistic': all_models['MLP_logistic'],
...
...
@@ -38,24 +38,14 @@ if __name__ == '__main__':
# 'linear_SVR': all_models['linear_SVR'],
# 'SGD': all_models['SGD'],
}
for
this_model
in
models_to_train
.
values
():
this_model
[
'train_features'
]
=
train_features
this_model
[
'labels'
]
=
labels
trained_models
=
event_classes
.
train_models
(
dtf_e_train
,
train_features
,
labels
,
models_to_train
)
event_classes
.
save_models
(
trained_models
)
event_classes
.
save_test_dtf
(
dtf_e_test
)
# Path('plots').mkdir(parents=True, exist_ok=True)
# for this_trained_model_name, this_trained_model in trained_models.items():
# plt = event_classes.plot_test_vs_predict(
# dtf_e_test,
# this_trained_model,
# this_trained_model_name,
# train_features,
# labels
# )
# plt.savefig('plots/{}.pdf'.format(this_trained_model_name))
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment