# Notebook-wide setup: IPython display behaviour, imports, and seaborn/matplotlib styling.
from IPython.core.interactiveshell import InteractiveShell #display full output instead of just the last one
InteractiveShell.ast_node_interactivity = "all"
import toolz as fp
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import cloudpickle
import gc
sns.set()
%matplotlib inline
pd.set_option('display.max_rows', 50)
# Build an 8-color HLS palette and install it (reversed) as the default seaborn palette.
palette = sns.hls_palette(8, h=.7, s=.9, l=0.5)
sns.palplot(palette[::-1])
sns.set_palette(palette[::-1])
gray = "444444"  # hex gray without '#'; rcParams' color validator accepts rc-file-style hex
plt.rcParams['figure.facecolor'] = '1' # background color
plt.rcParams['axes.facecolor'] = '1' # plot background color
plt.rcParams['grid.color'] = '0.8'
plt.rcParams['grid.alpha'] = .4
plt.rcParams['axes.edgecolor'] = '1'
plt.rcParams['lines.linewidth'] = 1
plt.rcParams['grid.linestyle'] = '-'
plt.rcParams['axes.axisbelow'] = True  # draw grid lines below the plotted elements
plt.rcParams['axes.labelcolor'] = gray
plt.rcParams['text.color'] = gray
plt.rcParams['xtick.color'] = gray
plt.rcParams['ytick.color'] = gray
sns.set_style("whitegrid")
sns.set_context("notebook")
%config InlineBackend.figure_format = "retina"
import pickle
from pathlib import Path
class dotdict(dict):
    """A dict whose keys can also be read and written as attributes.

    Attribute reads delegate to ``dict.get``, so a missing key yields
    ``None`` instead of raising ``AttributeError``.
    """

    def __getattr__(self, name):
        # dict.get semantics: unknown names resolve to None, never raise.
        return self.get(name)

    def __setattr__(self, name, value):
        self[name] = value

    def __delattr__(self, name):
        # mirrors dict.__delitem__: raises KeyError for missing keys
        del self[name]
@fp.curry
def open_pickled_object(file, full_path):
    """
    Open a pickled object by specifying the full_path and then the filename.

    Curried: ``open_pickled_object(full_path=...)`` returns a loader that only
    needs the filename.

    Parameters
    ----------
    file : str
        String representing the filename
    full_path : str
        String representing the path to the file

    Returns
    -------
    object
        The unpickled object.
    """
    # Context manager closes the handle (the original `pickle.load(open(...))`
    # leaked it); pathlib joins the path portably.
    with open(Path(full_path) / file, "rb") as handle:
        return pickle.load(handle)
def open_experiment(did, model_type="BINARY"):
    """
    Open the experiment files from a single dataset by providing the ID.

    Parameters
    ----------
    did : int
        the dataset id
    model_type : str
        String representing the model type. Currently only BINARY is supported

    Returns
    -------
    dotdict
        The six pickled experiment artifacts, or a plain empty dict when the
        experiment directory does not exist.
    """
    base_path = "../" + model_type
    full_path = base_path + "/" + str(did)
    # Guard clause: bail out early when either directory is missing.
    if not (Path(base_path).is_dir() and Path(full_path).is_dir()):
        return {}
    loader = open_pickled_object(full_path=full_path)
    artifact_names = (
        "analyzer_info",
        "df_stats",
        "final_result",
        "hp_tree",
        "openml_object",
        "shape",
    )
    return dotdict({name: loader(name + ".pkl") for name in artifact_names})
def get_all_available_experiments(model_type="BINARY"):
    """
    Return a list of the datasets ID's available by reading the folder.

    Parameters
    ----------
    model_type : str
        String representing the model type. Currently only BINARY is supported

    Returns
    -------
    list of int
        One entry per sub-directory; each directory name is a dataset id.
    """
    experiments_dir = Path("../" + model_type)
    dataset_ids = []
    for entry in experiments_dir.iterdir():
        # non-directory entries (stray files) are ignored
        if entry.is_dir():
            dataset_ids.append(int(entry.name))
    return dataset_ids
def new_axis(figsize=(15, 8), title=None):
    """
    Creates a new matplotlib axis and returns it.

    Parameters
    ----------
    figsize : Tuple(Numeric, Numeric)
        the figure size
    title : str
        the figure title

    Returns
    -------
    matplotlib.axes.Axes
        The single axis of a freshly created figure.
    """
    # subplots() returns (figure, axes); only the axis is needed by callers
    ax = plt.subplots(figsize=figsize)[1]
    if title is None:
        return ax
    ax.set_title(title, size=17)
    return ax
def build_experiment_stats(did_list, model_type="BINARY"):
    """
    Return the meta-statistics from the provided datasets ids.

    Parameters
    ----------
    did_list : List[int]
        list of dataset ids
    model_type : str
        String representing the model type. Currently only BINARY is supported

    Returns
    -------
    pd.DataFrame
        Per-feature statistics of every available dataset (transposed so
        features are rows), with a ``did`` column identifying the dataset.
        Empty when no dataset directory exists.

    Raises
    ------
    ValueError
        If the base model-type directory does not exist.
    """
    base_path = "../" + model_type
    if not Path(base_path).is_dir():
        raise ValueError("Base directory doesn't exist")
    open_stats = open_pickled_object(file="df_stats.pkl")
    # Collect per-dataset frames and concatenate once at the end: the original
    # pd.concat inside the loop re-copied the accumulator (quadratic).
    frames = []
    for did in did_list:
        full_path = base_path + "/" + str(did)
        if Path(full_path).is_dir():
            frames.append(open_stats(full_path=full_path).T.assign(did=did))
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)
A single experiment contains all the results of changing the hyperparameters, along with the meta-statistics for a given dataset. The original dataset and its information are stored in the openml_object
# Load one experiment (OpenML dataset id 72) and peek at its contents.
experiment = open_experiment(did=72)
experiment.keys()
experiment.openml_object.name
Important way to select the experiments:
If a given factor is NaN in the dataframe, it means that in that specific experiment run, that parameter wasn't changed!
I'm analyzing three parameters: `max_depth`, `num_estimators` and `learning_rate`. So if I want to get the dataframe of the experiments where the only factor analyzed is `max_depth`, for example, I get all rows where `max_depth` is not NaN but `num_estimators` and `learning_rate` are.
def build_result_dataframe(log_list):
    """
    Given a log with all experiment results of a given dataset, this function
    returns the dataset concatenated.

    Each log entry yields two rows (train and test): the hyperparameter columns
    are duplicated on both rows and the metric columns come from the entry's
    "train_result" / "test_result" dicts.

    Parameters
    ----------
    log_list : List[dict]
        a list of experiment logs

    Returns
    -------
    pd.DataFrame
        All experiments stacked, with a ``type`` column ("train"/"test").
    """
    train_key, test_key = "train_result", "test_result"
    all_experiments = []
    for base_result in log_list:
        # Hyperparameters only (everything except the two metric dicts) —
        # stdlib dict comprehension replaces toolz's fp.dissoc.
        params = {k: v for k, v in base_result.items() if k not in (train_key, test_key)}
        # index=range(2) duplicates the params over two rows, aligning them
        # with the train + test metric rows below.
        base_df = pd.DataFrame(params, index=range(2))
        metrics_df = pd.concat(
            (
                pd.DataFrame(base_result[train_key], index=[0]).assign(type="train"),
                pd.DataFrame(base_result[test_key], index=[0]).assign(type="test"),
            )
        ).reset_index(drop=True)
        all_experiments.append(pd.concat((base_df, metrics_df), axis=1))
    # concat takes the list directly; wrapping it in a generator added nothing
    return pd.concat(all_experiments, sort=False, ignore_index=True)
# Flatten the logged results of the loaded experiment into one dataframe.
did_3_results = build_result_dataframe(experiment.final_result)
did_3_results.shape
did_3_results.head()
We can look at each dataset to check the results of different parameters.
TIP: Using Pandas' `.query()` method, we can use `col == col` to filter the rows where `col` isn't NaN, and `col != col` to filter the rows where `col` IS NaN.
# num_estimators sweep: rows where ONLY num_estimators was varied
# (max_depth and learning_rate are NaN), one subplot per metric.
_, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 10))
axes = axes.ravel()
res_num_est = did_3_results.query(
    "(num_estimators == num_estimators) & (max_depth != max_depth) & (learning_rate != learning_rate)"
).assign(num_estimators=lambda df: df.num_estimators.astype(int))
for ax_index, metric in enumerate(
    ("auc_evaluator__target", "brier_score_evaluator__target", "logloss_evaluator__target")
):
    sns.pointplot(x="num_estimators", y=metric, hue="type", data=res_num_est, ax=axes[ax_index])
axes[0].set_title("AUC")
axes[1].set_title("Brier Score")
axes[2].set_title("Logloss")
# ax.set_title("Num_estimators impact on AUC", size=17);
plt.tight_layout();
# max_depth sweep: rows where ONLY max_depth was varied; 2x2 grid,
# three subplots used, the 4th one is blanked at the end.
_, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 5))
axes = axes.ravel()
res_max_depth = did_3_results.query("(num_estimators != num_estimators) & (max_depth == max_depth) & (learning_rate != learning_rate)").assign(max_depth=lambda df: df.max_depth.astype(int))
for ax_index, metric in enumerate(("auc_evaluator__target", "brier_score_evaluator__target", "logloss_evaluator__target")):
    sns.pointplot(
        x="max_depth",
        y=metric,
        hue="type",
        data=res_max_depth,
        ax=axes[ax_index]
    )
axes[0].set_title("AUC")
axes[1].set_title("Brier Score")
axes[2].set_title("Logloss")
plt.axis('off')  # NOTE(review): relies on the unused 4th subplot still being the current axis — confirm
plt.tight_layout();
# learning_rate sweep: rows where ONLY learning_rate was varied; same 2x2
# layout as above with the unused 4th subplot blanked.
_, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 5))
axes = axes.ravel()
res_lr = did_3_results.query("(num_estimators != num_estimators) & (max_depth != max_depth) & (learning_rate == learning_rate)")
for ax_index, metric in enumerate(("auc_evaluator__target", "brier_score_evaluator__target", "logloss_evaluator__target")):
    sns.pointplot(
        x="learning_rate",
        y=metric,
        hue="type",
        data=res_lr,
        ax=axes[ax_index]
    )
axes[0].set_title("AUC")
axes[1].set_title("Brier Score")
axes[2].set_title("Logloss")
plt.axis('off')  # NOTE(review): relies on the unused 4th subplot still being the current axis — confirm
plt.tight_layout();
There are multiple ways to analyze the joint impact of these hyperparameters. In this case I'm showing the full joint distribution, i.e. varying every possible hyperparameter
# NOTE(review): `kk` is computed but never used afterwards — the catplot cell
# below repeats the same query/assign inline (without the final outlier filter).
kk = (
    did_3_results.query(
        "(num_estimators == num_estimators) & (max_depth == max_depth) & (learning_rate == learning_rate) & (type == 'test')"
    ).assign(
        num_estimators=lambda df: df.num_estimators.astype("int"),
        max_depth=lambda df: df.max_depth.astype("int"),
        learning_rate=lambda df: df.learning_rate.round(4),
    ).query("not(learning_rate == 0.3 & max_depth == 11)")  # drop one hyperparameter combination
)
# Full joint view (test set only): runs where ALL three hyperparameters were
# varied — one panel per learning_rate, one line per max_depth.
g = sns.catplot(
    x="num_estimators",
    y="auc_evaluator__target",
    col="learning_rate",
    hue="max_depth",
    kind="point",
    data=did_3_results.query(
        "(num_estimators == num_estimators) & (max_depth == max_depth) & (learning_rate == learning_rate) & (type == 'test')"
    ).assign(
        num_estimators=lambda df: df.num_estimators.astype("int"),
        max_depth=lambda df: df.max_depth.astype("int"),
        learning_rate=lambda df: df.learning_rate.round(4),
    ),
    col_wrap=2,
    height=3.8,
    aspect=2,
    sharey=False,
    markers="_",
)
g.fig.canvas.draw()  # materialize tick labels before restyling them below
for ax in g.axes:
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize=18)
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=18)
plt.gcf().suptitle("All Hyperparameters", size=17, y=1.02);
# plt.tight_layout();
# num_estimators x max_depth interaction (learning_rate NaN, i.e. not varied),
# one panel per train/test split.
g = sns.catplot(
    x="num_estimators",
    y="auc_evaluator__target",
    col="type",
    hue="max_depth",
    kind="point",
    data=(
        did_3_results.query(
            "(num_estimators == num_estimators) & (max_depth == max_depth) & (learning_rate != learning_rate)"
        ).assign(
            num_estimators=lambda df: df.num_estimators.astype("int"),
            max_depth=lambda df: df.max_depth.astype("int")
        )
    ),
    col_wrap=1,
    height=3.8,
    aspect=2,
    sharey=False,
    markers="_",
)
g.fig.canvas.draw()  # materialize tick labels before restyling them below
for ax in g.axes:
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=16)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize=16)
plt.gcf().suptitle("AUC - num_estimators x max_depth", size=17, y=1.02);
# learning_rate x max_depth interaction (num_estimators NaN, i.e. not varied),
# one panel per train/test split.
g = sns.catplot(x="learning_rate", y="auc_evaluator__target", col="type", hue="max_depth", kind="point",
                data=(did_3_results.query("(num_estimators != num_estimators) & (max_depth == max_depth) & (learning_rate == learning_rate)")
                      .assign(max_depth=lambda df: df.max_depth.astype("int"), learning_rate=lambda df: df.learning_rate.round(4))),
                col_wrap=1, height=3.8, aspect=2, sharey=False, markers="_")
g.fig.canvas.draw()  # materialize tick labels before restyling them below
for ax in g.axes:
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=16)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize=16)
plt.gcf().suptitle("AUC - learning_rate x max_depth", size=17, y=1.02);
The meta-statistics
are statistics calculated directly from information about the structure of the dataset and the features. They're not related to a machine learning model, but only to a static dataset. Specifically, they're calculated for each feature of the dataset, and encompass things like variable type, skewness of the data, cardinality of categorical features, etc.
# Load the per-feature meta-statistics of every experiment found on disk.
all_dids = get_all_available_experiments()
print(f"Number of available datasets: {len(all_dids)}")
experiment_statistics = build_experiment_stats(all_dids);
There's a high number of Categorical features in OpenML datasets, probably due to the bioinfo datasets, which contain a lot of categorical data
# Horizontal bar chart: how many features of each var_type exist across all datasets.
fig, ax = plt.subplots(figsize=(17, 10))
ax.set_title("Feature type distribution", fontsize=17)
experiment_statistics.groupby("var_type").count().top.rename("count").sort_values().plot(kind="barh", ax=ax)
ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=17);
Most of the categorical features are binary
# Cardinality of categorical features; '--' marks rows where cardinality does
# not apply. Values are clipped at the 95th percentile so the long tail
# doesn't stretch the axis.
cardinality = experiment_statistics[["cardinality"]].query("cardinality != '--'").dropna().astype(int)
fig, ax = plt.subplots(figsize=(17, 10))
ax.set_title("Cardinality count of categorical features", fontsize=17)
cardinality.assign(
    cardinality=lambda df: df.cardinality.clip(upper=np.percentile(df.cardinality, 95))
).cardinality.value_counts().sort_index().plot(kind="bar", ax=ax);
fig.canvas.draw() # draw the figure before updating the xlabels
ax.set_xticklabels(ax.get_xticklabels(), rotation=0, fontsize=17);
Most of the numerical features have 0 skewness, but a lower percentage have higher skewness
# Skewness histogram of numeric features, clipped at the 95th percentile;
# '--' marks rows where skewness does not apply.
skewness = experiment_statistics[["skewness"]].query("skewness != '--'").dropna().astype(float)
fig, ax = plt.subplots(figsize=(17, 10))
ax.set_title("Skewness distribution")
skewness.assign(skewness=lambda df: df.skewness.clip(upper=np.percentile(df.skewness, 95))).plot.hist(
    ax=ax,
);
fig.canvas.draw() # draw the figure before updating the xlabels
ax.set_xticklabels(ax.get_xticklabels(), rotation=0, fontsize=17);
To represent a dataset I need to aggregate the feature-wise meta-statistics for each dataset into "aggregated meta-statistics". This way we can represent a dataset as a point in this high dimensional space, and compare points (datasets) together. The aggregated features are calculated on the did_agg
function, and are basically ratio of feature types, number of features, mean of skewness, etc
OBS: variance is defined as: 1 - (v / float(df.shape[0]))
, where v is the number of rows the most common value of a given feature appears. 0 is usually bad.
This is not the typical statistical variance, but rather a measure of how much diversity of values a categorical feature has in the dataset
def did_agg(did_df):
    """
    Aggregate the per-feature meta-statistics of one dataset into a single row.

    Parameters
    ----------
    did_df : pd.DataFrame
        Per-feature statistics of a single dataset (one row per feature).
        Cells where a statistic does not apply hold the placeholder '--'.

    Returns
    -------
    pd.Series
        Dataset-level aggregates: size, mean skewness/variance, categorical
        cardinality, and per-type feature ratios.
    """
    d = {}
    # '--' marks features where skewness doesn't apply (e.g. categoricals).
    did_skewness = did_df.query("skewness != '--'").skewness.dropna().astype(float)
    d["num_rows"] = did_df["count"].iloc[0]
    # one row per feature; len(df) replaces the original `did_df.T.columns.shape[0]`
    d["num_features"] = len(did_df)
    d["mean_skewness"] = 0.0 if did_skewness.empty else did_skewness.mean()
    d["mean_variance"] = did_df.query("variance != '--'").variance.mean()
    d["num_categorical"] = did_df["var_type"].isin(["Categorical"]).sum()
    # Explicit parentheses: without them, `&` binds tighter than the
    # comparisons in the query string, so the original expression chained
    # through `cardinality & cardinality` instead of AND-ing two filters.
    # The +1 in the denominator avoids division by zero for fully numeric datasets.
    d["sum_cardinality_over_categorical"] = did_df.query(
        "(cardinality == cardinality) & (cardinality != '--')"
    ).cardinality.astype(int).sum() / (d["num_categorical"] + 1)
    # feature type count
    d["categorical_ratio"] = d["num_categorical"] / d["num_features"]
    d["numeric_ratio"] = did_df["var_type"].isin(["Numeric"]).sum() / d["num_features"]
    d["boolean_ratio"] = did_df["var_type"].isin(["Boolean"]).sum() / d["num_features"]
    d["constant_ratio"] = did_df["var_type"].isin(["Constant"]).sum() / d["num_features"]
    return pd.Series(
        d,
        index=[
            "num_rows",
            "num_features",
            "mean_variance",
            "mean_skewness",
            "num_categorical",
            "sum_cardinality_over_categorical",
            "categorical_ratio",
            "numeric_ratio",
            "boolean_ratio",
            "constant_ratio",
        ],
    )
I'll cap the aggregated columns for visualizations purposes.
Example of capper - by the 95th percentile
# Example: the 95th percentile of 0..99 — values above it would be capped.
np.percentile(pd.Series(np.arange(100)), 95)
Scatter matrix plot
Checking for interactions between the aggregated meta-statistics
# Aggregate per-feature stats into one row per dataset, then cap each column
# (here at its 90th percentile) so extreme datasets don't dominate the plots.
aggregated_stats = experiment_statistics.groupby("did").apply(did_agg)
capped_agg_stats = aggregated_stats.assign(
    # i=col binds the column name at definition time, avoiding the
    # late-binding-closure pitfall inside the dict comprehension.
    **{f"{col}": lambda df, i=col: df[i].clip(upper=np.percentile(df[i], 90)) for col in aggregated_stats.columns}
)
pd.plotting.scatter_matrix(
    capped_agg_stats, ax=new_axis(title="Scatter matrix for aggregated statistics", figsize=(30, 17))
);
aggregated_stats.shape
aggregated_stats.head()
Visualizing the datasets in lower dimensions
from sklearn.preprocessing import StandardScaler
# Standardize the capped aggregates so every statistic weighs equally.
scaled_agg_stats = StandardScaler().fit_transform(capped_agg_stats)
scaled_agg_stats.shape
from sklearn.manifold import TSNE
# 2-D embedding for visualization; fixed random_state for reproducibility.
tsne = TSNE(n_components=2, verbose=1, random_state=42)
tsne_results = tsne.fit_transform(scaled_agg_stats)
t-SNE projections of the dataset
np.random.seed(42)
# Scatter of the datasets in t-SNE space; one point per dataset.
df_subset = pd.DataFrame()
df_subset["tsne-one"] = tsne_results[:, 0]
df_subset["tsne-two"] = tsne_results[:, 1]
sns.scatterplot(x="tsne-one", y="tsne-two", data=df_subset, ax=new_axis(), s=1000);
To analyze all experiments, first I need to cluster them together into categories that make sense; The initial idea was to cluster them together by hand, manually selecting clusters that have similar meta-statistics distributions; However, this proved to be somewhat difficult when analyzing all meta-statistics together, so I decided to use a clustering approach and check if the selected clusters made sense.
IMPORTANT:
The number of points (datasets) isn't that big, so I can actually check the dendrogram and see if the clusters are well behaved.
I used complete
linkage and euclidean metric, using the standardized aggregated statistics, as these were the ones that gave the best clustering structure.
The clustering method, linkage and metrics chosen here can highly impact the analysis I did after. The idea is not to have a generic analysis that can work with any dataset, but to actually measure impact of the hyperparameters into specific clusters that follow a somewhat similar distribution of the aggregated statistics
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
# NOTE(review): the title below and the text above say "complete" linkage, but
# the method actually used is 'centroid' — confirm which one was intended.
avg_linkage = linkage(scaled_agg_stats, method='centroid', metric='euclidean', optimal_ordering=False)
plt.figure(figsize=(20, 10))
plt.title('Hierarchical Clustering Dendrogram - Euclidean, complete linkage', fontsize=18)
plt.xlabel('Dataset ID')
plt.ylabel('Distance')
dendrogram(avg_linkage, labels=aggregated_stats.index.values, leaf_rotation=90,leaf_font_size=12.0);
from sklearn.cluster import KMeans
# Cluster the standardized aggregates into 6 groups.
kmeans_learner = KMeans(n_clusters=6, n_init=50, random_state=42)
kmeans_clusters = kmeans_learner.fit_predict(scaled_agg_stats)
# Re-embed datasets AND the 6 centroids together so both appear in one projection.
centroid_projections = TSNE(n_components=2, perplexity=27, verbose=1, random_state=42).fit_transform(np.vstack([scaled_agg_stats, kmeans_learner.cluster_centers_]))
# calculated_clusters = fcluster(avg_linkage, t=4, criterion="distance")
# Labels: 1-based cluster number for datasets, the string "centroid" for centroids.
calculated_clusters = np.concatenate(
    [kmeans_clusters + 1, np.array(["centroid"] * kmeans_learner.cluster_centers_.shape[0])]
)
df_centroids = pd.DataFrame()
df_centroids["tsne-one"] = centroid_projections[:, 0]
df_centroids["tsne-two"] = centroid_projections[:, 1]
df_centroids["kmeans"] = calculated_clusters
df_centroids["type"] = np.where(df_centroids.kmeans == "centroid", "centroid", "dataset")
ax = new_axis(title="t-SNE projection - Clusters and centroids")
sns.scatterplot(
    x="tsne-one",
    y="tsne-two",
    hue="kmeans",
    style="type",
    data=df_centroids,
    palette=sns.color_palette("hls", len(df_centroids.kmeans.unique())),
    ax=ax,
    s=1000
)
nclusters = len(np.unique(kmeans_clusters))
print(nclusters)
# Attach the (1-based) cluster label back onto both stats tables.
aggregated_stats["cluster"] = kmeans_clusters + 1
capped_agg_stats["cluster"] = kmeans_clusters + 1
Getting a sense of each cluster
Check if the distribution of aggregated meta-statistics over each cluster is different. This is more than a sanity check: It's a way to understand what are the main differences and similarities between the clusters!
import joypy
from matplotlib.colors import ListedColormap
def plot_cluster_joyplot(aggregated_df, column):
    """Joyplot of `column`'s distribution, one ridge per cluster.

    Clusters where the column has at most one distinct value are dropped,
    since a density estimate needs more than one distinct point.
    """
    cluster_cmap = ListedColormap(sns.color_palette("hls", nclusters).as_hex())
    # A tiny constant offset keeps the density plot from choking on ties.
    jittered = aggregated_df.assign(**{f"{column}": lambda df: df[column] + np.random.random() * 1e-10})
    # We don't want to plot a distribution when there's only one distinct value
    uniques_per_cluster = (
        aggregated_df.groupby("cluster")[column]
        .apply(lambda x: len(x.unique()))
        .reset_index()
    )
    to_remove = uniques_per_cluster.query(f"{column} <= 1").cluster.values
    fig, axes = joypy.joyplot(
        jittered.query("cluster not in @to_remove"),
        by="cluster",
        column=column,
        figsize=(10, 10),
        colormap=cluster_cmap,
        ylim="own",
        overlap=0,
    )
    fig.suptitle(f"Distribution of {column} by cluster", size=18, y=1.05)
# Quick look at the cluster assignments and the size of each cluster.
aggregated_stats.cluster.values
aggregated_stats.cluster.value_counts().sort_index()
When checking the distribution of the engineered variables the clusters seem to separate datasets in a good manner
# One joyplot per aggregated statistic (the last column, "cluster", is skipped).
for metric in aggregated_stats.columns[:-1]:
    plot_cluster_joyplot(capped_agg_stats, column=metric)
Each cluster now has similar datasets, according to the aggregated meta-statistics chosen. The idea is then to analyze, in each cluster, the impact (if there's an impact) of the hyperparameters in the AUC.
For this reason, I'm calculating the `delta_auc` and other metrics (`brier` and `logloss`) relative to the baseline. The baseline metric for a given dataset and hyperparameter is the metric we would obtain when training a model using LightGBM default parameters (or the closest one to the default parameters).
def calculate_cluster_object(cluster_id, agg_stat_df):
    """Concatenate the flattened experiment results of every dataset in a cluster.

    Parameters
    ----------
    cluster_id : int
        1-based cluster label to select.
    agg_stat_df : pd.DataFrame
        Aggregated stats indexed by dataset id, with a ``cluster`` column.
    """
    member_dids = agg_stat_df.query(f"cluster == {cluster_id}").index.values
    print(member_dids.shape)
    # one flattened result frame per dataset, tagged with its did
    per_did_frames = {
        did: build_result_dataframe(open_experiment(did=did).final_result).assign(did=did)
        for did in member_dids
    }
    print(per_did_frames.keys())
    return pd.concat(per_did_frames.values())
import cloudpickle
import gc
from collections import defaultdict
# for cluster_id in range(1, 7):
# print(f"\n------- Building cluster {cluster_id} -------\n")
# curr_cluster = calculate_cluster_object(cluster_id=cluster_id, agg_stat_df=aggregated_stats)
# open(f"cluster_{cluster_id}.pkl", "wb").write(cloudpickle.dumps(curr_cluster))
# del curr_cluster
# gc.collect()
Dataset names for each cluster
# Print the OpenML dataset names belonging to each of the 6 clusters.
for cluster_id in range(1, 7):
    cluster_dids = aggregated_stats.query(f"cluster == {cluster_id}").index.values
    print(f"------- {cluster_id} -------")
    for did in cluster_dids:
        print(f"\t• {open_experiment(did=did).openml_object.name}")
    gc.collect()  # the loaded experiment objects can be large; free them eagerly
# When True, every plotting helper below also writes its figure to a .png file.
SAVE_PLOTS_GLOBAL = True
def update_pointplot_xlabels(multiple_of=5, rotation=0, fontsize=20, xticks=True):
    """update the xlabels and xticks of the last axis passed (using plt.gca).
    with this function, it can plot xticks/labels with different periods, using the
    multiple_of parameter

    When ``xticks`` is True the tick positions themselves are thinned to every
    ``multiple_of``-th one; otherwise all positions are kept but the skipped
    labels are blanked out.
    """
    ax = plt.gca()
    if xticks:
        new_labels = [lab for idx, lab in enumerate(ax.get_xticklabels()) if idx % multiple_of == 0]
        kept_ticks = [tick for idx, tick in enumerate(ax.get_xticks()) if idx % multiple_of == 0]
        ax.set_xticks(kept_ticks)
    else:
        new_labels = [lab if idx % multiple_of == 0 else "" for idx, lab in enumerate(ax.get_xticklabels())]
    ax.set_xticklabels(new_labels, rotation=rotation, fontsize=fontsize)
# Plot the deltas calculated from a given analysis
def plot_deltas(df_list, treatment, y="delta_metric", scale=1.0, cap_outliers=False, save_plot=False, loc=0, **kwargs):
    """For each metric dataframe, pointplot the per-dataset delta against the treatment.

    Only test-set rows are plotted. With ``cap_outliers`` the y values are
    clipped to the [5th, 95th] percentile range of the test rows. Figures are
    saved when ``save_plot`` or the global ``SAVE_PLOTS_GLOBAL`` is set (the
    filename uses the global ``curr_cluster``).

    Parameters
    ----------
    df_list : List[pd.DataFrame]
        One dataframe per metric, each with a ``metric`` column.
    treatment : str
        Hyperparameter column plotted on the x axis.
    y : str
        Column plotted on the y axis.
    **kwargs
        Forwarded to ``update_pointplot_xlabels``.
    """
    for df in df_list:
        metric = df.metric.unique()[0]
        if df[treatment].dtype == float:
            df[treatment] = df[treatment].round(5)
        temp_df = df.query(test_pred)
        if cap_outliers:
            # BUGFIX: cap on the already-filtered test rows; the original
            # restarted from `df`, silently re-introducing the train rows.
            temp_df = temp_df.assign(
                **{f"{y}": lambda df: df[y].clip(lower=np.percentile(df[y], 5), upper=np.percentile(df[y], 95))}
            )
        sns.pointplot(
            x=treatment,
            y=y,
            data=temp_df,
            hue="did",
            linestyles="",
            ci=None,
            ax=new_axis(title=f"Delta {metric} (NEW - BASELINE) from LightGBM baseline - Test set", figsize=(22, 10)),
            scale=scale,
        )
        update_pointplot_xlabels(**kwargs)
        plt.legend(loc=loc)
        if save_plot or SAVE_PLOTS_GLOBAL:
            plt.savefig(
                "delta_" + metric[: metric.find("_")] + "_cluster" + str(curr_cluster) + "_" + treatment + ".png",
                bbox_inches="tight",
                pad_inches=0,
            )
        plt.show()
# Query predicates reused throughout the analysis to split train/test rows.
train_pred = "type == 'train'"
test_pred = "type == 'test'"
@fp.curry
def calculate_delta_baseline(did_df, metric="auc_evaluator__target", diff_clause=None):
    """For a given dataframe, calculate the delta of the metric related to the baseline metric,
    which is the metric where diff_column is closer to 0.
    The dataframe is considered as a group from groupby, where each dataframe contains experiment for a given
    dataset id

    Parameters
    ----------
    did_df : pd.DataFrame
        Experiments of one dataset id.
    metric : str
        Metric column to compare against the baseline.
    diff_clause : str, tuple or list, optional
        Column(s) measuring distance to the default hyperparameters; the test
        row minimizing them defines the baseline. Defaults to ["diff_num_est"].
    """
    # NOTE: in the original, the docstring above sat *after* these statements,
    # making it a no-op string literal instead of documentation.
    diff_clause = ["diff_num_est"] if diff_clause is None else diff_clause
    if isinstance(diff_clause, tuple):
        # tuples arrive from grouped_experiment; sort_values expects str/list
        diff_clause = list(diff_clause)
    return did_df.assign(
        # baseline = metric of the test row closest to the default hyperparameters
        baseline_metric=lambda df: df[metric].loc[df.query(test_pred).sort_values(by=diff_clause).index[0]],
        delta_metric=lambda df: df[metric] - df.baseline_metric,
        metric=metric
    ).reset_index(drop=True)
# this is used for multiple experiments
def grouped_experiment(cluster, query_string, name, diff_hyperparam_fn, diff_clause=None):
    """Compute per-dataset deltas of AUC / Brier / logloss for one hyperparameter.

    Parameters
    ----------
    cluster : pd.DataFrame
        Concatenated experiment results of one cluster.
    query_string : str
        Row filter selecting the runs where only this hyperparameter varied.
    name : str
        Hyperparameter name; the default diff column is ``diff_<name>``.
    diff_hyperparam_fn : Callable
        Pipe step that adds the diff column(s) to the dataframe.
    diff_clause : list, optional
        Explicit diff column(s) overriding the default.

    Returns
    -------
    list of pd.DataFrame
        ``[auc_deltas, brier_deltas, logloss_deltas]``.
    """
    # The original default was written `(f"diff_{name}")` with a comment-free
    # tuple-like look — plain parentheses make it just a string; spell the
    # cases out explicitly instead.
    if diff_clause is None:
        diff_cols = f"diff_{name}"
    elif isinstance(diff_clause, list):
        diff_cols = tuple(diff_clause)  # converted back to a list inside calculate_delta_baseline
    else:
        diff_cols = diff_clause
    delta_baseline_auc_fn = calculate_delta_baseline(metric="auc_evaluator__target", diff_clause=diff_cols)
    delta_baseline_brier_fn = calculate_delta_baseline(metric="brier_score_evaluator__target", diff_clause=diff_cols)
    delta_baseline_logloss_fn = calculate_delta_baseline(metric="logloss_evaluator__target", diff_clause=diff_cols)
    grouped_all_dids = (
        cluster.query(query_string)
        .pipe(diff_hyperparam_fn)
        .groupby("did")
    )
    auc_deltas = grouped_all_dids.apply(delta_baseline_auc_fn)
    brier_deltas = grouped_all_dids.apply(delta_baseline_brier_fn)
    logloss_deltas = grouped_all_dids.apply(delta_baseline_logloss_fn)
    print(auc_deltas.shape)
    return [auc_deltas, brier_deltas, logloss_deltas]
# used to convert a dictionary with the p-values into a dataframe
def build_statistics_df(statistical_dict):
    """Flatten ``{treatment: {metric: {test_name: p_value}}}`` into a tidy dataframe.

    Parameters
    ----------
    statistical_dict : dict
        Two-level dict of statistical-test p-values.

    Returns
    -------
    pd.DataFrame
        One row per (treatment, metric), one column per statistical test.
    """
    rows = []
    for treatment, metric_d in statistical_dict.items():
        for metric, statistical_results_d in metric_d.items():
            rows.append(
                pd.DataFrame(statistical_results_d, index=[0]).assign(
                    # strip the suffix: "auc_evaluator__target" -> "auc"
                    metric=metric[: metric.find("_evaluator__target")],
                    treatment=treatment,
                )
            )
    if not rows:
        return pd.DataFrame()
    # single concat instead of growing a dataframe inside the loops (quadratic)
    return pd.concat(rows, ignore_index=True)
# used to color red the pvalues < 0.05
def color_negative_red(val):
    """Styler helper: paint significant p-values (< 0.05) red, everything else black."""
    is_significant = not isinstance(val, str) and val < 0.05
    return f"color: {'red' if is_significant else 'black'}"
Before checking the impact of the hyperparameters directly, we'll first test if there's a statistically significant difference in them.
Basically the steps done to check and analyze the data are: define the treatment as the hyperparameter being changed; the value analyzed is the delta_metric.
"If your data are heteroscedastic, Kruskal–Wallis is no better than one-way anova, and may be worse" - http://www.biostathandbook.com/kruskalwallis.html
def fit_single_factor_model(df, treatment, value_col="value"):
    """Fit a single-factor model where:
    y_ij = mu + t_i + eps_ij
    The estimates done are:
    mu_pred = mean(Y)
    t_i_pred = mean(Y_i) - mean(Y), for i = 1, 2, ...n
    Intuitively, the overall mean is estimated by the grand average of the observations and
    any treatment effect is just the difference between the treatment average and the grand average.

    Parameters
    ----------
    df : pd.DataFrame
        Observations; must contain the `treatment` and `value_col` columns.
    treatment : str
        Column holding the treatment level of each observation.
    value_col : str
        Column holding the observed response.

    Returns
    -------
    pd.DataFrame
        Indexed by treatment level, with columns
        ["treatment_mean", "mu_pred", "tau_i_pred"].
    """
    mu_pred = df[value_col].values.mean()
    output_cols = ["treatment_mean", "mu_pred", "tau_i_pred"]
    return (
        df.assign(
            # transform("mean") on the value column only — the original passed
            # np.mean over every column and then selected, which is deprecated
            # and does needless work.
            treatment_mean=lambda df: df.groupby(treatment)[value_col].transform("mean"),
            tau_i_pred=lambda df: df.treatment_mean - mu_pred,
            mu_pred=mu_pred,
        )
        .groupby(treatment)[output_cols]
        .agg("first")
    )
import scipy.stats as stats
import pingouin as pg
def plot_residuals(df, sfm_params, treatment, value_col="value", normalize=True, scipy_plot=False):
    """QQ-plot of the single-factor-model residuals against a normal distribution.

    Parameters
    ----------
    df : pd.DataFrame
        Observations, with `treatment`, `value_col` and a `metric` column.
    sfm_params : pd.DataFrame
        Output of ``fit_single_factor_model`` (indexed by treatment level).
    treatment : str
        Treatment column name (merge key).
    value_col : str
        Observed-response column.
    normalize : bool
        Standardize the residuals before plotting.
    scipy_plot : bool
        Use scipy's probplot instead of pingouin's qqplot.

    Returns
    -------
    pd.DataFrame
        `df` with ``y_treatment_pred`` and ``residual`` columns added.
    """
    residuals_df = df.merge(right=sfm_params, on=treatment).assign(
        y_treatment_pred=lambda df: df.mu_pred + df.tau_i_pred, residual=lambda df: df[value_col] - df.y_treatment_pred
    )
    # BUGFIX: fitted residuals always sum to ~0 by construction, so the original
    # `residual.sum() == 0.0` check only detected the degenerate case by
    # accident; "no residuals at all" means every residual is exactly zero.
    if residuals_df.residual.abs().sum() == 0.0:
        print("------ Treatment has no residuals - Cannot apply Single-factor model -------")
        return residuals_df
    residuals_vector = residuals_df.residual.values
    if normalize:
        res = (residuals_vector - np.mean(residuals_vector)) / np.std(residuals_vector)
    else:
        res = residuals_vector
    ax = new_axis()
    if scipy_plot:
        stats.probplot(res, plot=ax)
    else:
        pg.qqplot(res, dist="norm", ax=ax)
    metric = df.metric.unique()[0]
    if SAVE_PLOTS_GLOBAL:
        plt.savefig(
            "qqplot_" + metric[: metric.find("_")] + "_cluster" + str(curr_cluster) + "_" + treatment + ".png",
            bbox_inches="tight",
            pad_inches=0,
        )
    return residuals_df
def plot_sfm_model(
    sfm_model,
    treatment,
    metric,
    residuals_width=3,
    num_points_to_highlight=None,
    highlight_size=100,
    lower_is_better=True,
    **kwargs,
):
    """Visualize a fitted single-factor model: the overall-mean line,
    per-treatment deviation bars, and highlighted best/worst treatment levels.

    Parameters
    ----------
    sfm_model : pd.DataFrame
        Output of ``fit_single_factor_model`` (treatment as index).
    treatment : str
        Treatment column name (x axis).
    metric : str
        Metric name, used in the title and the saved filename.
    residuals_width : numeric
        Width of the deviation bars.
    num_points_to_highlight : int, optional
        How many extreme treatment levels to mark on each side; defaults to
        ~12% of the levels (minimum 5).
    highlight_size : numeric
        Scatter size of the highlighted points.
    lower_is_better : bool
        Whether a lower delta-metric is an improvement (flips best/worst labels).
    **kwargs
        Forwarded to ``update_pointplot_xlabels``.

    Returns
    -------
    dict
        ``{"metric": metric, "sfm_df": plotted dataframe}``.
    """
    fig, ax = plt.subplots(figsize=(20, 10))
    curr_df = sfm_model.reset_index().sort_values(treatment)
    # split treatment levels by the sign of their deviation from the grand mean
    neg_df = curr_df.query("treatment_mean-mu_pred <= 0")
    pos_df = curr_df.query("treatment_mean-mu_pred > 0")
    # the num of points is equivalent of taking the mean of points in each category and getting 25% of the mean number
    num_points_to_highlight = (
        int((pos_df.shape[0] + neg_df.shape[0]) * 0.12) if num_points_to_highlight is None else num_points_to_highlight
    )
    num_points_to_highlight = 5 if num_points_to_highlight <= 0 else num_points_to_highlight
    c1, c2, c3, c4, c5 = sns.color_palette("deep")[:5]
    # categorical data needs to be treated differently
    if pd.api.types.is_categorical_dtype(curr_df[treatment]):
        ax.plot(
            list(curr_df[treatment].values),
            list(curr_df["mu_pred"].values),
            color=c1,
            linewidth=3,
            label="overall mean",
        )
    else:
        ax.plot(curr_df[treatment], curr_df["mu_pred"], color=c1, linewidth=3, label="overall mean")
    # bars from the grand mean up to each positive treatment mean...
    ax.bar(
        pos_df[treatment],
        height=pos_df["treatment_mean"] - pos_df["mu_pred"].iloc[0],
        bottom=pos_df["mu_pred"].iloc[0],
        width=residuals_width,
        color=c2,
        edgecolor=None,
        linewidth=0.1,
    )
    # ...and from each negative treatment mean back up to the grand mean
    ax.bar(
        neg_df[treatment],
        height=neg_df["tau_i_pred"].abs(),
        bottom=neg_df["treatment_mean"],
        width=residuals_width,
        color=c3,
        edgecolor=None,
        linewidth=0.1,
    )
    # highlight the most extreme treatment levels on each side
    top_treatment_means = pos_df.sort_values(by="treatment_mean", ascending=False).iloc[:num_points_to_highlight]
    worst_treatment_means = neg_df.sort_values(by="treatment_mean").iloc[:num_points_to_highlight]
    if not top_treatment_means.empty:
        ax.scatter(
            top_treatment_means[treatment],
            top_treatment_means["treatment_mean"],
            s=highlight_size,
            c=np.array(c4)[:, None].T if lower_is_better else np.array(c5)[:, None].T,
            zorder=120,
            label="worst treatments" if lower_is_better else "best treatments",
        )
    if not worst_treatment_means.empty:
        ax.scatter(
            worst_treatment_means[treatment],
            worst_treatment_means["treatment_mean"],
            s=highlight_size,
            c=np.array(c5)[:, None].T if lower_is_better else np.array(c4)[:, None].T,
            zorder=120,
            label="best treatments" if lower_is_better else "worst treatments",
        )
    ax.legend()
    ax.set_xlabel(treatment), ax.set_ylabel("delta_metric"), ax.set_title(f"Single-factor model - {metric}", size=18)
    if kwargs:
        fig.canvas.draw()  # draw the figure before updating the xlabels
        update_pointplot_xlabels(**kwargs)
        # NOTE(review): "plot_original" would also reach update_pointplot_xlabels
        # via **kwargs, which does not accept it — confirm callers never pass it
        if "plot_original" in kwargs:
            ax.set_xticks(curr_df[treatment].values)
            ax.set_xticklabels(curr_df[treatment].values)
    if SAVE_PLOTS_GLOBAL:
        plt.savefig(
            "sfm_" + metric[: metric.find("_")] + "_cluster" + str(curr_cluster) + "_" + treatment + ".png",
            bbox_inches="tight",
            pad_inches=0,
        )
    return dict(metric=metric, sfm_df=curr_df)
from IPython.display import display
import scikit_posthocs as sp # perform posthoc tests
def perform_individual_sfm_analysis(
df_list,
treatment,
run_residuals=True,
run_homoscedasticity=True,
run_anova=True,
run_kruskal=True,
run_kruskal_posthoc=False,
run_sfm_plot=True,
sfm_plot_args=None,
statistical_test_dict=None
):
def index_dataframe(df):
return (
df.query(test_pred)
.reset_index(drop=True)
.assign(
delta_metric=lambda df: df.delta_metric.clip(
lower=np.percentile(df.delta_metric, 5), upper=np.percentile(df.delta_metric, 95)
)
)
)
df_list = [index_dataframe(df) for df in df_list]
sfm_plot_args = {} if sfm_plot_args is None else sfm_plot_args
print(
"Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution\n"
)
print(
"Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal"
)
sp_heatmap_args = {
"linewidths": 0.25,
"linecolor": "0.5",
"clip_on": False,
"square": True,
"cbar_ax_bbox": [0.80, 0.35, 0.04, 0.3],
}
sfm_dict = {}
def loop_over_experiments(function):
return_results = []
for df in df_list:
res = function(df)
if res is not None:
return_results.append(res)
return return_results
# define analysis to be performed for each dataset
def residuals(df):
print(f"Residuals probplot: {df.metric.unique()[0]}")
res = plot_residuals(
df=df,
sfm_params=fit_single_factor_model(df, treatment=treatment, value_col="delta_metric"),
treatment=treatment,
value_col="delta_metric",
)
plt.show()
residuals_shapiro = res.residual.values
if residuals_shapiro.shape[0] > 5000:
residuals_shapiro = res.residual.sample(n=5000, random_state=42)
p_shapiro = stats.shapiro(residuals_shapiro)[1]
if statistical_test_dict is not None:
statistical_test_dict[treatment][df.metric.unique()[0]] = {"shapiro": p_shapiro}
print(f"\nShapiro-Wilk p-value = {p_shapiro}. {'REJECT NORMALITY' if p_shapiro <= 0.05 else 'CANNOT REJECT'}")
def homoscedasticity(df):
print(f"\nHomoscedasticity - Metric: {df.metric.unique()[0]}")
hmscd = pg.homoscedasticity(data=df, dv="delta_metric", group=treatment)
if statistical_test_dict is not None:
statistical_test_dict[treatment][df.metric.unique()[0]] = fp.merge(
statistical_test_dict[treatment][df.metric.unique()[0]],
{"levene": hmscd.pval.unique()[0]}
)
display(hmscd)
def anova(df):
print(f"\nANOVA test - Metric: {df.metric.unique()[0]}")
display(df.anova(dv="delta_metric", between=treatment, detailed=True))
def kruskal(df):
    """Run the Kruskal-Wallis H-test across treatment groups for one dataset.

    Stores the p-value in `statistical_test_dict` (when provided); when the
    test is significant at 95%, optionally runs the Conover post-hoc test
    and/or fits and plots the single-factor model, returning its dict.
    Returns {} when there is nothing to report.
    """
    print(f"\nKruskal-wallis test - Metric: {df.metric.unique()[0]}")
    sample = []
    for d_group_idx in df.groupby(treatment).groups.values():
        group_values = df.loc[d_group_idx].delta_metric.values
        # groups with fewer than 2 observations cannot enter the test
        if len(group_values) < 2:
            continue
        sample.append(group_values)
    # defensive double-check: also bail out when fewer than 2 usable groups remain
    if len(sample) < 2 or any(len(treatment_group) < 2 for treatment_group in sample):
        print("--- Not enough Samples in treatment group ---")
        return {}
    # compute the test once (the original called stats.kruskal twice)
    kruskal_result = stats.kruskal(*sample)
    print(kruskal_result)
    kruskal_p = kruskal_result.pvalue
    if statistical_test_dict is not None:
        statistical_test_dict[treatment][df.metric.unique()[0]] = fp.merge(
            statistical_test_dict[treatment][df.metric.unique()[0]],
            {"kruskal": kruskal_p},
        )
    # only drill down when the group effect is statistically significant at 95%
    if kruskal_p < 0.05:
        if run_kruskal_posthoc:
            ph_conover = sp.posthoc_conover(df, val_col="delta_metric", group_col=treatment, p_adjust="holm")
            sp.sign_plot(
                ph_conover,
                ax=new_axis(figsize=(10, 8), title=f"Conover p-values for {df.metric.unique()[0]}"),
                **sp_heatmap_args,
            )
        if run_sfm_plot:
            # whether a smaller delta is better depends on the metric
            delta_lower_is_better = dict(
                brier_score_evaluator__target=True, logloss_evaluator__target=True, auc_evaluator__target=False
            )
            updated_sfm_plot_args = fp.merge(
                sfm_plot_args, dict(lower_is_better=delta_lower_is_better[df.metric.unique()[0]])
            )
            curr_sfm_dict = plot_sfm_model(
                sfm_model=fit_single_factor_model(df, treatment=treatment, value_col="delta_metric").reset_index(),
                treatment=treatment,
                metric=df.metric.unique()[0],
                **updated_sfm_plot_args,
            )
            return curr_sfm_dict
    return {}
# apply each analysis to every dataset
if run_residuals:
loop_over_experiments(residuals)
if run_homoscedasticity:
loop_over_experiments(homoscedasticity)
if run_anova:
loop_over_experiments(anova)
if run_kruskal:
all_sfm_models = loop_over_experiments(kruskal)
return all_sfm_models
return None
from collections import defaultdict

# nested results container: all_cluster_results[cluster_id][treatment] -> sfm models
all_cluster_results = defaultdict(dict)
# context manager so the pickle file handle is closed (original leaked it)
with open("cluster_1.pkl", "rb") as f:
    cluster_1 = pickle.load(f)
cluster_1.shape
print("Number of datasets:", len(cluster_1.did.unique()))
curr_cluster = 1
# datasets excluded from the analysis -- TODO confirm exclusion reason
to_remove_1 = [1597, 4154, 40922]
cluster_1 = cluster_1.query("did not in @to_remove_1")
st_tests_1 = defaultdict(dict)
# experiments where ONLY num_estimators was set; `x != x` is true only for NaN,
# the standard NaN test inside DataFrame.query
individual_num_est_pred = (
    "(num_estimators == num_estimators) & (max_depth != max_depth) & (learning_rate != learning_rate)"
)
def individual_num_est_diff(df):
    """Cast num_estimators to int and add its absolute distance from the
    library default (num_estimators=100)."""
    n_est = df.num_estimators.astype(int)
    return df.assign(num_estimators=n_est, diff_num_est=(n_est - 100).abs())
# per-dataset dataframes for the num_estimators-only experiments of cluster 1
num_est_df_list = grouped_experiment(
    cluster=cluster_1, query_string=individual_num_est_pred, name="num_est", diff_hyperparam_fn=individual_num_est_diff
)
plot_deltas(num_est_df_list, treatment="num_estimators")
When changing num_estimators individually, the Kruskal-Wallis test doesn't provide enough evidence to reject the null hypothesis.
all_cluster_results[curr_cluster]["num_estimators"] = perform_individual_sfm_analysis(
    num_est_df_list,
    treatment="num_estimators",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    statistical_test_dict=st_tests_1
)
# experiments where ONLY max_depth was set (NaN test via `x != x`)
individual_max_depth_pred = (
    "(num_estimators != num_estimators) & (max_depth == max_depth) & (learning_rate != learning_rate)"
)
def individual_max_depth_diff(df):
    """Cast max_depth to int and add its absolute distance from the
    default (max_depth=5, i.e. n_leaves = 31)."""
    depth = df.max_depth.astype(int)
    return df.assign(max_depth=depth, diff_max_depth=(depth - 5).abs())
# max_depth-only experiments of cluster 1
max_depth_df_list = grouped_experiment(
    cluster=cluster_1,
    query_string=individual_max_depth_pred,
    name="max_depth",
    diff_hyperparam_fn=individual_max_depth_diff,
)
plot_deltas(max_depth_df_list, treatment="max_depth", multiple_of=1)
all_cluster_results[curr_cluster]["max_depth"] = perform_individual_sfm_analysis(
    max_depth_df_list,
    treatment="max_depth",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=0.3, highlight_size=400),
    statistical_test_dict=st_tests_1
)
# experiments where ONLY learning_rate was set
individual_lr_pred = "(num_estimators != num_estimators) & (max_depth != max_depth) & (learning_rate == learning_rate)"
def individual_lr_diff(df):
    """Cast learning_rate to float and add its absolute distance from the
    default (learning_rate=0.1)."""
    lr = df.learning_rate.astype(float)
    return df.assign(learning_rate=lr, diff_lr=(lr - 0.1).abs())
# learning_rate-only experiments of cluster 1
lr_df_list = grouped_experiment(
    cluster=cluster_1, query_string=individual_lr_pred, name="lr", diff_hyperparam_fn=individual_lr_diff
)
plot_deltas(lr_df_list, treatment="learning_rate", multiple_of=3, rotation=45)
all_cluster_results[curr_cluster]["learning_rate"] = perform_individual_sfm_analysis(
    lr_df_list,
    treatment="learning_rate",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.002, highlight_size=70, multiple_of=1),
    statistical_test_dict=st_tests_1
)
When analyzing multiple hyperparameters varied together, the baseline is the combination with the smallest difference from the defaults across all of them. Concretely: keep the experiments where the difference in hyperparameter A is minimal, then from those keep the ones where the difference in hyperparameter B is minimal, and so on.
Multiple treatments - Utility function
from functools import reduce
def create_multiple_treatment_column(df, columns, treatment_name_output, order_strategy="rank"):
    """When dealing with multiple treatments, create an ordered categorical
    column representing a given experiment combination of hyperparameters.

    Parameters
    ----------
    df : pd.DataFrame
        Experiment results containing every column in `columns`.
    columns : list of str
        Treatment columns concatenated (joined with " - ") into the new column.
    treatment_name_output : str
        Name of the categorical column to create.
    order_strategy : str
        - "rank": categories ordered by the mean of the per-column ranks
        - "column": categories ordered by `df.sort_values(columns)`

    Raises
    ------
    ValueError
        If `order_strategy` is neither "rank" nor "column".
    """
    # validate up-front; the original compared with `is`, which only works
    # because CPython interns short string literals -- `in`/`==` is correct
    if order_strategy not in ("rank", "column"):
        raise ValueError("Invalid ordering strategy! Needs to be 'rank' or 'column'")

    def create_categorical_col(df):
        # the categorical treatment is a " - "-joined concatenation of all treatments
        all_str_cols = [df[col].map(str) for col in columns]
        return df.assign(
            **{
                f"{treatment_name_output}": pd.Series(
                    reduce(lambda x, y: x + " - " + y, all_str_cols), dtype="category"
                )
            }
        )

    def calc_ranked_mean(df):
        # mean of the ranks of all treatment columns
        all_ranks = [df[col].rank() for col in columns]
        return df.assign(ranked_mean=reduce(lambda x, y: x + y, all_ranks) / len(all_ranks))

    def set_cat_orders(df, ordering_from, categorical_col):
        # order the categories by sorting the dataframe on `ordering_from`
        ordering_from = list(ordering_from) if isinstance(ordering_from, tuple) else ordering_from
        ord_categories = df.sort_values(by=ordering_from)[categorical_col].unique().astype(str)
        return df.assign(
            **{
                f"{categorical_col}": lambda df: df[categorical_col].cat.reorder_categories(
                    ord_categories, ordered=True
                )
            }
        )

    if order_strategy == "rank":
        orders_col = "ranked_mean"
        return (
            df.pipe(create_categorical_col)
            .pipe(calc_ranked_mean)
            .pipe(set_cat_orders, ordering_from=orders_col, categorical_col=treatment_name_output)
        ).drop(columns=orders_col)
    # order_strategy == "column"
    orders_col = tuple(columns)
    return df.pipe(create_categorical_col).pipe(
        set_cat_orders, ordering_from=orders_col, categorical_col=treatment_name_output
    )
# experiments where max_depth AND learning_rate were both set (num_estimators is NaN)
max_depth_lr_pred = "(num_estimators != num_estimators) & (max_depth == max_depth) & (learning_rate == learning_rate)"
# minimize the difference of max_depth then the learning_rate
diff_clause = ("diff_max_depth", "diff_lr")
def max_depth_lr_diff(df):
    """Cast learning_rate/max_depth and add absolute distances from the
    defaults (learning_rate=0.1; max_depth=5, i.e. n_leaves = 31)."""
    lr = df.learning_rate.astype(float)
    depth = df.max_depth.astype(int)
    return df.assign(
        learning_rate=lr,
        max_depth=depth,
        diff_lr=(lr - 0.1).abs(),
        diff_max_depth=(depth - 5).abs(),
    )
# max_depth + learning_rate experiments of cluster 1
max_depth_lr_list = grouped_experiment(
    cluster=cluster_1,
    query_string=max_depth_lr_pred,
    name="max_depth_lr",
    diff_hyperparam_fn=max_depth_lr_diff,
    diff_clause=diff_clause,
)
# create new treatment for all results (lr rounded to 3 decimals for readable labels)
max_depth_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["max_depth", "learning_rate"],
            treatment_name_output="max_depth_lr",
            order_strategy="rank",
        ),
        max_depth_lr_list,
    )
)
plot_deltas(max_depth_lr_list, treatment="max_depth_lr", multiple_of=20, rotation=90)
all_cluster_results[curr_cluster]["max_depth_lr"] = perform_individual_sfm_analysis(
    max_depth_lr_list,
    treatment="max_depth_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1, highlight_size=30, multiple_of=20, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_1
)
# experiments where num_estimators AND max_depth were both set
max_depth_num_est_pred = "(num_estimators == num_estimators) & (max_depth == max_depth) & (learning_rate != learning_rate)"
# minimize the difference of max_depth then the num_estimators
diff_clause = ("diff_max_depth", "diff_num_est")
def max_depth_num_est_diff(df):
    """Cast max_depth/num_estimators and add absolute distances from the
    defaults (max_depth=5, i.e. n_leaves = 31; num_estimators=100)."""
    depth = df.max_depth.astype(int)
    n_est = df.num_estimators.astype(int)
    return df.assign(
        max_depth=depth,
        num_estimators=n_est,
        diff_max_depth=(depth - 5).abs(),
        diff_num_est=(n_est - 100).abs(),
    )
# max_depth + num_estimators experiments of cluster 1
max_depth_num_est_list = grouped_experiment(
    cluster=cluster_1,
    query_string=max_depth_num_est_pred,
    name="max_depth_num_est",
    diff_hyperparam_fn=max_depth_num_est_diff,
    diff_clause=diff_clause,
)
# create new treatment for all results
max_depth_num_est_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df,
            columns=["max_depth", "num_estimators"],
            treatment_name_output="max_depth_num_est",
            order_strategy="rank",
        ),
        max_depth_num_est_list,
    )
)
plot_deltas(max_depth_num_est_list, treatment="max_depth_num_est", multiple_of=50, rotation=90, loc=1, scale=.8)
all_cluster_results[curr_cluster]["max_depth_num_est"] = perform_individual_sfm_analysis(
    max_depth_num_est_list,
    treatment="max_depth_num_est",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=True,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1, highlight_size=30, multiple_of=50, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_1
)
# experiments where num_estimators AND learning_rate were both set
num_est_lr_pred = "(num_estimators == num_estimators) & (max_depth != max_depth) & (learning_rate == learning_rate)"
# minimize the difference of num_estimators then the learning_rate
diff_clause = ("diff_num_est", "diff_lr")
def num_est_lr_diff(df):
    """Cast num_estimators/learning_rate and add absolute distances from the
    defaults (num_estimators=100; learning_rate=0.1)."""
    n_est = df.num_estimators.astype(int)
    lr = df.learning_rate.astype(float)
    return df.assign(
        num_estimators=n_est,
        learning_rate=lr,
        diff_num_est=(n_est - 100).abs(),
        diff_lr=(lr - 0.1).abs(),
    )
# num_estimators + learning_rate experiments of cluster 1
num_est_lr_list = grouped_experiment(
    cluster=cluster_1,
    query_string=num_est_lr_pred,
    name="num_est_lr",
    diff_hyperparam_fn=num_est_lr_diff,
    diff_clause=diff_clause,
)
# create new treatment for all results
num_est_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["num_estimators", "learning_rate"],
            treatment_name_output="num_est_lr",
            order_strategy="rank",
        ),
        num_est_lr_list,
    )
)
plot_deltas(num_est_lr_list, treatment="num_est_lr", multiple_of=50, rotation=90)
all_cluster_results[curr_cluster]["num_est_lr"] = perform_individual_sfm_analysis(
    num_est_lr_list,
    treatment="num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=2.5, highlight_size=30, multiple_of=50, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_1
)
# experiments where all three hyperparameters were set
max_depth_num_est_lr_pred = "(num_estimators == num_estimators) & (max_depth == max_depth) & (learning_rate == learning_rate)"
diff_clause = ("diff_max_depth", "diff_num_est", "diff_lr")
def max_depth_num_est_lr_diff(df):
    """Cast all three hyperparameters and add absolute distances from the
    defaults (max_depth=5, num_estimators=100, learning_rate=0.1)."""
    depth = df.max_depth.astype(int)
    n_est = df.num_estimators.astype(int)
    lr = df.learning_rate.astype(float)
    return df.assign(
        max_depth=depth,
        num_estimators=n_est,
        learning_rate=lr,
        diff_max_depth=(depth - 5).abs(),
        diff_num_est=(n_est - 100).abs(),
        diff_lr=(lr - 0.1).abs(),
    )
# all-three-hyperparameter experiments of cluster 1
max_depth_num_est_lr_list = grouped_experiment(
    cluster=cluster_1,
    query_string=max_depth_num_est_lr_pred,
    name="max_depth_num_est_lr",
    diff_hyperparam_fn=max_depth_num_est_lr_diff,
    diff_clause=diff_clause,
)
# create new treatment for all results
max_depth_num_est_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["max_depth", "num_estimators", "learning_rate"],
            treatment_name_output="max_depth_num_est_lr",
            order_strategy="rank",
        ),
        max_depth_num_est_lr_list,
    )
)
plot_deltas(max_depth_num_est_lr_list, treatment="max_depth_num_est_lr", scale=.7, multiple_of=300, rotation=90, fontsize=15)
all_cluster_results[curr_cluster]["max_depth_num_est_lr"] = perform_individual_sfm_analysis(
    max_depth_num_est_lr_list,
    treatment="max_depth_num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=True,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1.0, highlight_size=30, multiple_of=400, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_1
)
# free memory before loading the next cluster
del cluster_1
gc.collect()
st_tests_2 = defaultdict(dict)
# context manager so the pickle file handle is closed (original leaked it)
with open("cluster_2.pkl", "rb") as f:
    cluster_2 = pickle.load(f)
cluster_2.shape
print("Number of datasets:", len(cluster_2.did.unique()))
curr_cluster = 2
# datasets excluded from the analysis -- TODO confirm exclusion reason
to_remove_2 = [40517]
cluster_2 = cluster_2.query("did not in @to_remove_2")
# --- cluster 2: num_estimators only ---
num_est_df_list = grouped_experiment(
    cluster=cluster_2, query_string=individual_num_est_pred, name="num_est", diff_hyperparam_fn=individual_num_est_diff
)
plot_deltas(num_est_df_list, treatment="num_estimators", multiple_of=3, save_plot=True)
all_cluster_results[curr_cluster]["num_estimators"] = perform_individual_sfm_analysis(
    num_est_df_list,
    treatment="num_estimators",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    statistical_test_dict=st_tests_2
)
# --- cluster 2: max_depth only ---
max_depth_df_list = grouped_experiment(
    cluster=cluster_2,
    query_string=individual_max_depth_pred,
    name="max_depth",
    diff_hyperparam_fn=individual_max_depth_diff,
)
plot_deltas(max_depth_df_list, treatment="max_depth", multiple_of=1)
all_cluster_results[curr_cluster]["max_depth"] = perform_individual_sfm_analysis(
    max_depth_df_list,
    treatment="max_depth",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=0.3, highlight_size=400),
    statistical_test_dict=st_tests_2
)
# --- cluster 2: learning_rate only ---
lr_df_list = grouped_experiment(
    cluster=cluster_2, query_string=individual_lr_pred, name="lr", diff_hyperparam_fn=individual_lr_diff
)
plot_deltas(lr_df_list, treatment="learning_rate", multiple_of=1, rotation=45)
all_cluster_results[curr_cluster]["learning_rate"] = perform_individual_sfm_analysis(
    lr_df_list,
    treatment="learning_rate",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=True,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.005, highlight_size=200, multiple_of=1),
    statistical_test_dict=st_tests_2
)
# --- cluster 2: max_depth + learning_rate ---
diff_clause = ("diff_max_depth", "diff_lr")
max_depth_lr_list = grouped_experiment(
    cluster=cluster_2,
    query_string=max_depth_lr_pred,
    name="max_depth_lr",
    diff_hyperparam_fn=max_depth_lr_diff,
    diff_clause=diff_clause,
)
# create new treatment for all results
max_depth_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["max_depth", "learning_rate"],
            treatment_name_output="max_depth_lr",
            order_strategy="rank",
        ),
        max_depth_lr_list,
    )
)
plot_deltas(max_depth_lr_list, treatment="max_depth_lr", multiple_of=10, rotation=90)
all_cluster_results[curr_cluster]["max_depth_lr"] = perform_individual_sfm_analysis(
    max_depth_lr_list,
    treatment="max_depth_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.5, highlight_size=50, multiple_of=10, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_2
)
# --- cluster 2: max_depth + num_estimators ---
diff_clause = ("diff_max_depth", "diff_num_est")
max_depth_num_est_list = grouped_experiment(
    cluster=cluster_2,
    query_string=max_depth_num_est_pred,
    name="max_depth_num_est",
    diff_hyperparam_fn=max_depth_num_est_diff,
    diff_clause=diff_clause,
)
# create new treatment for all results
max_depth_num_est_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df,
            columns=["max_depth", "num_estimators"],
            treatment_name_output="max_depth_num_est",
            order_strategy="rank",
        ),
        max_depth_num_est_list,
    )
)
plot_deltas(max_depth_num_est_list, treatment="max_depth_num_est", multiple_of=25, rotation=90)
all_cluster_results[curr_cluster]["max_depth_num_est"] = perform_individual_sfm_analysis(
    max_depth_num_est_list,
    treatment="max_depth_num_est",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1, highlight_size=30, multiple_of=25, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_2
)
# --- cluster 2: num_estimators + learning_rate ---
diff_clause = ("diff_num_est", "diff_lr")
num_est_lr_list = grouped_experiment(
    cluster=cluster_2,
    query_string=num_est_lr_pred,
    name="num_est_lr",
    diff_hyperparam_fn=num_est_lr_diff,
    diff_clause=diff_clause,
)
# create new treatment for all results
num_est_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["num_estimators", "learning_rate"],
            treatment_name_output="num_est_lr",
            order_strategy="rank",
        ),
        num_est_lr_list,
    )
)
plot_deltas(num_est_lr_list, treatment="num_est_lr", multiple_of=15, rotation=90)
all_cluster_results[curr_cluster]["num_est_lr"] = perform_individual_sfm_analysis(
    num_est_lr_list,
    treatment="num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.7, highlight_size=50, multiple_of=15, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_2
)
# --- cluster 2: all three hyperparameters ---
diff_clause = ("diff_max_depth", "diff_num_est", "diff_lr")
max_depth_num_est_lr_list = grouped_experiment(
    cluster=cluster_2,
    query_string=max_depth_num_est_lr_pred,
    name="max_depth_num_est_lr",
    diff_hyperparam_fn=max_depth_num_est_lr_diff,
    diff_clause=diff_clause,
)
# create new treatment for all results
max_depth_num_est_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["max_depth", "num_estimators", "learning_rate"],
            treatment_name_output="max_depth_num_est_lr",
            order_strategy="rank",
        ),
        max_depth_num_est_lr_list,
    )
)
plot_deltas(max_depth_num_est_lr_list, treatment="max_depth_num_est_lr", scale=.7, multiple_of=100, cap_outliers=True, rotation=90, fontsize=15)
all_cluster_results[curr_cluster]["max_depth_num_est_lr"] = perform_individual_sfm_analysis(
    max_depth_num_est_lr_list,
    treatment="max_depth_num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1.0, highlight_size=30, multiple_of=100, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_2
)
# free memory before loading the next cluster
del cluster_2
gc.collect()
# context manager so the pickle file handle is closed (original leaked it)
with open("cluster_3.pkl", "rb") as f:
    cluster_3 = pickle.load(f)
cluster_3.shape
print("Number of datasets:", len(cluster_3.did.unique()))
curr_cluster = 3
st_tests_3 = defaultdict(dict)
# --- cluster 3: num_estimators only ---
num_est_df_list = grouped_experiment(
    cluster=cluster_3, query_string=individual_num_est_pred, name="num_est", diff_hyperparam_fn=individual_num_est_diff
)
plot_deltas(num_est_df_list, treatment="num_estimators", multiple_of=1)
all_cluster_results[curr_cluster]["num_estimators"] = perform_individual_sfm_analysis(
    num_est_df_list,
    treatment="num_estimators",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    statistical_test_dict=st_tests_3
)
# --- cluster 3: max_depth only ---
max_depth_df_list = grouped_experiment(
    cluster=cluster_3,
    query_string=individual_max_depth_pred,
    name="max_depth",
    diff_hyperparam_fn=individual_max_depth_diff,
)
plot_deltas(max_depth_df_list, treatment="max_depth", multiple_of=1)
all_cluster_results[curr_cluster]["max_depth"] = perform_individual_sfm_analysis(
    max_depth_df_list,
    treatment="max_depth",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=0.3, highlight_size=400),
    statistical_test_dict=st_tests_3
)
# --- cluster 3: learning_rate only ---
lr_df_list = grouped_experiment(
    cluster=cluster_3, query_string=individual_lr_pred, name="lr", diff_hyperparam_fn=individual_lr_diff
)
plot_deltas(lr_df_list, treatment="learning_rate", multiple_of=1, rotation=45)
all_cluster_results[curr_cluster]["learning_rate"] = perform_individual_sfm_analysis(
    lr_df_list,
    treatment="learning_rate",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=True,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.005, highlight_size=200, multiple_of=1),
    statistical_test_dict=st_tests_3
)
# --- cluster 3: max_depth + learning_rate ---
diff_clause = ("diff_max_depth", "diff_lr")
max_depth_lr_list = grouped_experiment(
    cluster=cluster_3,
    query_string=max_depth_lr_pred,
    name="max_depth_lr",
    diff_hyperparam_fn=max_depth_lr_diff,
    diff_clause=diff_clause,
)
# create new treatment for all results
max_depth_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["max_depth", "learning_rate"],
            treatment_name_output="max_depth_lr",
            order_strategy="rank",
        ),
        max_depth_lr_list,
    )
)
plot_deltas(max_depth_lr_list, treatment="max_depth_lr", multiple_of=5, rotation=90)
all_cluster_results[curr_cluster]["max_depth_lr"] = perform_individual_sfm_analysis(
    max_depth_lr_list,
    treatment="max_depth_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.5, highlight_size=50, multiple_of=5, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_3
)
# --- cluster 3: max_depth + num_estimators ---
diff_clause = ("diff_max_depth", "diff_num_est")
max_depth_num_est_list = grouped_experiment(
    cluster=cluster_3,
    query_string=max_depth_num_est_pred,
    name="max_depth_num_est",
    diff_hyperparam_fn=max_depth_num_est_diff,
    diff_clause=diff_clause,
)
# create new treatment for all results
max_depth_num_est_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df,
            columns=["max_depth", "num_estimators"],
            treatment_name_output="max_depth_num_est",
            order_strategy="rank",
        ),
        max_depth_num_est_list,
    )
)
plot_deltas(max_depth_num_est_list, treatment="max_depth_num_est", multiple_of=10, rotation=90)
all_cluster_results[curr_cluster]["max_depth_num_est"] = perform_individual_sfm_analysis(
    max_depth_num_est_list,
    treatment="max_depth_num_est",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1, highlight_size=30, multiple_of=10, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_3
)
# --- cluster 3: num_estimators + learning_rate ---
diff_clause = ("diff_num_est", "diff_lr")
num_est_lr_list = grouped_experiment(
    cluster=cluster_3,
    query_string=num_est_lr_pred,
    name="num_est_lr",
    diff_hyperparam_fn=num_est_lr_diff,
    diff_clause=diff_clause,
)
# create new treatment for all results
num_est_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["num_estimators", "learning_rate"],
            treatment_name_output="num_est_lr",
            order_strategy="rank",
        ),
        num_est_lr_list,
    )
)
plot_deltas(num_est_lr_list, treatment="num_est_lr", multiple_of=5, rotation=90)
all_cluster_results[curr_cluster]["num_est_lr"] = perform_individual_sfm_analysis(
    num_est_lr_list,
    treatment="num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.7, highlight_size=50, multiple_of=2, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_3
)
# --- cluster 3: all three hyperparameters ---
diff_clause = ("diff_max_depth", "diff_num_est", "diff_lr")
max_depth_num_est_lr_list = grouped_experiment(
    cluster=cluster_3,
    query_string=max_depth_num_est_lr_pred,
    name="max_depth_num_est_lr",
    diff_hyperparam_fn=max_depth_num_est_lr_diff,
    diff_clause=diff_clause,
)
# create new treatment for all results
max_depth_num_est_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["max_depth", "num_estimators", "learning_rate"],
            treatment_name_output="max_depth_num_est_lr",
            order_strategy="rank",
        ),
        max_depth_num_est_lr_list,
    )
)
plot_deltas(max_depth_num_est_lr_list, treatment="max_depth_num_est_lr", scale=.7, multiple_of=100, cap_outliers=True, rotation=90, fontsize=15)
all_cluster_results[curr_cluster]["max_depth_num_est_lr"] = perform_individual_sfm_analysis(
    max_depth_num_est_lr_list,
    treatment="max_depth_num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1.0, highlight_size=30, multiple_of=100, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_3
)
# free memory before loading the next cluster
del cluster_3
gc.collect()
# context manager so the pickle file handle is closed (original leaked it)
with open("cluster_4.pkl", "rb") as f:
    cluster_4 = pickle.load(f)
cluster_4.shape
print("Number of datasets:", len(cluster_4.did.unique()))
curr_cluster = 4
st_tests_4 = defaultdict(dict)
# --- cluster 4: num_estimators only ---
num_est_df_list = grouped_experiment(
    cluster=cluster_4, query_string=individual_num_est_pred, name="num_est", diff_hyperparam_fn=individual_num_est_diff
)
plot_deltas(num_est_df_list, treatment="num_estimators", multiple_of=1)
all_cluster_results[curr_cluster]["num_estimators"] = perform_individual_sfm_analysis(
    num_est_df_list,
    treatment="num_estimators",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    statistical_test_dict=st_tests_4
)
# --- cluster 4: max_depth only ---
max_depth_df_list = grouped_experiment(
    cluster=cluster_4,
    query_string=individual_max_depth_pred,
    name="max_depth",
    diff_hyperparam_fn=individual_max_depth_diff,
)
plot_deltas(max_depth_df_list, treatment="max_depth", multiple_of=1)
all_cluster_results[curr_cluster]["max_depth"] = perform_individual_sfm_analysis(
    max_depth_df_list,
    treatment="max_depth",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=0.3, highlight_size=400),
    statistical_test_dict=st_tests_4
)
# --- cluster 4: learning_rate only ---
lr_df_list = grouped_experiment(
    cluster=cluster_4, query_string=individual_lr_pred, name="lr", diff_hyperparam_fn=individual_lr_diff
)
plot_deltas(lr_df_list, treatment="learning_rate", multiple_of=1, rotation=45)
all_cluster_results[curr_cluster]["learning_rate"] = perform_individual_sfm_analysis(
    lr_df_list,
    treatment="learning_rate",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=True,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.005, highlight_size=200, multiple_of=1),
    statistical_test_dict=st_tests_4
)
# --- cluster 4: max_depth + learning_rate ---
diff_clause = ("diff_max_depth", "diff_lr")
max_depth_lr_list = grouped_experiment(
    cluster=cluster_4,
    query_string=max_depth_lr_pred,
    name="max_depth_lr",
    diff_hyperparam_fn=max_depth_lr_diff,
    diff_clause=diff_clause,
)
# create new treatment for all results
max_depth_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["max_depth", "learning_rate"],
            treatment_name_output="max_depth_lr",
            order_strategy="rank",
        ),
        max_depth_lr_list,
    )
)
plot_deltas(max_depth_lr_list, treatment="max_depth_lr", multiple_of=15, rotation=90)
all_cluster_results[curr_cluster]["max_depth_lr"] = perform_individual_sfm_analysis(
    max_depth_lr_list,
    treatment="max_depth_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.5, highlight_size=50, multiple_of=15, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_4
)
# --- cluster 4: max_depth + num_estimators ---
diff_clause = ("diff_max_depth", "diff_num_est")
max_depth_num_est_list = grouped_experiment(
    cluster=cluster_4,
    query_string=max_depth_num_est_pred,
    name="max_depth_num_est",
    diff_hyperparam_fn=max_depth_num_est_diff,
    diff_clause=diff_clause,
)
# create new treatment for all results
max_depth_num_est_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df,
            columns=["max_depth", "num_estimators"],
            treatment_name_output="max_depth_num_est",
            order_strategy="rank",
        ),
        max_depth_num_est_list,
    )
)
plot_deltas(max_depth_num_est_list, treatment="max_depth_num_est", multiple_of=10, rotation=90)
all_cluster_results[curr_cluster]["max_depth_num_est"] = perform_individual_sfm_analysis(
    max_depth_num_est_list,
    treatment="max_depth_num_est",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1, highlight_size=30, multiple_of=10, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_4
)
# --- cluster 4: num_estimators + learning_rate ---
diff_clause = ("diff_num_est", "diff_lr")
num_est_lr_list = grouped_experiment(
    cluster=cluster_4,
    query_string=num_est_lr_pred,
    name="num_est_lr",
    diff_hyperparam_fn=num_est_lr_diff,
    diff_clause=diff_clause,
)
# create new treatment for all results
num_est_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["num_estimators", "learning_rate"],
            treatment_name_output="num_est_lr",
            order_strategy="rank",
        ),
        num_est_lr_list,
    )
)
plot_deltas(num_est_lr_list, treatment="num_est_lr", multiple_of=20, rotation=90)
all_cluster_results[curr_cluster]["num_est_lr"] = perform_individual_sfm_analysis(
    num_est_lr_list,
    treatment="num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.7, highlight_size=50, multiple_of=20, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_4
)
# --- cluster 4: all three hyperparameters ---
diff_clause = ("diff_max_depth", "diff_num_est", "diff_lr")
max_depth_num_est_lr_list = grouped_experiment(
    cluster=cluster_4,
    query_string=max_depth_num_est_lr_pred,
    name="max_depth_num_est_lr",
    diff_hyperparam_fn=max_depth_num_est_lr_diff,
    diff_clause=diff_clause,
)
# create new treatment for all results
max_depth_num_est_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["max_depth", "num_estimators", "learning_rate"],
            treatment_name_output="max_depth_num_est_lr",
            order_strategy="rank",
        ),
        max_depth_num_est_lr_list,
    )
)
plot_deltas(max_depth_num_est_lr_list, treatment="max_depth_num_est_lr", scale=.7, multiple_of=200, cap_outliers=True, rotation=90, fontsize=15)
all_cluster_results[curr_cluster]["max_depth_num_est_lr"] = perform_individual_sfm_analysis(
    max_depth_num_est_lr_list,
    treatment="max_depth_num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1.0, highlight_size=30, multiple_of=200, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_4
)
# free memory before loading the next cluster
del cluster_4
gc.collect()
# context manager so the pickle file handle is closed (original leaked it)
with open("cluster_5.pkl", "rb") as f:
    cluster_5 = pickle.load(f)
cluster_5.shape
print("Number of datasets:", len(cluster_5.did.unique()))
curr_cluster = 5
st_tests_5 = defaultdict(dict)
# --- cluster 5: num_estimators only ---
num_est_df_list = grouped_experiment(
    cluster=cluster_5, query_string=individual_num_est_pred, name="num_est", diff_hyperparam_fn=individual_num_est_diff
)
plot_deltas(num_est_df_list, treatment="num_estimators", multiple_of=10, rotation=90)
all_cluster_results[curr_cluster]["num_estimators"] = perform_individual_sfm_analysis(
    num_est_df_list,
    treatment="num_estimators",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    statistical_test_dict=st_tests_5
)
# --- cluster 5: max_depth only ---
max_depth_df_list = grouped_experiment(
    cluster=cluster_5,
    query_string=individual_max_depth_pred,
    name="max_depth",
    diff_hyperparam_fn=individual_max_depth_diff,
)
plot_deltas(max_depth_df_list, treatment="max_depth", multiple_of=1)
all_cluster_results[curr_cluster]["max_depth"] = perform_individual_sfm_analysis(
    max_depth_df_list,
    treatment="max_depth",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=0.3, highlight_size=400),
    statistical_test_dict=st_tests_5
)
# --- cluster 5: learning_rate only ---
lr_df_list = grouped_experiment(
    cluster=cluster_5, query_string=individual_lr_pred, name="lr", diff_hyperparam_fn=individual_lr_diff
)
plot_deltas(lr_df_list, treatment="learning_rate", multiple_of=5, rotation=45)
all_cluster_results[curr_cluster]["learning_rate"] = perform_individual_sfm_analysis(
    lr_df_list,
    treatment="learning_rate",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=True,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.003, highlight_size=200, multiple_of=5),
    statistical_test_dict=st_tests_5
)
# --- cluster 5: max_depth + learning_rate ---
diff_clause = ("diff_max_depth", "diff_lr")
max_depth_lr_list = grouped_experiment(
    cluster=cluster_5,
    query_string=max_depth_lr_pred,
    name="max_depth_lr",
    diff_hyperparam_fn=max_depth_lr_diff,
    diff_clause=diff_clause,
)
# create new treatment for all results
max_depth_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["max_depth", "learning_rate"],
            treatment_name_output="max_depth_lr",
            order_strategy="rank",
        ),
        max_depth_lr_list,
    )
)
plot_deltas(max_depth_lr_list, treatment="max_depth_lr", multiple_of=50, rotation=90)
all_cluster_results[curr_cluster]["max_depth_lr"] = perform_individual_sfm_analysis(
max_depth_lr_list,
treatment="max_depth_lr",
run_residuals=True,
run_homoscedasticity=True,
run_anova=False,
run_kruskal=True,
run_kruskal_posthoc=False,
sfm_plot_args=dict(residuals_width=.5, highlight_size=50, multiple_of=50, fontsize=15, rotation=90),
statistical_test_dict=st_tests_5
)
diff_clause = ("diff_max_depth", "diff_num_est")
max_depth_num_est_list = grouped_experiment(
cluster=cluster_5,
query_string=max_depth_num_est_pred,
name="max_depth_num_est",
diff_hyperparam_fn=max_depth_num_est_diff,
diff_clause=diff_clause,
)
# create new treatment for all results
max_depth_num_est_list = list(
map(
lambda df: create_multiple_treatment_column(
df=df,
columns=["max_depth", "num_estimators"],
treatment_name_output="max_depth_num_est",
order_strategy="rank",
),
max_depth_num_est_list,
)
)
plot_deltas(max_depth_num_est_list, treatment="max_depth_num_est", multiple_of=25, rotation=90)
all_cluster_results[curr_cluster]["max_depth_num_est"] = perform_individual_sfm_analysis(
max_depth_num_est_list,
treatment="max_depth_num_est",
run_residuals=True,
run_homoscedasticity=True,
run_anova=False,
run_kruskal=True,
run_kruskal_posthoc=False,
sfm_plot_args=dict(residuals_width=1, highlight_size=30, multiple_of=25, fontsize=15, rotation=90),
statistical_test_dict=st_tests_5
)
diff_clause = ("diff_num_est", "diff_lr")
num_est_lr_list = grouped_experiment(
cluster=cluster_5,
query_string=num_est_lr_pred,
name="num_est_lr",
diff_hyperparam_fn=num_est_lr_diff,
diff_clause=diff_clause,
)
# create new treatment for all results
num_est_lr_list = list(
map(
lambda df: create_multiple_treatment_column(
df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
columns=["num_estimators", "learning_rate"],
treatment_name_output="num_est_lr",
order_strategy="rank",
),
num_est_lr_list,
)
)
plot_deltas(num_est_lr_list, treatment="num_est_lr", multiple_of=50, rotation=90)
all_cluster_results[curr_cluster]["num_est_lr"] = perform_individual_sfm_analysis(
num_est_lr_list,
treatment="num_est_lr",
run_residuals=True,
run_homoscedasticity=True,
run_anova=False,
run_kruskal=True,
run_kruskal_posthoc=False,
sfm_plot_args=dict(residuals_width=.7, highlight_size=50, multiple_of=50, fontsize=15, rotation=90),
statistical_test_dict=st_tests_5
)
diff_clause = ("diff_max_depth", "diff_num_est", "diff_lr")
max_depth_num_est_lr_list = grouped_experiment(
cluster=cluster_5,
query_string=max_depth_num_est_lr_pred,
name="max_depth_num_est_lr",
diff_hyperparam_fn=max_depth_num_est_lr_diff,
diff_clause=diff_clause,
)
# create new treatment for all results
max_depth_num_est_lr_list = list(
map(
lambda df: create_multiple_treatment_column(
df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
columns=["max_depth", "num_estimators", "learning_rate"],
treatment_name_output="max_depth_num_est_lr",
order_strategy="rank",
),
max_depth_num_est_lr_list,
)
)
plot_deltas(max_depth_num_est_lr_list, treatment="max_depth_num_est_lr", scale=.7, multiple_of=400, cap_outliers=True, rotation=90, fontsize=15)
all_cluster_results[curr_cluster]["max_depth_num_est_lr"] = perform_individual_sfm_analysis(
max_depth_num_est_lr_list,
treatment="max_depth_num_est_lr",
run_residuals=True,
run_homoscedasticity=True,
run_anova=False,
run_kruskal=True,
run_kruskal_posthoc=False,
sfm_plot_args=dict(residuals_width=1.0, highlight_size=30, multiple_of=450, fontsize=15, rotation=90),
statistical_test_dict=st_tests_5
)
del cluster_5
gc.collect()
# ================= Cluster 6 =================
# NOTE(review): pickle.load on a local artifact; file handle left to the GC
# (notebook style).
cluster_6 = pickle.load(open("cluster_6.pkl", "rb"))
cluster_6.shape
print("Number of datasets:", len(cluster_6.did.unique()))
curr_cluster = 6
# Exclude dataset 1069 from this cluster -- reason not stated here;
# presumably a problematic/outlier dataset. TODO confirm why.
to_remove_6 = [1069]
cluster_6 = cluster_6.query("did not in @to_remove_6")
# Per-cluster accumulator for the statistical-test results.
st_tests_6 = defaultdict(dict)
# --- Individual effect: num_estimators ---
num_est_df_list = grouped_experiment(
cluster=cluster_6, query_string=individual_num_est_pred, name="num_est", diff_hyperparam_fn=individual_num_est_diff
)
plot_deltas(num_est_df_list, treatment="num_estimators", multiple_of=3)
all_cluster_results[curr_cluster]["num_estimators"] = perform_individual_sfm_analysis(
num_est_df_list,
treatment="num_estimators",
run_residuals=True,
run_homoscedasticity=True,
run_anova=False,
run_kruskal=True,
run_kruskal_posthoc=False,
statistical_test_dict=st_tests_6
)
# --- Individual effect: max_depth ---
max_depth_df_list = grouped_experiment(
cluster=cluster_6,
query_string=individual_max_depth_pred,
name="max_depth",
diff_hyperparam_fn=individual_max_depth_diff,
)
plot_deltas(max_depth_df_list, treatment="max_depth", multiple_of=1)
all_cluster_results[curr_cluster]["max_depth"] = perform_individual_sfm_analysis(
max_depth_df_list,
treatment="max_depth",
run_residuals=True,
run_homoscedasticity=True,
run_anova=False,
run_kruskal=True,
run_kruskal_posthoc=False,
sfm_plot_args=dict(residuals_width=0.3, highlight_size=400),
statistical_test_dict=st_tests_6
)
# --- Individual effect: learning_rate ---
lr_df_list = grouped_experiment(
cluster=cluster_6, query_string=individual_lr_pred, name="lr", diff_hyperparam_fn=individual_lr_diff
)
plot_deltas(lr_df_list, treatment="learning_rate", multiple_of=5, rotation=45)
# NOTE(review): as in the other clusters, ANOVA is enabled only for
# learning_rate; confirm the asymmetry is intentional.
all_cluster_results[curr_cluster]["learning_rate"] = perform_individual_sfm_analysis(
lr_df_list,
treatment="learning_rate",
run_residuals=True,
run_homoscedasticity=True,
run_anova=True,
run_kruskal=True,
run_kruskal_posthoc=False,
sfm_plot_args=dict(residuals_width=.005, highlight_size=200, multiple_of=1),
statistical_test_dict=st_tests_6
)
# --- Pairwise interaction: max_depth x learning_rate ---
diff_clause = ("diff_max_depth", "diff_lr")
max_depth_lr_list = grouped_experiment(
cluster=cluster_6,
query_string=max_depth_lr_pred,
name="max_depth_lr",
diff_hyperparam_fn=max_depth_lr_diff,
diff_clause=diff_clause,
)
# create new treatment for all results
# (learning_rate rounded to 3 decimals to avoid spurious float levels)
max_depth_lr_list = list(
map(
lambda df: create_multiple_treatment_column(
df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
columns=["max_depth", "learning_rate"],
treatment_name_output="max_depth_lr",
order_strategy="rank",
),
max_depth_lr_list,
)
)
plot_deltas(max_depth_lr_list, treatment="max_depth_lr", multiple_of=20, rotation=90)
all_cluster_results[curr_cluster]["max_depth_lr"] = perform_individual_sfm_analysis(
max_depth_lr_list,
treatment="max_depth_lr",
run_residuals=True,
run_homoscedasticity=True,
run_anova=False,
run_kruskal=True,
run_kruskal_posthoc=False,
sfm_plot_args=dict(residuals_width=.5, highlight_size=50, multiple_of=15, fontsize=15, rotation=90),
statistical_test_dict=st_tests_6
)
# --- Pairwise interaction: max_depth x num_estimators ---
diff_clause = ("diff_max_depth", "diff_num_est")
max_depth_num_est_list = grouped_experiment(
cluster=cluster_6,
query_string=max_depth_num_est_pred,
name="max_depth_num_est",
diff_hyperparam_fn=max_depth_num_est_diff,
diff_clause=diff_clause,
)
# create new treatment for all results
# (both columns are integers, so no rounding step here)
max_depth_num_est_list = list(
map(
lambda df: create_multiple_treatment_column(
df=df,
columns=["max_depth", "num_estimators"],
treatment_name_output="max_depth_num_est",
order_strategy="rank",
),
max_depth_num_est_list,
)
)
plot_deltas(max_depth_num_est_list, treatment="max_depth_num_est", multiple_of=20, rotation=90)
all_cluster_results[curr_cluster]["max_depth_num_est"] = perform_individual_sfm_analysis(
max_depth_num_est_list,
treatment="max_depth_num_est",
run_residuals=True,
run_homoscedasticity=True,
run_anova=False,
run_kruskal=True,
run_kruskal_posthoc=False,
sfm_plot_args=dict(residuals_width=1, highlight_size=30, multiple_of=20, fontsize=15, rotation=90),
statistical_test_dict=st_tests_6
)
# --- Pairwise interaction: num_estimators x learning_rate ---
diff_clause = ("diff_num_est", "diff_lr")
num_est_lr_list = grouped_experiment(
cluster=cluster_6,
query_string=num_est_lr_pred,
name="num_est_lr",
diff_hyperparam_fn=num_est_lr_diff,
diff_clause=diff_clause,
)
# create new treatment for all results
num_est_lr_list = list(
map(
lambda df: create_multiple_treatment_column(
df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
columns=["num_estimators", "learning_rate"],
treatment_name_output="num_est_lr",
order_strategy="rank",
),
num_est_lr_list,
)
)
plot_deltas(num_est_lr_list, treatment="num_est_lr", multiple_of=15, rotation=90)
all_cluster_results[curr_cluster]["num_est_lr"] = perform_individual_sfm_analysis(
num_est_lr_list,
treatment="num_est_lr",
run_residuals=True,
run_homoscedasticity=True,
run_anova=False,
run_kruskal=True,
run_kruskal_posthoc=False,
sfm_plot_args=dict(residuals_width=.7, highlight_size=50, multiple_of=15, fontsize=15, rotation=90),
statistical_test_dict=st_tests_6
)
# --- Three-way interaction: max_depth x num_estimators x learning_rate ---
diff_clause = ("diff_max_depth", "diff_num_est", "diff_lr")
max_depth_num_est_lr_list = grouped_experiment(
cluster=cluster_6,
query_string=max_depth_num_est_lr_pred,
name="max_depth_num_est_lr",
diff_hyperparam_fn=max_depth_num_est_lr_diff,
diff_clause=diff_clause,
)
# create new treatment for all results
max_depth_num_est_lr_list = list(
map(
lambda df: create_multiple_treatment_column(
df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
columns=["max_depth", "num_estimators", "learning_rate"],
treatment_name_output="max_depth_num_est_lr",
order_strategy="rank",
),
max_depth_num_est_lr_list,
)
)
plot_deltas(max_depth_num_est_lr_list, treatment="max_depth_num_est_lr", scale=.7, multiple_of=400, cap_outliers=True, rotation=90, fontsize=15)
all_cluster_results[curr_cluster]["max_depth_num_est_lr"] = perform_individual_sfm_analysis(
max_depth_num_est_lr_list,
treatment="max_depth_num_est_lr",
run_residuals=True,
run_homoscedasticity=True,
run_anova=False,
run_kruskal=True,
run_kruskal_posthoc=False,
sfm_plot_args=dict(residuals_width=1.0, highlight_size=30, multiple_of=400, fontsize=15, rotation=90),
statistical_test_dict=st_tests_6
)
# Free cluster-6 data; last cluster in the per-cluster loop.
del cluster_6
gc.collect()
# Persist/restore the aggregated results. The dump line is kept commented
# out so a re-run of this cell does not overwrite the saved artifact.
# open("all_cluster_results.pkl", "wb").write(cloudpickle.dumps(all_cluster_results))
# NOTE(review): reloading here replaces the in-memory results computed
# above with the previously saved version -- confirm the pickle is current.
all_cluster_results = cloudpickle.load(open("all_cluster_results.pkl", "rb"))
all_cluster_results.keys()
### mapping dicts
# Short display labels for each hyper-parameter combination
# (NE = num_estimators, MD = max_depth, LR = learning_rate).
# NOTE(review): the abbreviation order is inconsistent across entries
# ("LR, NE" vs "MD, NE" vs "NE, MD, LR") -- presumably cosmetic; confirm
# before relying on the label text.
hyperparam_combination_map = dict(
num_estimators="NE",
max_depth="MD",
learning_rate="LR",
max_depth_lr="MD, LR",
num_est_lr="LR, NE",
max_depth_num_est="MD, NE",
max_depth_num_est_lr="NE, MD, LR",
)
# Display labels for the performance metrics.
perf_metric_map = dict(auc="AUC", brier="Brier", logloss="Logloss")
### ordering dicts
# Sort order for hyper-parameter combinations: individual effects first,
# then pairwise interactions, then the three-way interaction.
hyperparam_combination_order = dict(
    num_estimators=1,
    max_depth=2,
    learning_rate=3,
    max_depth_lr=4,
    max_depth_num_est=5,
    num_est_lr=6,
    max_depth_num_est_lr=7,
)
# Sort order for performance metrics, matching the auc/brier/logloss
# sequence of perf_metric_map.
# BUG FIX: logloss was 2, duplicating brier's rank and making any sort
# keyed on this dict ambiguous; it is now 3 so every metric has a unique
# position (mirroring the unique 1..7 ranks above).
perf_metric_order = dict(auc=1, brier=2, logloss=3)
def created_filtered_hyperparam_dict(clusters_results_d, key="num_estimators"):
    """Keep only the clusters with non-empty results for *key*.

    Parameters
    ----------
    clusters_results_d : dict
        Mapping of cluster id -> {hyperparam combination name: result list}.
    key : str
        Hyper-parameter combination to filter on (e.g. "num_estimators").

    Returns
    -------
    dict
        cluster id -> {key: result list}, restricted to clusters where the
        result list for *key* exists and is not three empty dicts -- the
        sentinel produced upstream when no metric was significant.
    """
    results = {}
    for cluster, c_dict in clusters_results_d.items():
        # Robustness fix: the original built fp.keyfilter(...) and then
        # indexed it with [key], raising KeyError for any cluster that
        # lacks this combination; such clusters are now skipped instead.
        if key in c_dict and c_dict[key] != [{}, {}, {}]:
            # Same {key: value} shape keyfilter produced, without the
            # toolz dependency.
            results[cluster] = {key: c_dict[key]}
    return results
def count_significance(filt_hp_dict, key="num_estimators", metric=None, normalize_by=1):
    """Count significant results per performance metric, as percentages.

    Parameters
    ----------
    filt_hp_dict : dict
        Output of created_filtered_hyperparam_dict: cluster id ->
        {key: list of result dicts}. Each result dict may carry a
        "metric" entry naming the significant metric.
    key : str
        Hyper-parameter combination whose result lists are tallied.
    metric : MutableMapping or None
        Optional pre-seeded counter; counts are accumulated into it
        in place (callers can pass a shared accumulator). Defaults to a
        fresh defaultdict(int).
    normalize_by : int
        Denominator for the percentage (e.g. clusters x metrics).

    Returns
    -------
    dict
        metric name -> (count / normalize_by) * 100. Result dicts without
        a "metric" entry are tallied under "NaN".
    """
    if metric is None:
        metric = defaultdict(int)
    for hp_d in filt_hp_dict.values():
        for d in hp_d[key]:
            metric[d.get("metric", "NaN")] += 1
    # Idiom fix: plain dict comprehension replaces fp.valmap -- identical
    # output (a new plain dict) without the toolz dependency.
    return {name: (count / normalize_by) * 100 for name, count in metric.items()}
# filt_ne_dict = created_filtered_hyperparam_dict(all_cluster_results, key="num_estimators")
# Each num_est analysis has all three metrics; dividing the count of each
# metric by six gives the proportion of analyses in which that metric was
# considered significant according to Kruskal-Wallis.
# count_significance(filt_ne_dict, key="num_estimators", normalize_by=6)
# Summary over every hyper-parameter combination, sorted by underscore
# count as a proxy for combination size (singles before interactions).
for key in sorted(all_cluster_results[1].keys(), key=lambda x: x.count("_")):
print(f"-------- {key} by metric proportion ---------")
# NOTE(review): this bare expression is only displayed because of the
# InteractiveShell ast_node_interactivity="all" setting at the top of the
# notebook; in a plain script its value would be silently discarded.
count_significance(
created_filtered_hyperparam_dict(all_cluster_results, key=key),
key=key, normalize_by=6
)