Imports¶

from IPython.core.interactiveshell import InteractiveShell #display full output instead of just the last one
InteractiveShell.ast_node_interactivity = "all"

import toolz as fp
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import cloudpickle
import gc
sns.set()
%matplotlib inline
pd.set_option('display.max_rows', 50)
palette = sns.hls_palette(8, h=.7, s=.9, l=0.5)
sns.palplot(palette[::-1])
sns.set_palette(palette[::-1])

gray = "444444"

# Base matplotlib look, applied in one pass (same keys, same order as before);
# the seaborn style/context calls that follow are applied on top of it.
plt.rcParams.update(
    {
        "figure.facecolor": "1",  # background color
        "axes.facecolor": "1",  # plot background color
        "grid.color": "0.8",
        "grid.alpha": .4,
        "axes.edgecolor": "1",
        "lines.linewidth": 1,
        "grid.linestyle": "-",
        "axes.axisbelow": True,
        "axes.labelcolor": gray,
        "text.color": gray,
        "xtick.color": gray,
        "ytick.color": gray,
    }
)
sns.set_style("whitegrid")
sns.set_context("notebook")
%config InlineBackend.figure_format = "retina"

import pickle
from pathlib import Path

class dotdict(dict):
    """Dictionary whose keys can also be read, written and deleted as attributes."""

    def __getattr__(self, name):
        # Same contract as dict.get: an unknown key yields None instead of raising
        return dict.get(self, name)

    def __setattr__(self, name, value):
        # Attribute assignment stores the value under the key `name`
        dict.__setitem__(self, name, value)

    def __delattr__(self, name):
        # Attribute deletion removes the key; a missing key raises KeyError
        dict.__delitem__(self, name)
    
@fp.curry
def open_pickled_object(file, full_path):
    """
    Open a pickled object by specifying the full_path and then the filename
    Parameters
    ----------
    file : str
        String representing the filename
    full_path : str
        String representing the path to the directory that contains the file
    Returns
    -------
    The unpickled object stored at full_path/file
    """
    # NOTE: pickle.load can run arbitrary code while loading; only point this
    # at experiment files you produced yourself.
    # The context manager closes the handle even if unpickling fails.
    with open(full_path + "/" + file, "rb") as pickled_file:
        return pickle.load(pickled_file)

def open_experiment(did, model_type="BINARY"):
    """
    Load every pickled artifact of one dataset's experiment
    Parameters
    ----------
    did : int
        the dataset id
    model_type : str
        String representing the model type. Currently only BINARY is supported
    Returns
    -------
    A dotdict keyed by artifact name, or an empty dict when the model-type
    folder or the dataset folder does not exist
    """
    base_path = "../"+model_type
    full_path = base_path + "/" + str(did)
    load_artifact = open_pickled_object(full_path=full_path)

    # Guard clause: nothing to load unless both folders are present
    if not (Path(base_path).is_dir() and Path(full_path).is_dir()):
        return {}

    artifact_names = (
        "analyzer_info",
        "df_stats",
        "final_result",
        "hp_tree",
        "openml_object",
        "shape",
    )
    # Each artifact lives in "<name>.pkl" inside the dataset folder
    return dotdict({name: load_artifact(name + ".pkl") for name in artifact_names})

def get_all_available_experiments(model_type="BINARY"):
    """
    Return a list of the datasets ID's available by reading the folder
    Parameters
    ----------
    model_type : str
        String representing the model type. Currently only BINARY is supported
    Returns
    -------
    List[int] with one id per sub-directory whose name is a decimal number,
    in the order the directory listing yields them
    """
    base_path = "../" + model_type
    path = Path(base_path)
    # Only purely numeric folder names are dataset ids; anything else
    # (e.g. ".ipynb_checkpoints") is skipped instead of crashing int()
    return [int(x.name) for x in path.iterdir() if x.is_dir() and x.name.isdecimal()]


def new_axis(figsize=(15, 8), title=None):
    """
    Build a fresh single-axis figure and hand back its axis
    Parameters
    ----------
    figsize : Tuple(Numeric, Numeric)
        the figure size
    title : str
        the figure title; no title is set when None
    """
    axis = plt.subplots(figsize=figsize)[1]
    if title is None:
        return axis
    axis.set_title(title, size=17)
    return axis

def build_experiment_stats(did_list, model_type="BINARY"):
    """
    Return the meta-statistics from the provided datasets ids
    Parameters
    ----------
    did_list : List[int]
        list of dataset ids
    model_type : str
        String representing the model type. Currently only BINARY is supported
    Returns
    -------
    pd.DataFrame with the transposed df_stats of every dataset whose folder
    exists, tagged with a `did` column; empty when no folder was found
    Raises
    ------
    ValueError
        if the model-type base directory does not exist
    """
    base_path = "../" + model_type
    if not Path(base_path).is_dir():
        raise ValueError("Base directory doesn't exist")
    open_stats = open_pickled_object(file="df_stats.pkl")
    per_dataset_stats = []
    for did in did_list:
        full_path = base_path + "/" + str(did)
        # Ids without a folder on disk are silently skipped
        if Path(full_path).is_dir():
            per_dataset_stats.append(open_stats(full_path=full_path).T.assign(did=did))
    if not per_dataset_stats:
        # pd.concat refuses an empty list; keep returning an empty frame
        return pd.DataFrame()
    # One concat at the end instead of re-copying the accumulated frame per dataset
    return pd.concat(per_dataset_stats)

Open an individual experiment¶

A single experiment contains all the results of changing the hyperparameters, along with the meta-statistics for a given dataset. The original dataset and information are stored in the openml_object

experiment = open_experiment(did=72)
experiment.keys()

dict_keys(['analyzer_info', 'df_stats', 'final_result', 'hp_tree', 'openml_object', 'shape'])

experiment.openml_object.name

'BNG(kr-vs-kp)'

Important way to select the experiments:

If a given factor is NaN in the dataframe, it means that in that specific experiment run, that parameter wasn't changed!

I'm analyzing three parameters: max_depth, num_estimators and learning_rate. So if I want to get the dataframe of the experiments where the only factor analyzed is the max_depth for example, I get all rows where max_depth is not NaN but num_estimators and learning_rate are.

def build_result_dataframe(log_list):
    """
    Flatten the experiment logs of one dataset into a single dataframe
    Parameters
    ----------
    log_list : List[dict]
        a list of experiment logs; each holds "train_result" and "test_result"
        entries plus the remaining run-level fields
    Returns
    -------
    pd.DataFrame with two rows per log entry (type == "train" and type == "test")
    """
    train_key, test_key = "train_result", "test_result"

    def _entry_to_frame(entry):
        # Run-level fields are repeated on two rows: row 0 lines up with the
        # train metrics, row 1 with the test metrics
        run_fields = pd.DataFrame(fp.dissoc(entry, train_key, test_key), index=range(2))
        train_metrics = pd.DataFrame(entry[train_key], index=[0]).assign(type="train")
        test_metrics = pd.DataFrame(entry[test_key], index=[0]).assign(type="test")
        stacked_metrics = pd.concat((train_metrics, test_metrics)).reset_index(drop=True)
        return pd.concat((run_fields, stacked_metrics), axis=1)

    all_experiments = [_entry_to_frame(entry) for entry in log_list]
    return pd.concat(all_experiments, sort=False, ignore_index=True)

did_3_results = build_result_dataframe(experiment.final_result)
did_3_results.shape
did_3_results.head()

(5290, 12)

Analyze impact of hyperparameters¶

We can look at each dataset to check the results of different parameters.

Isolated Impact¶

TIP: Using pandas' .query() method, we can use col == col to filter when col isn't NaN, and col != col to filter when col IS NaN.

Num_estimators¶

# Runs where only num_estimators was varied (the other two factors are NaN)
only_num_estimators = (
    "(num_estimators == num_estimators) & (max_depth != max_depth) & (learning_rate != learning_rate)"
)
res_num_est = did_3_results.query(only_num_estimators)
res_num_est = res_num_est.assign(num_estimators=res_num_est.num_estimators.astype(int))

# One stacked panel per metric, train and test drawn as separate hues
metric_titles = (
    ("auc_evaluator__target", "AUC"),
    ("brier_score_evaluator__target", "Brier Score"),
    ("logloss_evaluator__target", "Logloss"),
)
_, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 10))
axes = axes.ravel()
for ax_index, (metric, panel_title) in enumerate(metric_titles):
    sns.pointplot(x="num_estimators", y=metric, hue="type", data=res_num_est, ax=axes[ax_index])
    axes[ax_index].set_title(panel_title)
plt.tight_layout();

max_depth¶

_, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 5))
axes = axes.ravel()
# Runs where only max_depth was varied (the other two factors are NaN)
res_max_depth = did_3_results.query("(num_estimators != num_estimators) & (max_depth == max_depth) & (learning_rate != learning_rate)").assign(max_depth=lambda df: df.max_depth.astype(int))
metric_titles = (
    ("auc_evaluator__target", "AUC"),
    ("brier_score_evaluator__target", "Brier Score"),
    ("logloss_evaluator__target", "Logloss"),
)
for ax_index, (metric, panel_title) in enumerate(metric_titles):
    sns.pointplot(
        x="max_depth",
        y=metric,
        hue="type",
        data=res_max_depth,
        ax=axes[ax_index]
    )
    axes[ax_index].set_title(panel_title)
# The 2x2 grid has one panel more than there are metrics: hide that last panel
# explicitly instead of relying on whichever axes pyplot considers "current"
axes[-1].axis("off")
plt.tight_layout();

learning_rate¶

_, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 5))
axes = axes.ravel()
# Runs where only learning_rate was varied (the other two factors are NaN)
res_lr = did_3_results.query("(num_estimators != num_estimators) & (max_depth != max_depth) & (learning_rate == learning_rate)")
metric_titles = (
    ("auc_evaluator__target", "AUC"),
    ("brier_score_evaluator__target", "Brier Score"),
    ("logloss_evaluator__target", "Logloss"),
)
for ax_index, (metric, panel_title) in enumerate(metric_titles):
    sns.pointplot(
        x="learning_rate",
        y=metric,
        hue="type",
        data=res_lr,
        ax=axes[ax_index]
    )
    axes[ax_index].set_title(panel_title)
# The 2x2 grid has one panel more than there are metrics: hide that last panel
# explicitly instead of relying on whichever axes pyplot considers "current"
axes[-1].axis("off")
plt.tight_layout();

Joint Impact¶

There are multiple ways to analyze the joint impact of the hyperparameters. In this case I'm showing the full joint distribution, i.e. varying every possible hyperparameter

# Test-set rows of the runs in which all three hyperparameters were varied
all_varied_test = (
    "(num_estimators == num_estimators) & (max_depth == max_depth) & (learning_rate == learning_rate) & (type == 'test')"
)
joint_test_results = did_3_results.query(all_varied_test)
joint_test_results = joint_test_results.assign(
    num_estimators=joint_test_results.num_estimators.astype("int"),
    max_depth=joint_test_results.max_depth.astype("int"),
    learning_rate=joint_test_results.learning_rate.round(4),
)

# Same rows minus the (learning_rate=0.3, max_depth=11) combination
kk = joint_test_results.query("not(learning_rate == 0.3 & max_depth == 11)")

# One panel per learning rate, one line per max_depth
g = sns.catplot(
    x="num_estimators",
    y="auc_evaluator__target",
    col="learning_rate",
    hue="max_depth",
    kind="point",
    data=joint_test_results,
    col_wrap=2,
    height=3.8,
    aspect=2,
    sharey=False,
    markers="_",
)
# Draw first so the tick labels exist before they are restyled
g.fig.canvas.draw()
for ax in g.axes:
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize=18)
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=18)
plt.gcf().suptitle("All Hyperparameters", size=17, y=1.02);

# Runs where num_estimators and max_depth were varied but learning_rate was not
num_est_by_depth = did_3_results.query(
    "(num_estimators == num_estimators) & (max_depth == max_depth) & (learning_rate != learning_rate)"
)
num_est_by_depth = num_est_by_depth.assign(
    num_estimators=num_est_by_depth.num_estimators.astype("int"),
    max_depth=num_est_by_depth.max_depth.astype("int"),
)

# One panel per split (the `type` column), one line per max_depth
g = sns.catplot(
    x="num_estimators",
    y="auc_evaluator__target",
    col="type",
    hue="max_depth",
    kind="point",
    data=num_est_by_depth,
    col_wrap=1,
    height=3.8,
    aspect=2,
    sharey=False,
    markers="_",
)
# Draw first so the tick labels exist before they are restyled
g.fig.canvas.draw()
for ax in g.axes:
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=16)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize=16)
plt.gcf().suptitle("AUC - num_estimators x max_depth", size=17, y=1.02);

# Runs where learning_rate and max_depth were varied but num_estimators was not
lr_by_depth = did_3_results.query(
    "(num_estimators != num_estimators) & (max_depth == max_depth) & (learning_rate == learning_rate)"
)
lr_by_depth = lr_by_depth.assign(
    max_depth=lr_by_depth.max_depth.astype("int"),
    learning_rate=lr_by_depth.learning_rate.round(4),
)

# One panel per split (the `type` column), one line per max_depth
g = sns.catplot(
    x="learning_rate",
    y="auc_evaluator__target",
    col="type",
    hue="max_depth",
    kind="point",
    data=lr_by_depth,
    col_wrap=1,
    height=3.8,
    aspect=2,
    sharey=False,
    markers="_",
)
# Draw first so the tick labels exist before they are restyled
g.fig.canvas.draw()
for ax in g.axes:
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=16)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize=16)
plt.gcf().suptitle("AUC - learning_rate x max_depth", size=17, y=1.02);

All experiments: meta-statistics analysis¶

The meta-statistics are statistics calculated directly from information about the structure of the dataset and the features. They're not related to a machine learning model, but only to a static dataset. Specifically, they're calculated for each feature of the dataset, and encompass things like variable type, skewness of the data, cardinality of categorical features, etc.

all_dids = get_all_available_experiments()
print(f"Number of available datasets: {len(all_dids)}")
experiment_statistics = build_experiment_stats(all_dids);

Number of available datasets: 70

/Users/juliano.garcia/miniconda3/envs/boosting/lib/python3.6/site-packages/ipykernel_launcher.py:22: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

There's a high number of Categorical features in OPENML datasets. Probably due to the bioinfo datasets, which contain a lot of categorical data

# Non-null `top` entries per variable type, smallest group first
features_per_type = experiment_statistics.groupby("var_type")["top"].count().rename("count").sort_values()

fig, ax = plt.subplots(figsize=(17, 10))
ax.set_title("Feature type distribution", fontsize=17)
features_per_type.plot(kind="barh", ax=ax)
ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=17);

Most of the categorical features are binary

# Keep only features with a known cardinality ('--' and NaN are dropped)
cardinality = experiment_statistics[["cardinality"]].query("cardinality != '--'").dropna().astype(int)

# Values above the 95th percentile are clipped to it so the tail fits in one bar
cardinality_cap = np.percentile(cardinality.cardinality, 95)
capped_cardinality_counts = cardinality.cardinality.clip(upper=cardinality_cap).value_counts().sort_index()

fig, ax = plt.subplots(figsize=(17, 10))
ax.set_title("Cardinality count of categorical features", fontsize=17)
capped_cardinality_counts.plot(kind="bar", ax=ax);
fig.canvas.draw()  # draw the figure before updating the xlabels
ax.set_xticklabels(ax.get_xticklabels(), rotation=0, fontsize=17);

Most of the numerical features have 0 skewness, but a lower percentage have higher skewness

# Keep only features with a known skewness ('--' and NaN are dropped)
skewness = experiment_statistics[["skewness"]].query("skewness != '--'").dropna().astype(float)

# Values above the 95th percentile are clipped to it before building the histogram
skewness_cap = np.percentile(skewness.skewness, 95)
capped_skewness = skewness.assign(skewness=skewness.skewness.clip(upper=skewness_cap))

fig, ax = plt.subplots(figsize=(17, 10))
ax.set_title("Skewness distribution")
capped_skewness.plot.hist(ax=ax);
fig.canvas.draw()  # draw the figure before updating the xlabels
ax.set_xticklabels(ax.get_xticklabels(), rotation=0, fontsize=17);

Aggregated meta-statistics by dataset¶

To represent a dataset I need to aggregate the feature-wise meta-statistics for each dataset into "aggregated meta-statistics". This way we can represent a dataset as a point in this high dimensional space, and compare points (datasets) together. The aggregated features are calculated on the did_agg function, and are basically ratio of feature types, number of features, mean of skewness, etc

OBS: variance is defined as: 1 - (v / float(df.shape[0])), where v is the number of rows the most common value of a given feature appears. 0 is usually bad.

This is not the typical statistical variance, but rather a measure of how much diversity of values a categorical feature has in the dataset

def did_agg(did_df):
    """
    Collapse the per-feature meta-statistics of one dataset into a single row
    Parameters
    ----------
    did_df : pd.DataFrame
        one row per feature, with the columns count, skewness, variance,
        var_type and cardinality ('--' marks a value that does not apply)
    Returns
    -------
    pd.Series with the aggregated meta-statistics of the dataset
    """
    var_types = did_df["var_type"]
    num_features = did_df.shape[0]  # one row per feature

    def features_of_type(type_name):
        return var_types.isin([type_name]).sum()

    num_categorical = features_of_type("Categorical")
    known_skewness = did_df.query("skewness != '--'").skewness.dropna().astype(float)
    known_cardinality = did_df.query("cardinality == cardinality & cardinality != '--'").cardinality

    return pd.Series(
        {
            "num_rows": did_df["count"].iloc[0],
            "num_features": num_features,
            "mean_variance": did_df.query("variance != '--'").variance.mean(),
            # A dataset without any skewness value gets 0.0 rather than NaN
            "mean_skewness": known_skewness.mean() if not known_skewness.empty else 0.0,
            "num_categorical": num_categorical,
            # The +1 keeps the ratio defined when there is no categorical feature
            "sum_cardinality_over_categorical": known_cardinality.astype(int).sum() / (num_categorical + 1),
            # feature type ratios
            "categorical_ratio": num_categorical / num_features,
            "numeric_ratio": features_of_type("Numeric") / num_features,
            "boolean_ratio": features_of_type("Boolean") / num_features,
            "constant_ratio": features_of_type("Constant") / num_features,
        }
    )

I'll cap the aggregated columns for visualizations purposes.

Example of capper - by the 95th percentile

np.percentile(pd.Series(np.arange(100)), 95)

94.05

Scatter matrix plot

Checking for interactions between the aggregated meta-statistics

# One row of aggregated meta-statistics per dataset id
aggregated_stats = experiment_statistics.groupby("did").apply(did_agg)

# Every column is clipped at its own 90th percentile (upper side only)
capped_agg_stats = aggregated_stats.assign(
    **{
        col: aggregated_stats[col].clip(upper=np.percentile(aggregated_stats[col], 90))
        for col in aggregated_stats.columns
    }
)

scatter_axis = new_axis(title="Scatter matrix for aggregated statistics", figsize=(30, 17))
pd.plotting.scatter_matrix(capped_agg_stats, ax=scatter_axis);

/Users/juliano.garcia/miniconda3/envs/boosting/lib/python3.6/site-packages/ipykernel_launcher.py:7: UserWarning: To output multiple subplots, the figure containing the passed axes is being cleared
  import sys

aggregated_stats.shape
aggregated_stats.head()

(70, 10)

Visualizing the datasets in lower dimensions

from sklearn.preprocessing import StandardScaler
scaled_agg_stats = StandardScaler().fit_transform(capped_agg_stats)
scaled_agg_stats.shape

(70, 10)

from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, verbose=1, random_state=42)
tsne_results = tsne.fit_transform(scaled_agg_stats)

[t-SNE] Computing 69 nearest neighbors...
[t-SNE] Indexed 70 samples in 0.000s...
[t-SNE] Computed neighbors for 70 samples in 0.001s...
[t-SNE] Computed conditional probabilities for sample 70 / 70
[t-SNE] Mean sigma: 2.354606
[t-SNE] KL divergence after 250 iterations with early exaggeration: 50.328487
[t-SNE] KL divergence after 1000 iterations: 0.258376

t-SNE projections of the dataset

np.random.seed(42)
# The two t-SNE components become the x/y coordinates of each dataset
df_subset = pd.DataFrame({"tsne-one": tsne_results[:, 0], "tsne-two": tsne_results[:, 1]})
sns.scatterplot(x="tsne-one", y="tsne-two", data=df_subset, ax=new_axis(), s=1000);

Simple clustering¶

To analyze all experiments, first I need to cluster them together into categories that make sense; The initial idea was to cluster them together by hand, manually selecting clusters that have similar meta-statistics distributions; However, this proved to be somewhat difficult when analyzing all meta-statistics together, so I decided to use a clustering approach and check if the selected clusters made sense.

IMPORTANT:

The number of points (datasets) isn't that big, so I can actually check the dendrogram and see if the clusters are well behaved.

I used complete linkage and euclidean metric, using the standardized aggregated statistics, as these were the ones that gave the best clustering structure.

The clustering method, linkage and metrics chosen here can highly impact the analysis I did after. The idea is not to have a generic analysis that can work with any dataset, but to actually measure impact of the hyperparameters into specific clusters that follow a somewhat similar distribution of the aggregated statistics

from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# The linkage settings are named once and reused in the title, so the figure
# label always states what was actually computed.
# NOTE(review): the previous title and the notebook text both say "complete"
# linkage while the computation uses "centroid" - confirm which one is intended.
linkage_method = "centroid"
linkage_metric = "euclidean"
avg_linkage = linkage(scaled_agg_stats, method=linkage_method, metric=linkage_metric, optimal_ordering=False)
plt.figure(figsize=(20, 10))
plt.title(
    f"Hierarchical Clustering Dendrogram - {linkage_metric.capitalize()}, {linkage_method} linkage", fontsize=18
)
plt.xlabel('Dataset ID')
plt.ylabel('Distance')
# Leaves are labelled with the dataset ids taken from the aggregated_stats index
dendrogram(avg_linkage, labels=aggregated_stats.index.values, leaf_rotation=90,leaf_font_size=12.0);

from sklearn.cluster import KMeans
# Six clusters, 50 random initialisations, fixed seed for repeatable labels
kmeans_learner = KMeans(n_clusters=6, n_init=50, random_state=42)

# Fit on the standardized aggregated statistics; labels are 0-based cluster indices
kmeans_clusters = kmeans_learner.fit_predict(scaled_agg_stats)

# Datasets and the fitted centroids are stacked and embedded together, so the
# centroid rows come right after the dataset rows in the projection
centroid_projections = TSNE(n_components=2, perplexity=27, verbose=1, random_state=42).fit_transform(np.vstack([scaled_agg_stats, kmeans_learner.cluster_centers_]))

[t-SNE] Computing 75 nearest neighbors...
[t-SNE] Indexed 76 samples in 0.000s...
[t-SNE] Computed neighbors for 76 samples in 0.002s...
[t-SNE] Computed conditional probabilities for sample 76 / 76
[t-SNE] Mean sigma: 2.138719
[t-SNE] KL divergence after 250 iterations with early exaggeration: 53.366253
[t-SNE] KL divergence after 1000 iterations: 0.216709

# Label per projected row: 1-based cluster id for the datasets, then one
# "centroid" marker per k-means centre (same row order as the stacked projection)
n_centroids = kmeans_learner.cluster_centers_.shape[0]
calculated_clusters = np.concatenate([kmeans_clusters + 1, np.array(["centroid"] * n_centroids)])

df_centroids = pd.DataFrame(
    {
        "tsne-one": centroid_projections[:, 0],
        "tsne-two": centroid_projections[:, 1],
        "kmeans": calculated_clusters,
    }
)
df_centroids["type"] = np.where(df_centroids.kmeans == "centroid", "centroid", "dataset")

ax = new_axis(title="t-SNE projection - Clusters and centroids")
# One colour per distinct label, marker style separates datasets from centroids
label_palette = sns.color_palette("hls", len(df_centroids.kmeans.unique()))
sns.scatterplot(
    x="tsne-one",
    y="tsne-two",
    hue="kmeans",
    style="type",
    data=df_centroids,
    palette=label_palette,
    ax=ax,
    s=1000
)

<matplotlib.axes._subplots.AxesSubplot at 0x123bed9e8>

# Number of distinct cluster labels actually produced by k-means
nclusters = len(np.unique(kmeans_clusters))
print(nclusters)
# Store 1-based cluster ids on both the raw and the capped statistics
aggregated_stats["cluster"] = kmeans_clusters + 1
capped_agg_stats["cluster"] = kmeans_clusters + 1

6

Getting a sense of each cluster

Check if the distribution of aggregated meta-statistics over each cluster is different. This is more than a sanity check: It's a way to understand what are the main differences and similarities between the clusters!

import joypy
from matplotlib.colors import ListedColormap


def plot_cluster_joyplot(aggregated_df, column):
    """
    Draw one density per cluster for a single aggregated meta-statistic
    Parameters
    ----------
    aggregated_df : pd.DataFrame
        aggregated statistics with a `cluster` column
    column : str
        name of the statistic to plot
    Notes
    -----
    Reads the module-level `nclusters` to size the colormap.
    """
    cluster_cmap = ListedColormap(sns.color_palette("hls", nclusters).as_hex())
    # Independent jitter per row: a single scalar added to every row would only
    # shift the whole column and leave all ties in place
    jitter = np.random.random(len(aggregated_df)) * 1e-10
    df = aggregated_df.assign(**{column: aggregated_df[column] + jitter})
    # We don't want to plot a distribution when there's only one distinct value
    # (counted on the un-jittered data)
    to_remove = (
        aggregated_df.groupby("cluster")[column]
        .apply(lambda x: len(x.unique()))
        .reset_index()
        .query(f"{column} <= 1")
        .cluster.values
    )
    fig, axes = joypy.joyplot(
        df.query("cluster not in @to_remove"),
        by="cluster",
        column=column,
        figsize=(10, 10),
        colormap=cluster_cmap,
        ylim="own",
        overlap=0,
    )
    fig.suptitle(f"Distribution of {column} by cluster", size=18, y=1.05)

aggregated_stats.cluster.values

array([2, 5, 6, 3, 3, 3, 3, 3, 3, 3, 6, 6, 1, 5, 2, 6, 5, 1, 1, 1, 1, 1,
       1, 1, 6, 5, 5, 1, 1, 1, 6, 2, 5, 6, 6, 6, 1, 5, 1, 6, 5, 1, 5, 1,
       6, 5, 1, 2, 1, 1, 3, 3, 2, 5, 1, 4, 6, 5, 4, 1, 1, 1, 2, 6, 4, 4,
       4, 6, 1, 6], dtype=int32)

aggregated_stats.cluster.value_counts().sort_index()

1    23
2     6
3     9
4     5
5    12
6    15
Name: cluster, dtype: int64

When checking the distribution of the engineered variables the clusters seem to separate datasets in a good manner

for metric in aggregated_stats.columns[:-1]:
    plot_cluster_joyplot(capped_agg_stats, column=metric)

/Users/juliano.garcia/miniconda3/envs/boosting/lib/python3.6/site-packages/matplotlib/figure.py:98: MatplotlibDeprecationWarning: 
Adding an axes using the same arguments as a previous axes currently reuses the earlier instance.  In a future version, a new instance will always be created and returned.  Meanwhile, this warning can be suppressed, and the future behavior ensured, by passing a unique label to each axes instance.
  "Adding an axes using the same arguments as a previous axes "

Cluster specific Analysis¶

Each cluster now has similar datasets, according to the aggregated meta-statistics chosen. The idea is then to analyze, in each cluster, the impact (if there's an impact) of the hyperparameters in the AUC. For this reason, I'm calculating the delta_auc and other metrics (brier and logloss) from the baseline. The baseline metric for a given dataset and hyperparameter is the metric we would obtain when training a model using LightGBM default parameters (or the closest one to the default parameters).

def calculate_cluster_object(cluster_id, agg_stat_df):
    """
    Concatenate the experiment results of every dataset assigned to a cluster
    Parameters
    ----------
    cluster_id : int
        value matched against the `cluster` column
    agg_stat_df : pd.DataFrame
        aggregated statistics indexed by dataset id, with a `cluster` column
    """
    member_dids = agg_stat_df.query(f"cluster == {cluster_id}").index.values
    print(member_dids.shape)
    # Each dataset's result frame is tagged with its own id before stacking
    cluster_exps = {
        did: build_result_dataframe(open_experiment(did=did).final_result).assign(did=did)
        for did in member_dids
    }
    print(cluster_exps.keys())
    return pd.concat(cluster_exps.values())

import cloudpickle
import gc
from collections import defaultdict
# for cluster_id in range(1, 7):
#     print(f"\n------- Building cluster {cluster_id} -------\n")
#     curr_cluster = calculate_cluster_object(cluster_id=cluster_id, agg_stat_df=aggregated_stats)
#     open(f"cluster_{cluster_id}.pkl", "wb").write(cloudpickle.dumps(curr_cluster))
#     del curr_cluster
#     gc.collect()

Dataset names for each cluster

# Print the OpenML name of every dataset, grouped under its cluster id (1..6)
for cluster_id in range(1, 7):
    cluster_dids = aggregated_stats.query(f"cluster == {cluster_id}").index.values
    print(f"------- {cluster_id} -------")
    for did in cluster_dids:
        dataset_name = open_experiment(did=did).openml_object.name
        print(f"\t• {dataset_name}")

# Reclaim the memory held by the experiments loaded above
gc.collect()

------- 1 -------
	• visualizing_soil
	• mfeat-morphological
	• analcatdata_halloffame
	• mfeat-fourier
	• JapaneseVowels
	• letter
	• mfeat-factors
	• waveform-5000
	• mfeat-zernike
	• pendigits
	• mfeat-karhunen
	• musk
	• MagicTelescope
	• hill-valley
	• creditcard
	• CreditCardSubset
	• higgs
	• numerai28.6
	• churn
	• Satellite
	• Speech
	• Run_or_walk_information
	• USPS
------- 2 -------
	• kr-vs-kp
	• splice
	• mfeat-pixel
	• PhishingWebsites
	• 20_newsgroups.drift
	• Internet-Advertisements
------- 3 -------
	• BNG(kr-vs-kp)
	• BNG(labor,nominal,1000000)
	• BNG(breast-cancer,nominal,1000000)
	• BNG(mushroom)
	• BNG(colic.ORIG,nominal,1000000)
	• BNG(credit-a,nominal,1000000)
	• BNG(credit-g,nominal,1000000)
	• BNG(credit-g)
	• BNG(spambase)
------- 4 -------
	• solar-flare
	• dis
	• jungle_chess_2pcs_endgame_elephant_elephant
	• jungle_chess_2pcs_endgame_rat_rat
	• jungle_chess_2pcs_endgame_lion_lion
------- 5 -------
	• credit-g
	• socmob
	• nursery
	• cmc
	• car
	• ada_prior
	• adult-census
	• bank-marketing
	• adult
	• Amazon_employee_access
	• mofn-3-7-10
	• parity5_plus_5
------- 6 -------
	• spambase
	• cpu_small
	• elevators
	• segment
	• optdigits
	• page-blocks
	• kc1
	• pc1
	• pc2
	• autoUniv-au1-1000
	• Bioresponse
	• Titanic
	• wilt
	• Sick_numeric
	• telco-customer-churn

0

Toggle figure saving¶

SAVE_PLOTS_GLOBAL = True

General utility functions¶

def update_pointplot_xlabels(multiple_of=5, rotation=0, fontsize=20, xticks=True):
    """Thin out the x tick labels of the current axis (the one plt.gca returns).

    With xticks=True only every `multiple_of`-th tick and label are kept.
    With xticks=False all ticks stay and the labels in between are blanked.
    """
    current_axis = plt.gca()
    labels = current_axis.get_xticklabels()
    if xticks:
        kept_labels = [label for pos, label in enumerate(labels) if pos % multiple_of == 0]
        kept_ticks = [tick for pos, tick in enumerate(current_axis.get_xticks()) if pos % multiple_of == 0]
        current_axis.set_xticks(kept_ticks)
    else:
        kept_labels = [label if pos % multiple_of == 0 else "" for pos, label in enumerate(labels)]
    current_axis.set_xticklabels(kept_labels, rotation=rotation, fontsize=fontsize)

# Plot the deltas calculated from a given analysis
def plot_deltas(df_list, treatment, y="delta_metric", scale=1.0, cap_outliers=False, save_plot=False, loc=0, **kwargs):
    """
    Plot, for each metric, the test-set deltas against a hyperparameter
    Parameters
    ----------
    df_list : List[pd.DataFrame]
        one frame per metric, with the columns `metric`, `type`, `did`,
        the treatment column and the `y` column
    treatment : str
        column shown on the x axis
    y : str
        column shown on the y axis
    scale : float
        forwarded to sns.pointplot
    cap_outliers : bool
        when True, `y` is clipped to the 5th-95th percentile range of the test rows
    save_plot : bool
        save the figure as PNG; the module-level SAVE_PLOTS_GLOBAL also enables saving
    loc :
        legend location forwarded to plt.legend
    **kwargs :
        forwarded to update_pointplot_xlabels
    Notes
    -----
    Float treatment columns are rounded to 5 decimals in place on the frames
    of df_list. The saved file name uses the module-level `curr_cluster`.
    """
    for df in df_list:
        metric = df.metric.unique()[0]
        if df[treatment].dtype == float:
            df[treatment] = df[treatment].round(5)
        temp_df = df.query(test_pred)
        if cap_outliers:
            # Cap the already-filtered test rows; building the capped frame from
            # `df` would bring the non-test rows back into a "Test set" plot
            lower, upper = np.percentile(temp_df[y], [5, 95])
            temp_df = temp_df.assign(**{y: temp_df[y].clip(lower=lower, upper=upper)})
        sns.pointplot(
            x=treatment,
            y=y,
            data=temp_df,
            hue="did",
            linestyles="",
            ci=None,
            ax=new_axis(title=f"Delta {metric} (NEW - BASELINE) from LightGBM baseline - Test set", figsize=(22, 10)),
            scale=scale,
        )
        update_pointplot_xlabels(**kwargs)
        plt.legend(loc=loc)
        if save_plot or SAVE_PLOTS_GLOBAL:
            plt.savefig(
                "delta_" + metric[: metric.find("_")] + "_cluster" + str(curr_cluster) + "_" + treatment + ".png",
                bbox_inches="tight",
                pad_inches=0,
            )
        plt.show()

train_pred = "type == 'train'"
test_pred = "type == 'test'"

@fp.curry
def calculate_delta_baseline(did_df, metric="auc_evaluator__target", diff_clause=None):
    """For a given dataframe, calculate the delta of the metric related to the baseline metric.

    The baseline is the metric value of the test-set row that comes first when
    the test rows are sorted (ascending) by `diff_clause`.
    The dataframe is considered as a group from groupby, where each dataframe
    contains the experiments for a given dataset id.

    Parameters
    ----------
    did_df : pd.DataFrame
        experiments of one dataset; needs the `type`, `metric` and diff columns
    metric : str
        name of the metric column
    diff_clause : str, list or tuple
        column(s) to sort by when picking the baseline row; defaults to
        ["diff_num_est"]

    Returns
    -------
    A copy of did_df with the columns baseline_metric, delta_metric and metric
    added and the index reset
    """
    diff_clause = ["diff_num_est"] if diff_clause is None else diff_clause
    if isinstance(diff_clause, tuple):
        # sort_values expects a list (or a single label), not a tuple of labels
        diff_clause = list(diff_clause)
    baseline_index = did_df.query(test_pred).sort_values(by=diff_clause).index[0]
    baseline_value = did_df[metric].loc[baseline_index]
    return did_df.assign(
        baseline_metric=baseline_value,
        delta_metric=lambda df: df[metric] - df.baseline_metric,
        metric=metric
    ).reset_index(drop=True)

# this is used for multiple experiments
def grouped_experiment(cluster, query_string, name, diff_hyperparam_fn, diff_clause=None):
    """
    Compute the per-dataset deltas from the baseline for AUC, Brier score and logloss
    Parameters
    ----------
    cluster : pd.DataFrame
        experiment results of a cluster, with a `did` column
    query_string : str
        filter applied to `cluster` before anything else
    name : str
        hyperparameter name; "diff_<name>" is the sort column when diff_clause is None
    diff_hyperparam_fn : Callable[[pd.DataFrame], pd.DataFrame]
        applied through .pipe to the filtered frame before grouping
    diff_clause : str, list or tuple
        explicit sort column(s) for picking the baseline
    Returns
    -------
    List with the AUC, Brier score and logloss delta frames, in that order
    """
    sort_columns = diff_clause if diff_clause is not None else f"diff_{name}"
    if isinstance(sort_columns, list):
        sort_columns = tuple(sort_columns)

    grouped_all_dids = cluster.query(query_string).pipe(diff_hyperparam_fn).groupby("did")

    metric_columns = (
        "auc_evaluator__target",
        "brier_score_evaluator__target",
        "logloss_evaluator__target",
    )
    deltas_df_list = [
        grouped_all_dids.apply(calculate_delta_baseline(metric=metric_column, diff_clause=sort_columns))
        for metric_column in metric_columns
    ]
    # Shape of the AUC deltas, printed as a quick sanity check
    print(deltas_df_list[0].shape)
    return deltas_df_list

# used to convert a dictionary with the p-values into a dataframe
def build_statistics_df(statistical_dict):
    """
    Flatten a nested {treatment: {metric: {statistic: value}}} dict into a dataframe
    Parameters
    ----------
    statistical_dict : Dict[str, Dict[str, Dict[str, Numeric]]]
        statistical results keyed by treatment and then by metric name
    Returns
    -------
    pd.DataFrame with one row per (treatment, metric) pair, one column per
    statistic plus `metric` and `treatment`; empty when there is nothing to flatten
    """
    suffix = "_evaluator__target"
    rows = []
    for treatment, metric_d in statistical_dict.items():
        for metric, statistical_results_d in metric_d.items():
            rows.append(
                pd.DataFrame(statistical_results_d, index=[0]).assign(
                    # partition keeps the whole name when the suffix is absent
                    # (slicing with str.find's -1 would chop the last character)
                    metric=metric.partition(suffix)[0],
                    treatment=treatment,
                )
            )
    if not rows:
        # pd.concat refuses an empty list; keep returning an empty frame
        return pd.DataFrame()
    # Single concat instead of growing frames inside the loops
    return pd.concat(rows, ignore_index=True)

# used to color red the pvalues < 0.05
def color_negative_red(val):
    """Return the CSS colour rule for a cell: red for non-string values below 0.05, black otherwise."""
    is_significant = not isinstance(val, str) and val < 0.05
    if is_significant:
        return 'color: %s' % 'red'
    return 'color: %s' % 'black'

Analysis of variance - Kruskal-Wallis¶

Before checking the impact of the hyperparameters directly, we'll first test if there's a statistically significant difference in them.

Basically the steps done to check and analyze the data are:

fit a single-factor model on the data, considering the treatment as the hyperparameter being changed. The value is the delta_metric
Plot the residuals of the fitted model in a probability plot (Q-Q plot)
Check for homoscedasticity using Levene's test
"If your data are heteroscedastic, Kruskal–Wallis is no better than one-way anova, and may be worse" - http://www.biostathandbook.com/kruskalwallis.html
Test for normality of the residuals using Shapiro-Wilk test
Perform an ANOVA test. This result is only useful if we guarantee that the ANOVA assumptions HOLD
Perform a non-parametric Kruskal-Wallis test. Since I saw that almost none of the residuals are normally distributed (I don't have enough evidence to say they are), this is the main test to check difference in the median of the rankings of delta_metrics.
If the kruskal-wallis returned a significant p-value, I proceed to run a Conover pairwise posthoc test for the rank significance between the groups themselves. This procedure here is to check for significant disparities between hyperparameter values

def fit_single_factor_model(df, treatment, value_col="value"):
    """ Fit a single-factor model where:
            y_ij = mu + t_i + eps_ij
        The estimates done are:
            mu_pred = mean(Y)
            t_i_pred = mean(Y_i) - mean(Y), for i = 1, 2, ...n
        Intuitively, the overall mean is estimated by the grand average of the observations and
        that any treatment effect is just the difference between the treatment average and the grand average.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per observation. Must contain the `treatment` and `value_col` columns;
        any other columns are ignored.
    treatment : str
        Name of the column that identifies the treatment (factor level) of each row.
    value_col : str
        Name of the column holding the observed response.

    Returns
    -------
    pandas.DataFrame
        One row per treatment level (the level is the index) with the columns
        `treatment_mean`, `mu_pred` and `tau_i_pred`.
    """
    mu_pred = df[value_col].values.mean()
    output_cols = ["treatment_mean", "mu_pred", "tau_i_pred"]
    return (
        df.assign(
            # Pick the value column *before* transforming: averaging every column of the
            # frame fails on non-numeric columns (e.g. a metric name) and is wasted work.
            treatment_mean=lambda frame: frame.groupby(treatment)[value_col].transform("mean"),
            tau_i_pred=lambda frame: frame.treatment_mean - mu_pred,
            mu_pred=mu_pred,
        )
        .groupby(treatment)[output_cols]
        # every row of a group carries the same estimates, so the first one is enough
        .agg("first")
    )

import scipy.stats as stats
import pingouin as pg


def plot_residuals(df, sfm_params, treatment, value_col="value", normalize=True, scipy_plot=False):
    """Compute the single-factor-model residuals and draw their normal Q-Q plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Observations; needs the `treatment`, `value_col` and `metric` columns.
    sfm_params : pandas.DataFrame
        Per-treatment estimates with `mu_pred` and `tau_i_pred`, joinable to `df` on `treatment`.
    treatment : str
        Name of the treatment column used as the join key.
    value_col : str
        Name of the observed response column.
    normalize : bool
        When True the residuals are standardized (zero mean, unit standard deviation) before plotting.
    scipy_plot : bool
        When True use `scipy.stats.probplot`, otherwise `pingouin.qqplot`.

    Returns
    -------
    pandas.DataFrame
        `df` merged with `sfm_params` plus the `y_treatment_pred` and `residual` columns.

    Notes
    -----
    Relies on the module-level `new_axis`, `SAVE_PLOTS_GLOBAL` and `curr_cluster` names.
    """
    residuals_df = df.merge(right=sfm_params, on=treatment).assign(
        y_treatment_pred=lambda frame: frame.mu_pred + frame.tau_i_pred,
        residual=lambda frame: frame[value_col] - frame.y_treatment_pred,
    )
    residuals_vector = residuals_df.residual.values
    # Residuals around the group means always add up to ~0, so testing their sum says nothing
    # (and only triggers by floating-point luck). The degenerate case is "every residual is zero",
    # which would also make the standardization below divide by a zero standard deviation.
    if np.allclose(residuals_vector, 0.0):
        print("------ Treatment has no residuals -  Cannot apply Single-factor model -------")
        return residuals_df
    if normalize:
        res = (residuals_vector - np.mean(residuals_vector)) / np.std(residuals_vector)
    else:
        res = residuals_vector
    ax = new_axis()
    if scipy_plot:
        stats.probplot(res, plot=ax)
    else:
        pg.qqplot(res, dist="norm", ax=ax)
    metric = df.metric.unique()[0]
    if SAVE_PLOTS_GLOBAL:
        # keep only the text before the first underscore; unlike slicing with str.find this
        # does not chop the last character when the metric name has no underscore
        metric_prefix = metric.partition("_")[0]
        plt.savefig(
            "qqplot_" + metric_prefix + "_cluster" + str(curr_cluster) + "_" + treatment + ".png",
            bbox_inches="tight",
            pad_inches=0,
        )
    return residuals_df

def plot_sfm_model(
    sfm_model,
    treatment,
    metric,
    residuals_width=3,
    num_points_to_highlight=None,
    highlight_size=100,
    lower_is_better=True,
    **kwargs,
):
    """Plot the fitted single-factor model: overall mean line plus one bar per treatment effect.

    Parameters
    ----------
    sfm_model : pandas.DataFrame
        Per-treatment estimates with the `treatment_mean`, `mu_pred` and `tau_i_pred` columns;
        the `treatment` column must be available after `reset_index()`.
    treatment : str
        Name of the treatment column, used for the x axis.
    metric : str
        Metric name, used in the title and in the saved file name.
    residuals_width : float
        Width of the bars, in x-axis units.
    num_points_to_highlight : int or None
        How many extreme treatments to mark on each side of the overall mean.
        None means 12% of the number of treatments; a non-positive value falls back to 5.
    highlight_size : float
        Marker size of the highlighted treatments.
    lower_is_better : bool
        Decides which side of the mean is labelled "best treatments" / "worst treatments".
    **kwargs
        When non-empty, forwarded to the module-level `update_pointplot_xlabels`. If it
        contains the key "plot_original" the x ticks are set to the raw treatment values.

    Returns
    -------
    dict
        `metric` and `sfm_df` (the sorted frame that was plotted).

    Notes
    -----
    Relies on the module-level `SAVE_PLOTS_GLOBAL` and `curr_cluster` names.
    """
    fig, ax = plt.subplots(figsize=(20, 10))
    curr_df = sfm_model.reset_index().sort_values(treatment)
    neg_df = curr_df.query("treatment_mean-mu_pred <= 0")
    pos_df = curr_df.query("treatment_mean-mu_pred > 0")
    # by default highlight 12% of the plotted treatments on each side of the overall mean
    if num_points_to_highlight is None:
        num_points_to_highlight = int((pos_df.shape[0] + neg_df.shape[0]) * 0.12)
    if num_points_to_highlight <= 0:
        num_points_to_highlight = 5
    c1, c2, c3, c4, c5 = sns.color_palette("deep")[:5]
    # categorical data needs to be treated differently
    # (isinstance check replaces the deprecated pd.api.types.is_categorical_dtype)
    if isinstance(curr_df[treatment].dtype, pd.CategoricalDtype):
        ax.plot(
            list(curr_df[treatment].values),
            list(curr_df["mu_pred"].values),
            color=c1,
            linewidth=3,
            label="overall mean",
        )
    else:
        ax.plot(curr_df[treatment], curr_df["mu_pred"], color=c1, linewidth=3, label="overall mean")
    # Bars above the overall mean. Row-wise mu_pred is used instead of `.iloc[0]` and the
    # call is guarded so that a frame without any positive effect no longer raises IndexError.
    if not pos_df.empty:
        ax.bar(
            pos_df[treatment],
            height=pos_df["treatment_mean"] - pos_df["mu_pred"],
            bottom=pos_df["mu_pred"],
            width=residuals_width,
            color=c2,
            edgecolor=None,
            linewidth=0.1,
        )
    # Bars below (or on) the overall mean: they start at the treatment mean and reach up to mu_pred.
    if not neg_df.empty:
        ax.bar(
            neg_df[treatment],
            height=neg_df["tau_i_pred"].abs(),
            bottom=neg_df["treatment_mean"],
            width=residuals_width,
            color=c3,
            edgecolor=None,
            linewidth=0.1,
        )
    top_treatment_means = pos_df.sort_values(by="treatment_mean", ascending=False).iloc[:num_points_to_highlight]
    worst_treatment_means = neg_df.sort_values(by="treatment_mean").iloc[:num_points_to_highlight]
    # a single RGB colour must be passed to scatter as a (1, 3) array
    if not top_treatment_means.empty:
        ax.scatter(
            top_treatment_means[treatment],
            top_treatment_means["treatment_mean"],
            s=highlight_size,
            c=np.array(c4)[:, None].T if lower_is_better else np.array(c5)[:, None].T,
            zorder=120,
            label="worst treatments" if lower_is_better else "best treatments",
        )
    if not worst_treatment_means.empty:
        ax.scatter(
            worst_treatment_means[treatment],
            worst_treatment_means["treatment_mean"],
            s=highlight_size,
            c=np.array(c5)[:, None].T if lower_is_better else np.array(c4)[:, None].T,
            zorder=120,
            label="best treatments" if lower_is_better else "worst treatments",
        )
    ax.legend()
    ax.set_xlabel(treatment)
    ax.set_ylabel("delta_metric")
    ax.set_title(f"Single-factor model - {metric}", size=18)
    if kwargs:
        fig.canvas.draw()  # draw the figure before updating the xlabels
        update_pointplot_xlabels(**kwargs)
        if "plot_original" in kwargs:
            ax.set_xticks(curr_df[treatment].values)
            ax.set_xticklabels(curr_df[treatment].values)
    if SAVE_PLOTS_GLOBAL:
        # keep only the text before the first underscore of the metric name
        metric_prefix = metric.partition("_")[0]
        plt.savefig(
            "sfm_" + metric_prefix + "_cluster" + str(curr_cluster) + "_" + treatment + ".png",
            bbox_inches="tight",
            pad_inches=0,
        )
    return dict(metric=metric, sfm_df=curr_df)

from IPython.display import display
import scikit_posthocs as sp  # perform posthoc tests


def perform_individual_sfm_analysis(
    df_list,
    treatment,
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=True,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    run_sfm_plot=True,
    sfm_plot_args=None,
    statistical_test_dict=None
):
    """Run the single-factor-model analysis pipeline for one treatment on every metric frame.

    Each frame of `df_list` is first filtered with the module-level `test_pred` query and
    its `delta_metric` is clipped to its own 5th-95th percentile range. The enabled steps
    are then applied step by step, each step to every frame.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        One frame per metric; each needs the `metric`, `delta_metric` and `treatment` columns.
    treatment : str
        Name of the treatment column.
    run_residuals : bool
        Draw the residual Q-Q plot and run the Shapiro-Wilk test on the residuals.
    run_homoscedasticity : bool
        Run `pingouin.homoscedasticity` across treatment groups.
    run_anova : bool
        Display the one-way ANOVA table (uses the `DataFrame.anova` method, presumably the
        one registered by pingouin).
    run_kruskal : bool
        Run the Kruskal-Wallis test; only this step produces the return value.
    run_kruskal_posthoc : bool
        When Kruskal-Wallis is significant, also plot the Holm-adjusted Conover p-values.
    run_sfm_plot : bool
        When Kruskal-Wallis is significant, plot the fitted single-factor model.
    sfm_plot_args : dict or None
        Extra keyword arguments for `plot_sfm_model`.
    statistical_test_dict : dict or None
        Optional mapping, updated in place as `[treatment][metric] -> {test name: p-value}`
        with the keys "shapiro", "levene" and "kruskal".

    Returns
    -------
    list of dict or None
        With `run_kruskal`, one entry per frame: the `plot_sfm_model` result when the test is
        significant and the plot is enabled, otherwise an empty dict. None without `run_kruskal`.
    """

    def index_dataframe(df):
        return (
            df.query(test_pred)
            .reset_index(drop=True)
            .assign(
                delta_metric=lambda df: df.delta_metric.clip(
                    lower=np.percentile(df.delta_metric, 5), upper=np.percentile(df.delta_metric, 95)
                )
            )
        )

    df_list = [index_dataframe(df) for df in df_list]
    sfm_plot_args = {} if sfm_plot_args is None else sfm_plot_args

    print(
        "Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution\n"
    )
    print(
        "Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal"
    )
    sp_heatmap_args = {
        "linewidths": 0.25,
        "linecolor": "0.5",
        "clip_on": False,
        "square": True,
        "cbar_ax_bbox": [0.80, 0.35, 0.04, 0.3],
    }

    def record_test_results(metric, results, reset=False):
        # Store p-values under [treatment][metric]. Missing entries are created on demand so
        # that each step can be run on its own; `reset` drops what was stored before.
        if statistical_test_dict is None:
            return
        per_treatment = statistical_test_dict.setdefault(treatment, {})
        previous = {} if reset else per_treatment.get(metric, {})
        per_treatment[metric] = fp.merge(previous, results)

    def loop_over_experiments(function):
        return_results = []
        for df in df_list:
            res = function(df)
            if res is not None:
                return_results.append(res)
        return return_results

    # define analysis to be performed for each dataset
    def residuals(df):
        metric = df.metric.unique()[0]
        print(f"Residuals probplot: {metric}")
        res = plot_residuals(
            df=df,
            sfm_params=fit_single_factor_model(df, treatment=treatment, value_col="delta_metric"),
            treatment=treatment,
            value_col="delta_metric",
        )
        plt.show()
        residuals_shapiro = res.residual.values
        # SciPy documents that the Shapiro-Wilk p-value may be inaccurate above 5000 samples,
        # so larger residual sets are subsampled (with a fixed seed for reproducibility)
        if residuals_shapiro.shape[0] > 5000:
            residuals_shapiro = res.residual.sample(n=5000, random_state=42)
        p_shapiro = stats.shapiro(residuals_shapiro)[1]
        record_test_results(metric, {"shapiro": p_shapiro}, reset=True)
        print(f"\nShapiro-Wilk p-value = {p_shapiro}. {'REJECT NORMALITY' if p_shapiro <= 0.05 else 'CANNOT REJECT'}")

    def homoscedasticity(df):
        metric = df.metric.unique()[0]
        print(f"\nHomoscedasticity - Metric: {metric}")
        hmscd = pg.homoscedasticity(data=df, dv="delta_metric", group=treatment)
        record_test_results(metric, {"levene": hmscd.pval.unique()[0]})
        display(hmscd)

    def anova(df):
        print(f"\nANOVA test - Metric: {df.metric.unique()[0]}")
        display(df.anova(dv="delta_metric", between=treatment, detailed=True))

    def kruskal(df):
        metric = df.metric.unique()[0]
        print(f"\nKruskal-wallis test - Metric: {metric}")
        # treatment groups with fewer than two observations are left out of the test
        sample = []
        for group_index in df.groupby(treatment).groups.values():
            group_values = df.loc[group_index].delta_metric.values
            if len(group_values) >= 2:
                sample.append(group_values)
        if len(sample) < 2:
            print("--- Not enough Samples in treatment group ---")
            return {}
        kruskal_result = stats.kruskal(*sample)
        print(kruskal_result)
        kruskal_p = kruskal_result[1]
        record_test_results(metric, {"kruskal": kruskal_p})

        # specific threshold when the group is statistically significant at 95%
        if kruskal_p < 0.05:
            if run_kruskal_posthoc:
                ph_conover = sp.posthoc_conover(df, val_col="delta_metric", group_col=treatment, p_adjust="holm")
                sp.sign_plot(
                    ph_conover,
                    ax=new_axis(figsize=(10, 8), title=f"Conover p-values for {metric}"),
                    **sp_heatmap_args,
                )
            if run_sfm_plot:
                delta_lower_is_better = dict(
                    brier_score_evaluator__target=True, logloss_evaluator__target=True, auc_evaluator__target=False
                )
                updated_sfm_plot_args = fp.merge(
                    sfm_plot_args, dict(lower_is_better=delta_lower_is_better[metric])
                )
                return plot_sfm_model(
                    sfm_model=fit_single_factor_model(df, treatment=treatment, value_col="delta_metric").reset_index(),
                    treatment=treatment,
                    metric=metric,
                    **updated_sfm_plot_args,
                )
        return {}

    # apply each analysis to every dataset
    if run_residuals:
        loop_over_experiments(residuals)
    if run_homoscedasticity:
        loop_over_experiments(homoscedasticity)
    if run_anova:
        loop_over_experiments(anova)
    if run_kruskal:
        return loop_over_experiments(kruskal)
    return None

from collections import defaultdict
all_cluster_results = defaultdict(dict)

Cluster 1¶

# Load the pickled experiment table of cluster 1; the context manager closes the file
# handle, which the bare open() call left to the garbage collector.
with open("cluster_1.pkl", "rb") as cluster_file:
    cluster_1 = pickle.load(cluster_file)
cluster_1.shape
print("Number of datasets:", len(cluster_1.did.unique()))
# read by the plotting helpers when building the names of the saved figures
curr_cluster = 1

(121670, 13)

Number of datasets: 23

to_remove_1 = [1597, 4154, 40922]
cluster_1 = cluster_1.query("did not in @to_remove_1")

st_tests_1 = defaultdict(dict)

Individual Treatments¶

Num estimators¶

# Query selecting the runs where only num_estimators is set.
# NaN never equals itself, so `col == col` keeps the rows where `col` is not NaN and
# `col != col` keeps the rows where it is NaN (presumably: hyperparameter left at its
# default - confirm against how the experiment table is built).
individual_num_est_pred = (
    "(num_estimators == num_estimators) & (max_depth != max_depth) & (learning_rate != learning_rate)"
)


def individual_num_est_diff(df):
    """Return a copy of `df` with num_estimators cast to int and its distance to the default.

    The new `diff_num_est` column is |num_estimators - 100| (100 is the default).
    """
    default_num_estimators = 100
    num_estimators = df.num_estimators.astype(int)
    return df.assign(
        num_estimators=num_estimators,
        diff_num_est=(num_estimators - default_num_estimators).abs(),
    )


num_est_df_list = grouped_experiment(
    cluster=cluster_1, query_string=individual_num_est_pred, name="num_est", diff_hyperparam_fn=individual_num_est_diff
)

(800, 17)

plot_deltas(num_est_df_list, treatment="num_estimators")

When changing num_estimators individually, Kruskal-Wallis doesn't provide enough evidence to reject the null hypothesis for AUC and Brier score; log loss is the exception (p ≈ 0.0004).

all_cluster_results[curr_cluster]["num_estimators"] = perform_individual_sfm_analysis(
    num_est_df_list,
    treatment="num_estimators",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    statistical_test_dict=st_tests_1
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 4.965993800110704e-19. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 3.7109030667859604e-24. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 6.203025343564139e-23. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=8.597988809017151, pvalue=0.9951831715199703)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=10.657224403362845, pvalue=0.9793032581227489)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=51.4394967779547, pvalue=0.0003733918867836595)

Max depth¶

# Query selecting the runs where only max_depth is set.
# NaN never equals itself, so `col == col` keeps the rows where `col` is not NaN and
# `col != col` keeps the rows where it is NaN (presumably: hyperparameter left at its
# default - confirm against how the experiment table is built).
individual_max_depth_pred = (
    "(num_estimators != num_estimators) & (max_depth == max_depth) & (learning_rate != learning_rate)"
)


def individual_max_depth_diff(df):
    """Return a copy of `df` with max_depth cast to int and its distance to the default.

    The new `diff_max_depth` column is |max_depth - 5|; the default is max_depth = 5
    because n_leaves = 31.
    """
    default_max_depth = 5
    max_depth = df.max_depth.astype(int)
    return df.assign(
        max_depth=max_depth,
        diff_max_depth=(max_depth - default_max_depth).abs(),
    )


max_depth_df_list = grouped_experiment(
    cluster=cluster_1,
    query_string=individual_max_depth_pred,
    name="max_depth",
    diff_hyperparam_fn=individual_max_depth_diff,
)

(800, 17)

plot_deltas(max_depth_df_list, treatment="max_depth", multiple_of=1)

all_cluster_results[curr_cluster]["max_depth"] = perform_individual_sfm_analysis(
    max_depth_df_list,
    treatment="max_depth",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=0.3, highlight_size=400),
    statistical_test_dict=st_tests_1
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 3.388218626825648e-21. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 8.838670219986008e-15. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 9.113384688824028e-14. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=21.882460441553246, pvalue=0.290145041334565)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=46.52563871369858, pvalue=0.00041740480967144655)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=35.40856429204412, pvalue=0.012465099807488124)

Learning Rate¶

# Query selecting the runs where only learning_rate is set.
# NaN never equals itself, so `col == col` keeps the rows where `col` is not NaN and
# `col != col` keeps the rows where it is NaN (presumably: hyperparameter left at its
# default - confirm against how the experiment table is built).
individual_lr_pred = "(num_estimators != num_estimators) & (max_depth != max_depth) & (learning_rate == learning_rate)"


def individual_lr_diff(df):
    """Return a copy of `df` with learning_rate cast to float and its distance to the default.

    The new `diff_lr` column is |learning_rate - 0.1| (0.1 is the default lr).
    """
    default_lr = 0.1
    learning_rate = df.learning_rate.astype(float)
    return df.assign(
        learning_rate=learning_rate,
        diff_lr=(learning_rate - default_lr).abs(),
    )


lr_df_list = grouped_experiment(
    cluster=cluster_1, query_string=individual_lr_pred, name="lr", diff_hyperparam_fn=individual_lr_diff
)

(200, 17)

plot_deltas(lr_df_list, treatment="learning_rate", multiple_of=3, rotation=45)

all_cluster_results[curr_cluster]["learning_rate"] = perform_individual_sfm_analysis(
    lr_df_list,
    treatment="learning_rate",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.002, highlight_size=70, multiple_of=1),
    statistical_test_dict=st_tests_1
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
------ Treatment has no residuals -  Cannot apply Single-factor model -------

Shapiro-Wilk p-value = 1.0133217642760428e-07. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 5.6172714456249537e-14. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 7.958838801761306e-14. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=22.877959309494454, pvalue=0.0287800782055724)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=54.38738027864609, pvalue=2.3299816669512344e-07)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=53.11633590474057, pvalue=3.9276239350538573e-07)

Multiple Treatments¶

When analyzing multiple parameters being run together, the baseline is determined to be the hyperparameter combination with the lowest delta of both of them. Basically, get all the experiments where the diff between hyperparameter a is the minimum, and from those, get the experiment where the hyperparameter b is the minimum, and so on.

Multiple treatments - Utility function

from functools import reduce

def create_multiple_treatment_column(df, columns, treatment_name_output, order_strategy="rank"):
    """When dealing with multiple treatments, this function will create an ordered categorical column,
       which represents a given experiment combination of hyperparameters.
       The ordering can be done by either passing:
           - order_strategy = "rank",
               which means that the ordering is based on the mean of the individual ranks
               of each hyperparameter
           - order_strategy = "column"
               which means that the ordering is based on the sort_values result of the
               'columns' argument.

       Parameters
       ----------
       df : pandas.DataFrame
           Frame holding every column listed in `columns`.
       columns : list of str
           Hyperparameter columns combined into the new treatment.
       treatment_name_output : str
           Name of the categorical column that is added.
       order_strategy : str
           Either "rank" or "column" (see above).

       Returns
       -------
       pandas.DataFrame
           Copy of `df` with the extra ordered categorical column; its labels are the
           stringified values of `columns` joined by " - ".

       Raises
       ------
       ValueError
           If `order_strategy` is neither "rank" nor "column".
       """
    def create_categorical_col(df):
        all_str_cols = [df[col].map(str) for col in columns]
        return df.assign(
            **{
                f"{treatment_name_output}": pd.Series(
                    reduce(lambda x, y: x + " - " + y, all_str_cols), dtype="category"
                )
            }
        )

    def calc_ranked_mean(df):
        # calculate the mean of the ranks of all treatment columns
        all_ranks = [df[col].rank() for col in columns]
        return df.assign(ranked_mean=reduce(lambda x, y: x + y, all_ranks) / len(all_ranks))

    def set_cat_orders(df, ordering_from, categorical_col):
        # set the categories order sorting the dataframe by a given 'ordering_from' column
        ordering_from = list(ordering_from) if isinstance(ordering_from, tuple) else ordering_from
        ord_categories = df.sort_values(by=ordering_from)[categorical_col].unique().astype(str)

        return df.assign(
            **{
                f"{categorical_col}": lambda df: df[categorical_col].cat.reorder_categories(
                    ord_categories, ordered=True
                )
            }
        )

    # the categorical treatment is just a concatenation of all the treatments applied

    # Compare strings by value: `is` tests object identity and only happens to work for
    # interned literals, so an equal string built at runtime used to be rejected.
    if order_strategy == "rank":
        orders_col = "ranked_mean"
        return (
            df.pipe(create_categorical_col)
            .pipe(calc_ranked_mean)
            .pipe(set_cat_orders, ordering_from=orders_col, categorical_col=treatment_name_output)
        ).drop(columns=orders_col)
    elif order_strategy == "column":
        orders_col = tuple(columns)
        return df.pipe(create_categorical_col).pipe(
            set_cat_orders, ordering_from=orders_col, categorical_col=treatment_name_output
        )
    else:
        raise ValueError("Invalid ordering strategy! Needs to be 'rank' or 'column'")

Max Depth and Learning Rate¶

# Query selecting the runs where max_depth and learning_rate are set and num_estimators is not.
# NaN never equals itself, so `col == col` keeps the rows where `col` is not NaN and
# `col != col` keeps the rows where it is NaN (presumably: hyperparameter left at its
# default - confirm against how the experiment table is built).
max_depth_lr_pred = "(num_estimators != num_estimators) & (max_depth == max_depth) & (learning_rate == learning_rate)"
# minimize the difference of max_depth then the learning_rate
diff_clause = ("diff_max_depth", "diff_lr")


def max_depth_lr_diff(df):
    """Return a copy of `df` with typed max_depth / learning_rate and their distances to the defaults.

    Adds `diff_lr` = |learning_rate - 0.1| (default lr is 0.1) and
    `diff_max_depth` = |max_depth - 5| (default is max_depth = 5 because n_leaves = 31).
    """
    learning_rate = df.learning_rate.astype(float)
    max_depth = df.max_depth.astype(int)
    return df.assign(
        learning_rate=learning_rate,
        max_depth=max_depth,
        diff_lr=(learning_rate - 0.1).abs(),
        diff_max_depth=(max_depth - 5).abs(),
    )


max_depth_lr_list = grouped_experiment(
    cluster=cluster_1,
    query_string=max_depth_lr_pred,
    name="max_depth_lr",
    diff_hyperparam_fn=max_depth_lr_diff,
    diff_clause=diff_clause,
)

# build the combined "max_depth_lr" treatment column for every result frame;
# learning_rate is rounded to 3 decimals before it becomes part of the label
max_depth_lr_list = [
    create_multiple_treatment_column(
        df=experiment_df.assign(learning_rate=lambda frame: frame.learning_rate.round(3)),
        columns=["max_depth", "learning_rate"],
        treatment_name_output="max_depth_lr",
        order_strategy="rank",
    )
    for experiment_df in max_depth_lr_list
]

(4000, 18)

plot_deltas(max_depth_lr_list, treatment="max_depth_lr", multiple_of=20, rotation=90)

all_cluster_results[curr_cluster]["max_depth_lr"] = perform_individual_sfm_analysis(
    max_depth_lr_list,
    treatment="max_depth_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1, highlight_size=30, multiple_of=20, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_1
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 2.4018849621728907e-32. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 1.1770907100328463e-43. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 6.678448351125646e-41. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=470.54805773830856, pvalue=6.661712194766211e-08)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=1127.244254064324, pvalue=1.0673657558692864e-90)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=1127.9791459804142, pvalue=8.193801894234736e-91)

Max Depth and Num Estimators¶

# Query selecting the runs where max_depth and num_estimators are set and learning_rate is not.
# NaN never equals itself, so `col == col` keeps the rows where `col` is not NaN and
# `col != col` keeps the rows where it is NaN (presumably: hyperparameter left at its
# default - confirm against how the experiment table is built).
max_depth_num_est_pred = "(num_estimators == num_estimators) & (max_depth == max_depth) & (learning_rate != learning_rate)"
# minimize the difference of max_depth then the num_estimators
diff_clause = ("diff_max_depth", "diff_num_est")


def max_depth_num_est_diff(df):
    """Return a copy of `df` with integer max_depth / num_estimators and their distances to the defaults.

    Adds `diff_max_depth` = |max_depth - 5| (default is max_depth = 5 because n_leaves = 31)
    and `diff_num_est` = |num_estimators - 100| (default num_estimators is 100).
    """
    max_depth = df.max_depth.astype(int)
    num_estimators = df.num_estimators.astype(int)
    return df.assign(
        max_depth=max_depth,
        num_estimators=num_estimators,
        diff_max_depth=(max_depth - 5).abs(),
        diff_num_est=(num_estimators - 100).abs(),
    )


max_depth_num_est_list = grouped_experiment(
    cluster=cluster_1,
    query_string=max_depth_num_est_pred,
    name="max_depth_num_est",
    diff_hyperparam_fn=max_depth_num_est_diff,
    diff_clause=diff_clause,
)

# build the combined "max_depth_num_est" treatment column for every result frame
max_depth_num_est_list = [
    create_multiple_treatment_column(
        df=experiment_df,
        columns=["max_depth", "num_estimators"],
        treatment_name_output="max_depth_num_est",
        order_strategy="rank",
    )
    for experiment_df in max_depth_num_est_list
]

(16000, 18)

plot_deltas(max_depth_num_est_list, treatment="max_depth_num_est", multiple_of=50, rotation=90, loc=1, scale=.8)

all_cluster_results[curr_cluster]["max_depth_num_est"] = perform_individual_sfm_analysis(
    max_depth_num_est_list,
    treatment="max_depth_num_est",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=True,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1, highlight_size=30, multiple_of=50, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_1
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

ANOVA test - Metric: auc_evaluator__target

ANOVA test - Metric: brier_score_evaluator__target

ANOVA test - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=307.1018212402173, pvalue=0.9999999934016638)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=202.3588068667664, pvalue=1.0)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=580.0950520045608, pvalue=0.00010256012318249144)

Num Estimators and Learning Rate¶

# Query selecting the runs where num_estimators and learning_rate are set and max_depth is not.
# NaN never equals itself, so `col == col` keeps the rows where `col` is not NaN and
# `col != col` keeps the rows where it is NaN (presumably: hyperparameter left at its
# default - confirm against how the experiment table is built).
num_est_lr_pred = "(num_estimators == num_estimators) & (max_depth != max_depth) & (learning_rate == learning_rate)"
# minimize the difference of num_estimators then the learning_rate
diff_clause = ("diff_num_est", "diff_lr")


def num_est_lr_diff(df):
    """Return a copy of `df` with typed num_estimators / learning_rate and their distances to the defaults.

    Adds `diff_num_est` = |num_estimators - 100| (default num_estimators is 100)
    and `diff_lr` = |learning_rate - 0.1| (default lr is 0.1).
    """
    num_estimators = df.num_estimators.astype(int)
    learning_rate = df.learning_rate.astype(float)
    return df.assign(
        num_estimators=num_estimators,
        learning_rate=learning_rate,
        diff_num_est=(num_estimators - 100).abs(),
        diff_lr=(learning_rate - 0.1).abs(),
    )


num_est_lr_list = grouped_experiment(
    cluster=cluster_1,
    query_string=num_est_lr_pred,
    name="num_est_lr",
    diff_hyperparam_fn=num_est_lr_diff,
    diff_clause=diff_clause,
)

# build the combined "num_est_lr" treatment column for every result frame;
# learning_rate is rounded to 3 decimals before it becomes part of the label
num_est_lr_list = [
    create_multiple_treatment_column(
        df=experiment_df.assign(learning_rate=lambda frame: frame.learning_rate.round(3)),
        columns=["num_estimators", "learning_rate"],
        treatment_name_output="num_est_lr",
        order_strategy="rank",
    )
    for experiment_df in num_est_lr_list
]

(4000, 18)

plot_deltas(num_est_lr_list, treatment="num_est_lr", multiple_of=50, rotation=90)

all_cluster_results[curr_cluster]["num_est_lr"] = perform_individual_sfm_analysis(
    num_est_lr_list,
    treatment="num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=2.5, highlight_size=30, multiple_of=50, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_1
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=309.33390710727934, pvalue=0.2852538604668392)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=545.8637960194865, pvalue=4.590394996634476e-17)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=691.5428909449221, pvalue=1.0885943750452042e-33)

Max Depth, Num Estimators and Learning Rate¶

# Query selecting the runs where all three hyperparameters are set.
# NaN never equals itself, so `col == col` keeps only the rows where `col` is not NaN.
max_depth_num_est_lr_pred = "(num_estimators == num_estimators) & (max_depth == max_depth) & (learning_rate == learning_rate)"

# minimize the difference of max_depth, then num_estimators, then the learning_rate
diff_clause = ("diff_max_depth", "diff_num_est", "diff_lr")


def max_depth_num_est_lr_diff(df):
    """
    Cast the three hyper-parameter columns and add their distance to the defaults
    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``max_depth``, ``num_estimators`` and ``learning_rate`` columns
    Returns
    -------
    pd.DataFrame
        New frame (input is not modified) with the three columns cast to
        int / int / float and the extra columns ``diff_max_depth``,
        ``diff_num_est`` and ``diff_lr`` holding the absolute distances.
    """
    # First normalise the dtypes so the distances are computed on numeric values.
    typed = df.assign(
        max_depth=df.max_depth.astype(int),
        num_estimators=df.num_estimators.astype(int),
        learning_rate=df.learning_rate.astype(float),
    )
    return typed.assign(
        diff_max_depth=(typed.max_depth - 5).abs(),  # default is max_depth = 5 because n_leaves = 31
        diff_num_est=(typed.num_estimators - 100).abs(),  # default num_estimators is 100
        diff_lr=(typed.learning_rate - 0.1).abs(),  # default lr is 0.1
    )


# Three-way experiment (max_depth x num_estimators x learning_rate) on cluster 1.
max_depth_num_est_lr_list = grouped_experiment(
    name="max_depth_num_est_lr",
    cluster=cluster_1,
    query_string=max_depth_num_est_lr_pred,
    diff_hyperparam_fn=max_depth_num_est_lr_diff,
    diff_clause=diff_clause,
)

# Add the combined treatment column to every result; learning_rate is rounded
# to 3 decimals before it is combined.
max_depth_num_est_lr_list = [
    create_multiple_treatment_column(
        df=result.assign(learning_rate=lambda frame: round(frame.learning_rate, 3)),
        columns=["max_depth", "num_estimators", "learning_rate"],
        treatment_name_output="max_depth_num_est_lr",
        order_strategy="rank",
    )
    for result in max_depth_num_est_lr_list
]

(80000, 19)

# Delta plot for the three-way treatment.
plot_deltas(
    max_depth_num_est_lr_list,
    treatment="max_depth_num_est_lr",
    scale=0.7,
    fontsize=15,
    rotation=90,
    multiple_of=300,
)

# Run the selected diagnostics/tests (ANOVA enabled here) and keep the result per cluster.
all_cluster_results[curr_cluster]["max_depth_num_est_lr"] = perform_individual_sfm_analysis(
    max_depth_num_est_lr_list,
    treatment="max_depth_num_est_lr",
    statistical_test_dict=st_tests_1,
    sfm_plot_args={
        "residuals_width": 1.0,
        "highlight_size": 30,
        "multiple_of": 400,
        "fontsize": 15,
        "rotation": 90,
    },
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=True,
    run_kruskal=True,
    run_kruskal_posthoc=False,
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

ANOVA test - Metric: auc_evaluator__target

ANOVA test - Metric: brier_score_evaluator__target

ANOVA test - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=6657.281135550946, pvalue=1.0890241542298173e-10)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=10858.797788216523, pvalue=6.880061494337646e-293)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=13186.28648910838, pvalue=0.0)

# Drop the cluster 1 frame before the next cluster is loaded.
del cluster_1
# Bare expression on purpose: the notebook echoes the number of objects collected.
gc.collect()

8536061

Cluster 2¶

# Per-cluster store handed to the analyses below as ``statistical_test_dict``.
st_tests_2 = defaultdict(dict)

# Load through a context manager so the file handle is closed deterministically
# (a bare ``open`` inside ``pickle.load`` leaves closing to the garbage collector).
# NOTE: unpickling can execute arbitrary code - only load trusted files.
with open("cluster_2.pkl", "rb") as cluster_file:
    cluster_2 = pickle.load(cluster_file)
cluster_2.shape
print("Number of datasets:", len(cluster_2.did.unique()))
curr_cluster = 2

(31740, 13)

Number of datasets: 6

# Dataset ids excluded from cluster 2 before any analysis.
# NOTE(review): the reason for dropping 40517 is not recorded here - document it.
to_remove_2 = [40517]
cluster_2 = cluster_2.query("did not in @to_remove_2")

Individual treatments¶

Num estimators¶

# num_estimators experiment restricted to cluster 2.
num_est_df_list = grouped_experiment(
    name="num_est",
    cluster=cluster_2,
    query_string=individual_num_est_pred,
    diff_hyperparam_fn=individual_num_est_diff,
)

(200, 17)

# Delta plot for num_estimators; this is the only call here that passes save_plot=True.
plot_deltas(
    num_est_df_list,
    treatment="num_estimators",
    save_plot=True,
    multiple_of=3,
)

# Run the selected diagnostics/tests and keep the result under the current cluster.
all_cluster_results[curr_cluster]["num_estimators"] = perform_individual_sfm_analysis(
    num_est_df_list,
    treatment="num_estimators",
    statistical_test_dict=st_tests_2,
    run_residuals=True,
    run_homoscedasticity=True,
    run_kruskal=True,
    run_anova=False,
    run_kruskal_posthoc=False,
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 4.0599200445967654e-08. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 5.942564257566119e-06. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 2.502680399629753e-05. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=4.679600286941011, pvalue=0.9996505615150002)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=6.4674407373969265, pvalue=0.9965571834951797)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=40.48131129638574, pvalue=0.002827827988640876)

Max depth¶

# max_depth experiment restricted to cluster 2.
max_depth_df_list = grouped_experiment(
    name="max_depth",
    cluster=cluster_2,
    diff_hyperparam_fn=individual_max_depth_diff,
    query_string=individual_max_depth_pred,
)

(200, 17)

# Delta plot for max_depth.
plot_deltas(
    max_depth_df_list,
    treatment="max_depth",
    multiple_of=1,
)

# Run the selected diagnostics/tests and keep the result under the current cluster.
all_cluster_results[curr_cluster]["max_depth"] = perform_individual_sfm_analysis(
    max_depth_df_list,
    treatment="max_depth",
    statistical_test_dict=st_tests_2,
    sfm_plot_args={"residuals_width": 0.3, "highlight_size": 400},
    run_residuals=True,
    run_homoscedasticity=True,
    run_kruskal=True,
    run_anova=False,
    run_kruskal_posthoc=False,
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 2.536182097667705e-10. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 3.926060799130937e-06. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 2.5221535906894132e-05. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=33.954404013077685, pvalue=0.01860650187092958)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=33.19283069085734, pvalue=0.02283746743609913)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=34.1495018211372, pvalue=0.0176453054611716)

Learning Rate¶

# learning_rate experiment restricted to cluster 2.
lr_df_list = grouped_experiment(
    name="lr",
    cluster=cluster_2,
    query_string=individual_lr_pred,
    diff_hyperparam_fn=individual_lr_diff,
)

(50, 17)

# Delta plot for learning_rate.
plot_deltas(
    lr_df_list,
    treatment="learning_rate",
    rotation=45,
    multiple_of=1,
)

# Run the selected diagnostics/tests (ANOVA enabled here) and keep the result per cluster.
# NOTE(review): the recorded output of this cell shows RuntimeWarnings in the
# homoscedasticity and ANOVA steps and no Kruskal-Wallis results - confirm the
# cell ran to completion for cluster 2.
all_cluster_results[curr_cluster]["learning_rate"] = perform_individual_sfm_analysis(
    lr_df_list,
    treatment="learning_rate",
    statistical_test_dict=st_tests_2,
    sfm_plot_args={"residuals_width": 0.005, "highlight_size": 200, "multiple_of": 1},
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=True,
    run_kruskal=True,
    run_kruskal_posthoc=False,
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 2.714980462670269e-09. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 9.808684531265044e-09. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 1.2135402405188955e-10. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

/Users/juliano.garcia/miniconda3/envs/boosting/lib/python3.6/site-packages/scipy/stats/morestats.py:2352: RuntimeWarning: invalid value encountered in double_scalars
  W = numer / denom

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

ANOVA test - Metric: auc_evaluator__target

/Users/juliano.garcia/miniconda3/envs/boosting/lib/python3.6/site-packages/pingouin/parametric.py:993: RuntimeWarning: invalid value encountered in double_scalars
  mserror = sserror / ddof2

ANOVA test - Metric: brier_score_evaluator__target

Multiple Treatments¶

Max Depth and Learning Rate¶

# Distance-from-default columns used for the max_depth x learning_rate study.
diff_clause = ("diff_max_depth", "diff_lr")
max_depth_lr_list = grouped_experiment(
    name="max_depth_lr",
    cluster=cluster_2,
    query_string=max_depth_lr_pred,
    diff_hyperparam_fn=max_depth_lr_diff,
    diff_clause=diff_clause,
)

# Add the combined treatment column to every result; learning_rate is rounded
# to 3 decimals before it is combined.
max_depth_lr_list = [
    create_multiple_treatment_column(
        df=result.assign(learning_rate=lambda frame: round(frame.learning_rate, 3)),
        columns=["max_depth", "learning_rate"],
        treatment_name_output="max_depth_lr",
        order_strategy="rank",
    )
    for result in max_depth_lr_list
]

(1000, 18)

# Delta plot for the max_depth x learning_rate treatment.
plot_deltas(
    max_depth_lr_list,
    treatment="max_depth_lr",
    rotation=90,
    multiple_of=10,
)

# Run the selected diagnostics/tests and keep the result under the current cluster.
all_cluster_results[curr_cluster]["max_depth_lr"] = perform_individual_sfm_analysis(
    max_depth_lr_list,
    treatment="max_depth_lr",
    statistical_test_dict=st_tests_2,
    sfm_plot_args={
        "residuals_width": 0.5,
        "highlight_size": 50,
        "multiple_of": 10,
        "fontsize": 15,
        "rotation": 90,
    },
    run_residuals=True,
    run_homoscedasticity=True,
    run_kruskal=True,
    run_anova=False,
    run_kruskal_posthoc=False,
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 1.406702218796685e-20. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 1.3399940012604214e-23. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 1.334959806507663e-23. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=155.85228427854736, pvalue=0.013169797926850337)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=255.09139169126348, pvalue=6.235464053873864e-12)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=226.75563438842394, pvalue=1.005024961982302e-08)

Max Depth and Num Estimators¶

# Distance-from-default columns used for the max_depth x num_estimators study.
diff_clause = ("diff_max_depth", "diff_num_est")
max_depth_num_est_list = grouped_experiment(
    name="max_depth_num_est",
    cluster=cluster_2,
    query_string=max_depth_num_est_pred,
    diff_hyperparam_fn=max_depth_num_est_diff,
    diff_clause=diff_clause,
)

# Add the combined treatment column to every result (no rounding needed: both
# columns are passed through unchanged).
max_depth_num_est_list = [
    create_multiple_treatment_column(
        df=result,
        columns=["max_depth", "num_estimators"],
        treatment_name_output="max_depth_num_est",
        order_strategy="rank",
    )
    for result in max_depth_num_est_list
]

(4000, 18)

# Delta plot for the max_depth x num_estimators treatment.
plot_deltas(
    max_depth_num_est_list,
    treatment="max_depth_num_est",
    rotation=90,
    multiple_of=25,
)

# Run the selected diagnostics/tests and keep the result under the current cluster.
all_cluster_results[curr_cluster]["max_depth_num_est"] = perform_individual_sfm_analysis(
    max_depth_num_est_list,
    treatment="max_depth_num_est",
    statistical_test_dict=st_tests_2,
    sfm_plot_args={
        "residuals_width": 1,
        "highlight_size": 30,
        "multiple_of": 25,
        "fontsize": 15,
        "rotation": 90,
    },
    run_residuals=True,
    run_homoscedasticity=True,
    run_kruskal=True,
    run_anova=False,
    run_kruskal_posthoc=False,
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 2.548461728664323e-22. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 6.49431955042634e-28. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 3.9740927216797714e-21. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=240.12035494493966, pvalue=0.9999999999778888)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=358.61014295798583, pvalue=0.9274815591224476)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=731.9774069460274, pvalue=6.191509692035732e-22)

Num Estimators and Learning Rate¶

# Distance-from-default columns used for the num_estimators x learning_rate study.
diff_clause = ("diff_num_est", "diff_lr")
num_est_lr_list = grouped_experiment(
    name="num_est_lr",
    cluster=cluster_2,
    query_string=num_est_lr_pred,
    diff_hyperparam_fn=num_est_lr_diff,
    diff_clause=diff_clause,
)

# Add the combined treatment column to every result; learning_rate is rounded
# to 3 decimals before it is combined.
num_est_lr_list = [
    create_multiple_treatment_column(
        df=result.assign(learning_rate=lambda frame: round(frame.learning_rate, 3)),
        columns=["num_estimators", "learning_rate"],
        treatment_name_output="num_est_lr",
        order_strategy="rank",
    )
    for result in num_est_lr_list
]

(1000, 18)

# Delta plot for the num_estimators x learning_rate treatment.
plot_deltas(
    num_est_lr_list,
    treatment="num_est_lr",
    rotation=90,
    multiple_of=15,
)

# Run the selected diagnostics/tests and keep the result under the current cluster.
all_cluster_results[curr_cluster]["num_est_lr"] = perform_individual_sfm_analysis(
    num_est_lr_list,
    treatment="num_est_lr",
    statistical_test_dict=st_tests_2,
    sfm_plot_args={
        "residuals_width": 0.7,
        "highlight_size": 50,
        "multiple_of": 15,
        "fontsize": 15,
        "rotation": 90,
    },
    run_residuals=True,
    run_homoscedasticity=True,
    run_kruskal=True,
    run_anova=False,
    run_kruskal_posthoc=False,
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 9.960932013158139e-27. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 2.7090848584434127e-26. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 2.452305074789782e-24. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=179.62919106428052, pvalue=3.3063320135159447e-06)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=204.76897786258806, pvalue=7.035798060397642e-09)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=172.40807068535076, pvalue=1.6503038404775902e-05)

Max Depth, Num Estimators and Learning Rate¶

# Distance-from-default columns used for the three-way study.
diff_clause = ("diff_max_depth", "diff_num_est", "diff_lr")
max_depth_num_est_lr_list = grouped_experiment(
    name="max_depth_num_est_lr",
    cluster=cluster_2,
    query_string=max_depth_num_est_lr_pred,
    diff_hyperparam_fn=max_depth_num_est_lr_diff,
    diff_clause=diff_clause,
)

# Add the combined treatment column to every result; learning_rate is rounded
# to 3 decimals before it is combined.
max_depth_num_est_lr_list = [
    create_multiple_treatment_column(
        df=result.assign(learning_rate=lambda frame: round(frame.learning_rate, 3)),
        columns=["max_depth", "num_estimators", "learning_rate"],
        treatment_name_output="max_depth_num_est_lr",
        order_strategy="rank",
    )
    for result in max_depth_num_est_lr_list
]

(20000, 19)

# Delta plot for the three-way treatment (outliers capped for this cluster).
plot_deltas(
    max_depth_num_est_lr_list,
    treatment="max_depth_num_est_lr",
    cap_outliers=True,
    scale=0.7,
    fontsize=15,
    rotation=90,
    multiple_of=100,
)

# Run the selected diagnostics/tests and keep the result under the current cluster.
all_cluster_results[curr_cluster]["max_depth_num_est_lr"] = perform_individual_sfm_analysis(
    max_depth_num_est_lr_list,
    treatment="max_depth_num_est_lr",
    statistical_test_dict=st_tests_2,
    sfm_plot_args={
        "residuals_width": 1.0,
        "highlight_size": 30,
        "multiple_of": 100,
        "fontsize": 15,
        "rotation": 90,
    },
    run_residuals=True,
    run_homoscedasticity=True,
    run_kruskal=True,
    run_anova=False,
    run_kruskal_posthoc=False,
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=3339.498345738567, pvalue=2.9041723270259444e-64)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=3844.687582875137, pvalue=4.013154847570665e-111)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=3405.4581385816687, pvalue=7.298815542361538e-70)

# Drop the cluster 2 frame before the next cluster is loaded.
del cluster_2
# Bare expression on purpose: the notebook echoes the number of objects collected.
gc.collect()

2437679

Cluster 3¶

# Load through a context manager so the file handle is closed deterministically
# (a bare ``open`` inside ``pickle.load`` leaves closing to the garbage collector).
# NOTE: unpickling can execute arbitrary code - only load trusted files.
with open("cluster_3.pkl", "rb") as cluster_file:
    cluster_3 = pickle.load(cluster_file)
cluster_3.shape
print("Number of datasets:", len(cluster_3.did.unique()))
curr_cluster = 3

(37122, 13)

Number of datasets: 9

# Per-cluster store handed to the cluster 3 analyses as ``statistical_test_dict``.
st_tests_3 = defaultdict(dict)

Individual treatments¶

Num estimators¶

# num_estimators experiment restricted to cluster 3.
num_est_df_list = grouped_experiment(
    name="num_est",
    cluster=cluster_3,
    query_string=individual_num_est_pred,
    diff_hyperparam_fn=individual_num_est_diff,
)

(288, 17)

# Delta plot for num_estimators.
plot_deltas(
    num_est_df_list,
    treatment="num_estimators",
    multiple_of=1,
)

# Run the selected diagnostics/tests and keep the result under the current cluster.
all_cluster_results[curr_cluster]["num_estimators"] = perform_individual_sfm_analysis(
    num_est_df_list,
    treatment="num_estimators",
    statistical_test_dict=st_tests_3,
    run_residuals=True,
    run_homoscedasticity=True,
    run_kruskal=True,
    run_anova=False,
    run_kruskal_posthoc=False,
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 5.611897678025159e-16. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 2.0502217978890513e-10. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 8.273422225091309e-11. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=26.583542474712893, pvalue=0.11473383075040258)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=27.469630833080444, pvalue=0.09418066432195568)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=22.36772110694076, pvalue=0.2663393451829624)

Max depth¶

# max_depth experiment restricted to cluster 3.
max_depth_df_list = grouped_experiment(
    name="max_depth",
    cluster=cluster_3,
    diff_hyperparam_fn=individual_max_depth_diff,
    query_string=individual_max_depth_pred,
)

(284, 17)

# Delta plot for max_depth.
plot_deltas(
    max_depth_df_list,
    treatment="max_depth",
    multiple_of=1,
)

# Run the selected diagnostics/tests and keep the result under the current cluster.
all_cluster_results[curr_cluster]["max_depth"] = perform_individual_sfm_analysis(
    max_depth_df_list,
    treatment="max_depth",
    statistical_test_dict=st_tests_3,
    sfm_plot_args={"residuals_width": 0.3, "highlight_size": 400},
    run_residuals=True,
    run_homoscedasticity=True,
    run_kruskal=True,
    run_anova=False,
    run_kruskal_posthoc=False,
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 3.1393963695336424e-07. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 3.959431705879979e-05. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 0.0006005177274346352. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=54.72922010202657, pvalue=2.5569975651630184e-05)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=59.7876087606679, pvalue=4.180864080550115e-06)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=60.98327620987113, pvalue=2.7021742961013124e-06)

Learning Rate¶

# learning_rate experiment restricted to cluster 3.
lr_df_list = grouped_experiment(
    name="lr",
    cluster=cluster_3,
    query_string=individual_lr_pred,
    diff_hyperparam_fn=individual_lr_diff,
)

(82, 17)

# Delta plot for learning_rate.
plot_deltas(
    lr_df_list,
    treatment="learning_rate",
    rotation=45,
    multiple_of=1,
)

# Run the selected diagnostics/tests (ANOVA enabled here) and keep the result per cluster.
all_cluster_results[curr_cluster]["learning_rate"] = perform_individual_sfm_analysis(
    lr_df_list,
    treatment="learning_rate",
    statistical_test_dict=st_tests_3,
    sfm_plot_args={"residuals_width": 0.005, "highlight_size": 200, "multiple_of": 1},
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=True,
    run_kruskal=True,
    run_kruskal_posthoc=False,
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 0.36764296889305115. CANNOT REJECT
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 0.005871765315532684. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 0.019388489425182343. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

ANOVA test - Metric: auc_evaluator__target

ANOVA test - Metric: brier_score_evaluator__target

ANOVA test - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=32.16855005115718, pvalue=1.767195777354127e-06)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=32.44497030597138, pvalue=1.5515294888437792e-06)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=32.38209555818483, pvalue=1.598157733407212e-06)

Multiple Treatments¶

Max Depth and Learning Rate¶

# Distance-from-default columns used for the max_depth x learning_rate study.
diff_clause = ("diff_max_depth", "diff_lr")
max_depth_lr_list = grouped_experiment(
    name="max_depth_lr",
    cluster=cluster_3,
    query_string=max_depth_lr_pred,
    diff_hyperparam_fn=max_depth_lr_diff,
    diff_clause=diff_clause,
)

# Add the combined treatment column to every result; learning_rate is rounded
# to 3 decimals before it is combined.
max_depth_lr_list = [
    create_multiple_treatment_column(
        df=result.assign(learning_rate=lambda frame: round(frame.learning_rate, 3)),
        columns=["max_depth", "learning_rate"],
        treatment_name_output="max_depth_lr",
        order_strategy="rank",
    )
    for result in max_depth_lr_list
]

(1412, 18)

# Delta plot for the max_depth x learning_rate treatment.
plot_deltas(
    max_depth_lr_list,
    treatment="max_depth_lr",
    rotation=90,
    multiple_of=5,
)

# Run the selected diagnostics/tests and keep the result under the current cluster.
all_cluster_results[curr_cluster]["max_depth_lr"] = perform_individual_sfm_analysis(
    max_depth_lr_list,
    treatment="max_depth_lr",
    statistical_test_dict=st_tests_3,
    sfm_plot_args={
        "residuals_width": 0.5,
        "highlight_size": 50,
        "multiple_of": 5,
        "fontsize": 15,
        "rotation": 90,
    },
    run_residuals=True,
    run_homoscedasticity=True,
    run_kruskal=True,
    run_anova=False,
    run_kruskal_posthoc=False,
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 8.839740869631338e-14. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 5.927691972829052e-07. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 3.8707453908770617e-10. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=287.8379836594564, pvalue=2.5342539943254588e-20)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=459.3187148304025, pvalue=8.655282647472749e-48)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=481.2119106630078, pvalue=1.441530528707626e-51)

Max Depth and Num Estimators¶

# Distance-from-default columns used for the max_depth x num_estimators study.
diff_clause = ("diff_max_depth", "diff_num_est")
max_depth_num_est_list = grouped_experiment(
    name="max_depth_num_est",
    cluster=cluster_3,
    query_string=max_depth_num_est_pred,
    diff_hyperparam_fn=max_depth_num_est_diff,
    diff_clause=diff_clause,
)

# Add the combined treatment column to every result (both columns are passed
# through unchanged).
max_depth_num_est_list = [
    create_multiple_treatment_column(
        df=result,
        columns=["max_depth", "num_estimators"],
        treatment_name_output="max_depth_num_est",
        order_strategy="rank",
    )
    for result in max_depth_num_est_list
]

(5608, 18)

# Delta plot for the max_depth x num_estimators treatment.
plot_deltas(
    max_depth_num_est_list,
    treatment="max_depth_num_est",
    rotation=90,
    multiple_of=10,
)

# Run the selected diagnostics/tests and keep the result under the current cluster.
all_cluster_results[curr_cluster]["max_depth_num_est"] = perform_individual_sfm_analysis(
    max_depth_num_est_list,
    treatment="max_depth_num_est",
    statistical_test_dict=st_tests_3,
    sfm_plot_args={
        "residuals_width": 1,
        "highlight_size": 30,
        "multiple_of": 10,
        "fontsize": 15,
        "rotation": 90,
    },
    run_residuals=True,
    run_homoscedasticity=True,
    run_kruskal=True,
    run_anova=False,
    run_kruskal_posthoc=False,
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 3.996993674716892e-40. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 2.490148176916481e-35. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 2.679124678037066e-35. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=761.3073330765131, pvalue=6.177611536713735e-25)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=731.4999998339297, pvalue=6.91110841192281e-22)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=1328.3887078634368, pvalue=3.001167872734788e-100)

Num Estimators and Learning Rate¶

# Distance-from-default columns used for the num_estimators x learning_rate study.
diff_clause = ("diff_num_est", "diff_lr")
num_est_lr_list = grouped_experiment(
    name="num_est_lr",
    cluster=cluster_3,
    query_string=num_est_lr_pred,
    diff_hyperparam_fn=num_est_lr_diff,
    diff_clause=diff_clause,
)

# Add the combined treatment column to every result; learning_rate is rounded
# to 3 decimals before it is combined.
num_est_lr_list = [
    create_multiple_treatment_column(
        df=result.assign(learning_rate=lambda frame: round(frame.learning_rate, 3)),
        columns=["num_estimators", "learning_rate"],
        treatment_name_output="num_est_lr",
        order_strategy="rank",
    )
    for result in num_est_lr_list
]

(1424, 18)

# Delta plot for the num_estimators x learning_rate treatment.
plot_deltas(
    num_est_lr_list,
    treatment="num_est_lr",
    rotation=90,
    multiple_of=5,
)

# Run the selected diagnostics/tests and keep the result under the current cluster.
# NOTE(review): multiple_of is 5 in the plot above but 2 in sfm_plot_args here;
# most sibling cells use the same value in both places - confirm this is intended.
all_cluster_results[curr_cluster]["num_est_lr"] = perform_individual_sfm_analysis(
    num_est_lr_list,
    treatment="num_est_lr",
    statistical_test_dict=st_tests_3,
    sfm_plot_args={
        "residuals_width": 0.7,
        "highlight_size": 50,
        "multiple_of": 2,
        "fontsize": 15,
        "rotation": 90,
    },
    run_residuals=True,
    run_homoscedasticity=True,
    run_kruskal=True,
    run_anova=False,
    run_kruskal_posthoc=False,
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 8.22391056663302e-28. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 4.965566242157312e-27. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 1.4469581259610162e-34. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=279.4335305709853, pvalue=4.0847323931351827e-19)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=305.4636680654049, pvalue=6.548453243989413e-23)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=317.38372404030315, pvalue=1.0634544914481051e-24)

Max Depth, Num Estimators and Learning Rate¶

# Distance-from-default columns used for the three-way study.
diff_clause = ("diff_max_depth", "diff_num_est", "diff_lr")
max_depth_num_est_lr_list = grouped_experiment(
    name="max_depth_num_est_lr",
    cluster=cluster_3,
    query_string=max_depth_num_est_lr_pred,
    diff_hyperparam_fn=max_depth_num_est_lr_diff,
    diff_clause=diff_clause,
)

# Add the combined treatment column to every result; learning_rate is rounded
# to 3 decimals before it is combined.
max_depth_num_est_lr_list = [
    create_multiple_treatment_column(
        df=result.assign(learning_rate=lambda frame: round(frame.learning_rate, 3)),
        columns=["max_depth", "num_estimators", "learning_rate"],
        treatment_name_output="max_depth_num_est_lr",
        order_strategy="rank",
    )
    for result in max_depth_num_est_lr_list
]

(28024, 19)

# Delta plot for the three-way treatment (outliers capped for this cluster).
plot_deltas(
    max_depth_num_est_lr_list,
    treatment="max_depth_num_est_lr",
    cap_outliers=True,
    scale=0.7,
    fontsize=15,
    rotation=90,
    multiple_of=100,
)

# Run the selected diagnostics/tests and keep the result under the current cluster.
all_cluster_results[curr_cluster]["max_depth_num_est_lr"] = perform_individual_sfm_analysis(
    max_depth_num_est_lr_list,
    treatment="max_depth_num_est_lr",
    statistical_test_dict=st_tests_3,
    sfm_plot_args={
        "residuals_width": 1.0,
        "highlight_size": 30,
        "multiple_of": 100,
        "fontsize": 15,
        "rotation": 90,
    },
    run_residuals=True,
    run_homoscedasticity=True,
    run_kruskal=True,
    run_anova=False,
    run_kruskal_posthoc=False,
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 1.401298464324817e-45. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 1.401298464324817e-45. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=3880.881000478823, pvalue=2.8474863833491625e-123)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=3903.9984289943445, pvalue=1.0169721378366844e-125)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=6768.312744198603, pvalue=0.0)

# Drop the cluster 3 frame and force a garbage collection before the next
# cluster is loaded; the value shown below the cell is gc.collect()'s return.
del cluster_3
gc.collect()

688354

Cluster 4¶

# Load the pickled results for cluster 4. The file is opened in a context
# manager so the handle is closed as soon as the object has been read.
# NOTE: pickle can execute arbitrary code on load; only use trusted local files.
with open("cluster_4.pkl", "rb") as cluster_file:
    cluster_4 = pickle.load(cluster_file)
# Bare expression: displayed because ast_node_interactivity is set to "all".
cluster_4.shape
print("Number of datasets:", len(cluster_4.did.unique()))
# Key under which this cluster's results are stored in all_cluster_results.
curr_cluster = 4

(26450, 13)

Number of datasets: 5

st_tests_4 = defaultdict(dict)

Individual treatments¶

Num estimators¶

num_est_df_list = grouped_experiment(
    cluster=cluster_4, query_string=individual_num_est_pred, name="num_est", diff_hyperparam_fn=individual_num_est_diff
)

(200, 17)

plot_deltas(num_est_df_list, treatment="num_estimators", multiple_of=1)

all_cluster_results[curr_cluster]["num_estimators"] = perform_individual_sfm_analysis(
    num_est_df_list,
    treatment="num_estimators",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    statistical_test_dict=st_tests_4
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 3.616501087315427e-13. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 2.482371946888051e-13. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 1.3367034909506081e-11. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=23.620332212345332, pvalue=0.2111212183222945)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=6.3121786636248896, pvalue=0.9970747844229544)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=8.771339376838563, pvalue=0.9770873121481394)

Max depth¶

max_depth_df_list = grouped_experiment(
    cluster=cluster_4,
    query_string=individual_max_depth_pred,
    name="max_depth",
    diff_hyperparam_fn=individual_max_depth_diff,
)

(200, 17)

plot_deltas(max_depth_df_list, treatment="max_depth", multiple_of=1)

all_cluster_results[curr_cluster]["max_depth"] = perform_individual_sfm_analysis(
    max_depth_df_list,
    treatment="max_depth",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=0.3, highlight_size=400),
    statistical_test_dict=st_tests_4
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 2.5372391687650264e-11. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 1.0192414947596262e-06. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 6.850467970664909e-10. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=8.58273429135731, pvalue=0.9797862535391266)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=6.669608838751563, pvalue=0.9957777576843745)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=3.894313374853578, pvalue=0.9999135832022281)

Learning Rate¶

lr_df_list = grouped_experiment(
    cluster=cluster_4, query_string=individual_lr_pred, name="lr", diff_hyperparam_fn=individual_lr_diff
)

(50, 17)

plot_deltas(lr_df_list, treatment="learning_rate", multiple_of=1, rotation=45)

# Learning-rate treatment: ANOVA is enabled here in addition to Kruskal-Wallis.
all_cluster_results[curr_cluster]["learning_rate"] = perform_individual_sfm_analysis(
    lr_df_list,
    treatment="learning_rate",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=True,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args={"residuals_width": 0.005, "highlight_size": 200, "multiple_of": 1},
    statistical_test_dict=st_tests_4,
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
------ Treatment has no residuals -  Cannot apply Single-factor model -------

Shapiro-Wilk p-value = 1.831652696182573e-07. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 3.6365166433682816e-09. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 3.6365166433682816e-09. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

/Users/juliano.garcia/miniconda3/envs/boosting/lib/python3.6/site-packages/scipy/stats/morestats.py:2352: RuntimeWarning: divide by zero encountered in double_scalars
  W = numer / denom

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

ANOVA test - Metric: auc_evaluator__target

ANOVA test - Metric: brier_score_evaluator__target

ANOVA test - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=0.0, pvalue=1.0)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=2.666666666666665, pvalue=0.10247043485974916)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=2.666666666666665, pvalue=0.10247043485974916)

Multiple Treatments¶

Max Depth and Learning Rate¶

# Grouped experiment for the combined max_depth / learning_rate treatment
# on cluster 4.
diff_clause = ("diff_max_depth", "diff_lr")
max_depth_lr_list = grouped_experiment(
    cluster=cluster_4,
    query_string=max_depth_lr_pred,
    name="max_depth_lr",
    diff_hyperparam_fn=max_depth_lr_diff,
    diff_clause=diff_clause,
)

# Attach the combined treatment column to every result frame; learning_rate
# is rounded to 3 decimals before the column is built.
max_depth_lr_list = [
    create_multiple_treatment_column(
        df=frame.assign(learning_rate=frame.learning_rate.round(3)),
        columns=["max_depth", "learning_rate"],
        treatment_name_output="max_depth_lr",
        order_strategy="rank",
    )
    for frame in max_depth_lr_list
]

(1000, 18)

plot_deltas(max_depth_lr_list, treatment="max_depth_lr", multiple_of=15, rotation=90)

all_cluster_results[curr_cluster]["max_depth_lr"] = perform_individual_sfm_analysis(
    max_depth_lr_list,
    treatment="max_depth_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.5, highlight_size=50, multiple_of=15, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_4
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 9.831066270845763e-33. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 9.176373442839894e-28. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 2.758600865786558e-30. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=90.78502869442515, pvalue=0.7098868380095777)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=212.43190290430337, pvalue=2.899170199132208e-10)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=201.48393260616163, pvalue=5.5482956205515515e-09)

Max Depth and Num Estimators¶

# Grouped experiment for the combined max_depth / num_estimators treatment
# on cluster 4.
diff_clause = ("diff_max_depth", "diff_num_est")
max_depth_num_est_list = grouped_experiment(
    cluster=cluster_4,
    query_string=max_depth_num_est_pred,
    name="max_depth_num_est",
    diff_hyperparam_fn=max_depth_num_est_diff,
    diff_clause=diff_clause,
)

# Attach the combined treatment column to every result frame.
max_depth_num_est_list = [
    create_multiple_treatment_column(
        df=frame,
        columns=["max_depth", "num_estimators"],
        treatment_name_output="max_depth_num_est",
        order_strategy="rank",
    )
    for frame in max_depth_num_est_list
]

(4000, 18)

plot_deltas(max_depth_num_est_list, treatment="max_depth_num_est", multiple_of=10, rotation=90)

all_cluster_results[curr_cluster]["max_depth_num_est"] = perform_individual_sfm_analysis(
    max_depth_num_est_list,
    treatment="max_depth_num_est",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1, highlight_size=30, multiple_of=10, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_4
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 3.372528836164431e-39. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 4.203895392974451e-45. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=339.2970243268268, pvalue=0.986281599326002)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=97.55001096543852, pvalue=1.0)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=154.25820639226663, pvalue=1.0)

Num Estimators and Learning Rate¶

# Grouped experiment for the combined num_estimators / learning_rate
# treatment on cluster 4.
diff_clause = ("diff_num_est", "diff_lr")
num_est_lr_list = grouped_experiment(
    cluster=cluster_4,
    query_string=num_est_lr_pred,
    name="num_est_lr",
    diff_hyperparam_fn=num_est_lr_diff,
    diff_clause=diff_clause,
)

# Attach the combined treatment column to every result frame; learning_rate
# is rounded to 3 decimals before the column is built.
num_est_lr_list = [
    create_multiple_treatment_column(
        df=frame.assign(learning_rate=frame.learning_rate.round(3)),
        columns=["num_estimators", "learning_rate"],
        treatment_name_output="num_est_lr",
        order_strategy="rank",
    )
    for frame in num_est_lr_list
]

(1000, 18)

plot_deltas(num_est_lr_list, treatment="num_est_lr", multiple_of=20, rotation=90)

all_cluster_results[curr_cluster]["num_est_lr"] = perform_individual_sfm_analysis(
    num_est_lr_list,
    treatment="num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.7, highlight_size=50, multiple_of=20, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_4
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 1.4393615174123851e-30. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 1.2179191268417695e-29. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 2.4045491119171254e-26. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=24.00130169163542, pvalue=0.9999999999999996)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=198.99178651367794, pvalue=1.0663779734024478e-08)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=202.30317593403936, pvalue=4.4689893467934435e-09)

Max Depth, Num Estimators and Learning Rate¶

# Grouped experiment for the combined max_depth / num_estimators /
# learning_rate treatment on cluster 4.
diff_clause = ("diff_max_depth", "diff_num_est", "diff_lr")
max_depth_num_est_lr_list = grouped_experiment(
    cluster=cluster_4,
    query_string=max_depth_num_est_lr_pred,
    name="max_depth_num_est_lr",
    diff_hyperparam_fn=max_depth_num_est_lr_diff,
    diff_clause=diff_clause,
)

# Attach the combined treatment column to every result frame; learning_rate
# is rounded to 3 decimals before the column is built.
max_depth_num_est_lr_list = [
    create_multiple_treatment_column(
        df=frame.assign(learning_rate=frame.learning_rate.round(3)),
        columns=["max_depth", "num_estimators", "learning_rate"],
        treatment_name_output="max_depth_num_est_lr",
        order_strategy="rank",
    )
    for frame in max_depth_num_est_lr_list
]

(20000, 19)

plot_deltas(max_depth_num_est_lr_list, treatment="max_depth_num_est_lr", scale=.7, multiple_of=200, cap_outliers=True, rotation=90, fontsize=15)

all_cluster_results[curr_cluster]["max_depth_num_est_lr"] = perform_individual_sfm_analysis(
    max_depth_num_est_lr_list,
    treatment="max_depth_num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1.0, highlight_size=30, multiple_of=200, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_4
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=520.7314796215171, pvalue=1.0)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=4002.1631696522404, pvalue=2.812654964692408e-136)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=4155.8783421483795, pvalue=2.5010470783739023e-153)

# Drop the cluster 4 frame and force a garbage collection before the next
# cluster is loaded; the value shown below the cell is gc.collect()'s return.
del cluster_4
gc.collect()

1729047

Cluster 5¶

# Load the pickled results for cluster 5. The file is opened in a context
# manager so the handle is closed as soon as the object has been read.
# NOTE: pickle can execute arbitrary code on load; only use trusted local files.
with open("cluster_5.pkl", "rb") as cluster_file:
    cluster_5 = pickle.load(cluster_file)
# Bare expression: displayed because ast_node_interactivity is set to "all".
cluster_5.shape
print("Number of datasets:", len(cluster_5.did.unique()))
# Key under which this cluster's results are stored in all_cluster_results.
curr_cluster = 5

(63480, 13)

Number of datasets: 12

st_tests_5 = defaultdict(dict)

Individual treatments¶

Num estimators¶

num_est_df_list = grouped_experiment(
    cluster=cluster_5, query_string=individual_num_est_pred, name="num_est", diff_hyperparam_fn=individual_num_est_diff
)

(480, 17)

plot_deltas(num_est_df_list, treatment="num_estimators", multiple_of=10, rotation=90)

all_cluster_results[curr_cluster]["num_estimators"] = perform_individual_sfm_analysis(
    num_est_df_list,
    treatment="num_estimators",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    statistical_test_dict=st_tests_5
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 2.727655883279223e-13. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 2.3163783391400017e-11. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 5.203109609830392e-10. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=23.034257944485756, pvalue=0.813848923944241)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=7.55816240299047, pvalue=0.9999895813836994)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=5.545052419855335, pvalue=0.9999997464184317)

Max depth¶

max_depth_df_list = grouped_experiment(
    cluster=cluster_5,
    query_string=individual_max_depth_pred,
    name="max_depth",
    diff_hyperparam_fn=individual_max_depth_diff,
)

(480, 17)

plot_deltas(max_depth_df_list, treatment="max_depth", multiple_of=1)

all_cluster_results[curr_cluster]["max_depth"] = perform_individual_sfm_analysis(
    max_depth_df_list,
    treatment="max_depth",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=0.3, highlight_size=400),
    statistical_test_dict=st_tests_5
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 1.0566748492011704e-27. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 5.6355290918896e-28. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 1.1003471403235982e-27. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=21.125166655182287, pvalue=0.3299261830545254)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=50.18929982045118, pvalue=0.00012291164120722053)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=35.32664373843109, pvalue=0.01275361758982555)

Learning Rate¶

lr_df_list = grouped_experiment(
    cluster=cluster_5, query_string=individual_lr_pred, name="lr", diff_hyperparam_fn=individual_lr_diff
)

(120, 17)

plot_deltas(lr_df_list, treatment="learning_rate", multiple_of=5, rotation=45)

# Learning-rate treatment: ANOVA is enabled here in addition to Kruskal-Wallis.
all_cluster_results[curr_cluster]["learning_rate"] = perform_individual_sfm_analysis(
    lr_df_list,
    treatment="learning_rate",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=True,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args={"residuals_width": 0.003, "highlight_size": 200, "multiple_of": 5},
    statistical_test_dict=st_tests_5,
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
------ Treatment has no residuals -  Cannot apply Single-factor model -------

Shapiro-Wilk p-value = 6.518039688714616e-13. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 3.448694644745884e-14. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 1.1609338707377348e-13. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

ANOVA test - Metric: auc_evaluator__target

ANOVA test - Metric: brier_score_evaluator__target

ANOVA test - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=12.001207729468597, pvalue=0.017342286509862016)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=16.94166666666667, pvalue=0.0019840472937602757)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=16.76893939393939, pvalue=0.0021432893857816653)

Multiple Treatments¶

Max Depth and Learning Rate¶

# Grouped experiment for the combined max_depth / learning_rate treatment
# on cluster 5.
diff_clause = ("diff_max_depth", "diff_lr")
max_depth_lr_list = grouped_experiment(
    cluster=cluster_5,
    query_string=max_depth_lr_pred,
    name="max_depth_lr",
    diff_hyperparam_fn=max_depth_lr_diff,
    diff_clause=diff_clause,
)

# Attach the combined treatment column to every result frame; learning_rate
# is rounded to 3 decimals before the column is built.
max_depth_lr_list = [
    create_multiple_treatment_column(
        df=frame.assign(learning_rate=frame.learning_rate.round(3)),
        columns=["max_depth", "learning_rate"],
        treatment_name_output="max_depth_lr",
        order_strategy="rank",
    )
    for frame in max_depth_lr_list
]

(2400, 18)

plot_deltas(max_depth_lr_list, treatment="max_depth_lr", multiple_of=50, rotation=90)

all_cluster_results[curr_cluster]["max_depth_lr"] = perform_individual_sfm_analysis(
    max_depth_lr_list,
    treatment="max_depth_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.5, highlight_size=50, multiple_of=50, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_5
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 2.3715745229534633e-30. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 1.0799779238582079e-39. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 3.393346335573849e-36. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=563.1634405126533, pvalue=3.2814490033108474e-28)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=359.70394069600377, pvalue=7.005511102826335e-07)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=357.0345473723235, pvalue=1.1157110518754798e-06)

Max Depth and Num Estimators¶

# Grouped experiment for the combined max_depth / num_estimators treatment
# on cluster 5.
diff_clause = ("diff_max_depth", "diff_num_est")
max_depth_num_est_list = grouped_experiment(
    cluster=cluster_5,
    query_string=max_depth_num_est_pred,
    name="max_depth_num_est",
    diff_hyperparam_fn=max_depth_num_est_diff,
    diff_clause=diff_clause,
)

# Attach the combined treatment column to every result frame.
max_depth_num_est_list = [
    create_multiple_treatment_column(
        df=frame,
        columns=["max_depth", "num_estimators"],
        treatment_name_output="max_depth_num_est",
        order_strategy="rank",
    )
    for frame in max_depth_num_est_list
]

(9600, 18)

plot_deltas(max_depth_num_est_list, treatment="max_depth_num_est", multiple_of=25, rotation=90)

all_cluster_results[curr_cluster]["max_depth_num_est"] = perform_individual_sfm_analysis(
    max_depth_num_est_list,
    treatment="max_depth_num_est",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1, highlight_size=30, multiple_of=25, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_5
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 2.802596928649634e-45. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 4.049752561898721e-43. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=308.1723963216093, pvalue=1.0)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=270.7711345687177, pvalue=1.0)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=291.8148922312212, pvalue=1.0)

Num Estimators and Learning Rate¶

# Grouped experiment for the combined num_estimators / learning_rate
# treatment on cluster 5.
diff_clause = ("diff_num_est", "diff_lr")
num_est_lr_list = grouped_experiment(
    cluster=cluster_5,
    query_string=num_est_lr_pred,
    name="num_est_lr",
    diff_hyperparam_fn=num_est_lr_diff,
    diff_clause=diff_clause,
)

# Attach the combined treatment column to every result frame; learning_rate
# is rounded to 3 decimals before the column is built.
num_est_lr_list = [
    create_multiple_treatment_column(
        df=frame.assign(learning_rate=frame.learning_rate.round(3)),
        columns=["num_estimators", "learning_rate"],
        treatment_name_output="num_est_lr",
        order_strategy="rank",
    )
    for frame in num_est_lr_list
]

(2400, 18)

plot_deltas(num_est_lr_list, treatment="num_est_lr", multiple_of=50, rotation=90)

all_cluster_results[curr_cluster]["num_est_lr"] = perform_individual_sfm_analysis(
    num_est_lr_list,
    treatment="num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.7, highlight_size=50, multiple_of=50, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_5
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 1.401298464324817e-45. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=335.8833313447127, pvalue=1.6632132589091608e-08)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=409.0136802114733, pvalue=7.634533258312381e-16)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=405.9054563728135, pvalue=1.6836721419703596e-15)

Max Depth, Num Estimators and Learning Rate¶

# Grouped experiment for the combined max_depth / num_estimators /
# learning_rate treatment on cluster 5.
diff_clause = ("diff_max_depth", "diff_num_est", "diff_lr")
max_depth_num_est_lr_list = grouped_experiment(
    cluster=cluster_5,
    query_string=max_depth_num_est_lr_pred,
    name="max_depth_num_est_lr",
    diff_hyperparam_fn=max_depth_num_est_lr_diff,
    diff_clause=diff_clause,
)

# Attach the combined treatment column to every result frame; learning_rate
# is rounded to 3 decimals before the column is built.
max_depth_num_est_lr_list = [
    create_multiple_treatment_column(
        df=frame.assign(learning_rate=frame.learning_rate.round(3)),
        columns=["max_depth", "num_estimators", "learning_rate"],
        treatment_name_output="max_depth_num_est_lr",
        order_strategy="rank",
    )
    for frame in max_depth_num_est_lr_list
]

(48000, 19)

plot_deltas(max_depth_num_est_lr_list, treatment="max_depth_num_est_lr", scale=.7, multiple_of=400, cap_outliers=True, rotation=90, fontsize=15)

# Run the single-factor-model analysis for the three-way treatment and store
# the result under the current cluster.
# NOTE(review): sfm_plot_args uses multiple_of=450 while the plot_deltas call
# for the same treatment just above uses multiple_of=400; the equivalent cells
# for clusters 3 and 4 pass matching values. Confirm the mismatch is intended.
all_cluster_results[curr_cluster]["max_depth_num_est_lr"] = perform_individual_sfm_analysis(
    max_depth_num_est_lr_list,
    treatment="max_depth_num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1.0, highlight_size=30, multiple_of=450, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_5
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=6230.126585556639, pvalue=1.2520961615352599e-92)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=7971.974493586129, pvalue=1.0950526833361547e-251)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=8169.19716833423, pvalue=8.787394265494155e-273)

# Drop the cluster 5 frame and force a garbage collection before the next
# cluster is loaded; the value shown below the cell is gc.collect()'s return.
del cluster_5
gc.collect()

6168473

Cluster 6¶

# Load the pickled results for cluster 6. The file is opened in a context
# manager so the handle is closed as soon as the object has been read.
# NOTE: pickle can execute arbitrary code on load; only use trusted local files.
with open("cluster_6.pkl", "rb") as cluster_file:
    cluster_6 = pickle.load(cluster_file)
# Bare expression: displayed because ast_node_interactivity is set to "all".
cluster_6.shape
print("Number of datasets:", len(cluster_6.did.unique()))
# Key under which this cluster's results are stored in all_cluster_results.
curr_cluster = 6

(79350, 13)

Number of datasets: 15

# Exclude dataset id 1069 from cluster 6 before running the analyses below.
# The dataset count printed by the loading cell was taken before this filter.
# NOTE(review): the reason for dropping this dataset is not recorded here —
# confirm and document it.
to_remove_6 = [1069]
cluster_6 = cluster_6.query("did not in @to_remove_6")

st_tests_6 = defaultdict(dict)

Individual treatments¶

Num estimators¶

# Group the cluster 6 experiments for the num_estimators treatment.
# NOTE(review): individual_num_est_pred and individual_num_est_diff are defined outside this section; presumably
# the query predicate and the hyperparameter-diff function for this treatment. Confirm against their definitions.
num_est_df_list = grouped_experiment(
    cluster=cluster_6, query_string=individual_num_est_pred, name="num_est", diff_hyperparam_fn=individual_num_est_diff
)

(560, 17)

# Plot the deltas for the num_estimators treatment.
plot_deltas(num_est_df_list, treatment="num_estimators", multiple_of=3)

# Single-factor-model analysis for num_estimators, stored under the current cluster.
# Residual, homoscedasticity and Kruskal-Wallis steps are enabled; ANOVA and the Kruskal post-hoc step are disabled.
all_cluster_results[curr_cluster]["num_estimators"] = perform_individual_sfm_analysis(
    num_est_df_list,
    treatment="num_estimators",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    statistical_test_dict=st_tests_6
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 5.555299465885219e-15. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
------ Treatment has no residuals -  Cannot apply Single-factor model -------

Shapiro-Wilk p-value = 4.446019561624853e-13. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 5.699214175271987e-15. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=50.85164651209711, pvalue=9.810169022173583e-05)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=68.8504365366667, pvalue=1.426687289385142e-07)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=124.9472351169389, pvalue=1.3096694700133999e-17)

Max depth¶

# Group the cluster 6 experiments for the max_depth treatment.
# NOTE(review): individual_max_depth_pred and individual_max_depth_diff are defined outside this section; presumably
# the query predicate and the hyperparameter-diff function for this treatment. Confirm against their definitions.
max_depth_df_list = grouped_experiment(
    cluster=cluster_6,
    query_string=individual_max_depth_pred,
    name="max_depth",
    diff_hyperparam_fn=individual_max_depth_diff,
)

(560, 17)

# Plot the deltas for the max_depth treatment.
plot_deltas(max_depth_df_list, treatment="max_depth", multiple_of=1)

# Single-factor-model analysis for max_depth, stored under the current cluster.
# Residual, homoscedasticity and Kruskal-Wallis steps are enabled; ANOVA and the Kruskal post-hoc step are disabled.
all_cluster_results[curr_cluster]["max_depth"] = perform_individual_sfm_analysis(
    max_depth_df_list,
    treatment="max_depth",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=0.3, highlight_size=400),
    statistical_test_dict=st_tests_6
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 8.323186449279008e-18. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 8.587675795781614e-18. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 8.382107814712916e-16. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=23.225483683108557, pvalue=0.22755618937135969)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=31.308793668543096, pvalue=0.03733077394442036)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=24.609169025374687, pvalue=0.1738129806878052)

Learning Rate¶

# Group the cluster 6 experiments for the learning_rate treatment.
# NOTE(review): individual_lr_pred and individual_lr_diff are defined outside this section; presumably the query
# predicate and the hyperparameter-diff function for this treatment. Confirm against their definitions.
lr_df_list = grouped_experiment(
    cluster=cluster_6, query_string=individual_lr_pred, name="lr", diff_hyperparam_fn=individual_lr_diff
)

(140, 17)

# Plot the deltas for the learning_rate treatment.
plot_deltas(lr_df_list, treatment="learning_rate", multiple_of=5, rotation=45)

# Single-factor-model analysis for learning_rate, stored under the current cluster.
# Unlike the other treatments in this section, ANOVA is enabled here in addition to Kruskal-Wallis;
# only the Kruskal post-hoc step is disabled.
all_cluster_results[curr_cluster]["learning_rate"] = perform_individual_sfm_analysis(
    lr_df_list,
    treatment="learning_rate",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=True,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.005, highlight_size=200, multiple_of=1),
    statistical_test_dict=st_tests_6

)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 8.634466590619656e-17. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
------ Treatment has no residuals -  Cannot apply Single-factor model -------

Shapiro-Wilk p-value = 1.0. CANNOT REJECT
Residuals probplot: logloss_evaluator__target

/Users/juliano.garcia/miniconda3/envs/boosting/lib/python3.6/site-packages/scipy/stats/morestats.py:1657: UserWarning: Input data for shapiro has range zero. The results may not be accurate.
  warnings.warn("Input data for shapiro has range zero. The results "

Shapiro-Wilk p-value = 5.394156741626009e-16. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

ANOVA test - Metric: auc_evaluator__target

ANOVA test - Metric: brier_score_evaluator__target

ANOVA test - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
--- Not enough Samples in treatment group ---

Kruskal-wallis test - Metric: brier_score_evaluator__target
--- Not enough Samples in treatment group ---

Kruskal-wallis test - Metric: logloss_evaluator__target
--- Not enough Samples in treatment group ---

Multiple Treatments¶

Max Depth and Learning Rate¶

# Cluster 6 experiments in which max_depth and learning_rate are treated together.
diff_clause = ("diff_max_depth", "diff_lr")
max_depth_lr_list = grouped_experiment(
    cluster=cluster_6,
    query_string=max_depth_lr_pred,
    name="max_depth_lr",
    diff_hyperparam_fn=max_depth_lr_diff,
    diff_clause=diff_clause,
)

# Add the combined "max_depth_lr" treatment column to every result frame;
# learning_rate is rounded to three decimals before the columns are combined.
max_depth_lr_list = [
    create_multiple_treatment_column(
        df=frame.assign(learning_rate=lambda inner: round(inner.learning_rate, 3)),
        columns=["max_depth", "learning_rate"],
        treatment_name_output="max_depth_lr",
        order_strategy="rank",
    )
    for frame in max_depth_lr_list
]

(2800, 18)

# Plot the deltas for the combined max_depth / learning_rate treatment.
plot_deltas(max_depth_lr_list, treatment="max_depth_lr", multiple_of=20, rotation=90)

# Single-factor-model analysis for max_depth_lr, stored under the current cluster.
# Residual, homoscedasticity and Kruskal-Wallis steps are enabled; ANOVA and the Kruskal post-hoc step are disabled.
all_cluster_results[curr_cluster]["max_depth_lr"] = perform_individual_sfm_analysis(
    max_depth_lr_list,
    treatment="max_depth_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.5, highlight_size=50, multiple_of=15, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_6
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 8.691273465281813e-41. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 7.470919066461402e-39. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 1.5974802493302915e-43. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=343.5288011629896, pvalue=0.0003351862674642278)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=444.4501908648193, pvalue=6.020948338601525e-12)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=480.2402467089576, pvalue=1.947958201110195e-15)

Max Depth and Num Estimators¶

# Cluster 6 experiments in which max_depth and num_estimators are treated together.
diff_clause = ("diff_max_depth", "diff_num_est")
max_depth_num_est_list = grouped_experiment(
    cluster=cluster_6,
    query_string=max_depth_num_est_pred,
    name="max_depth_num_est",
    diff_hyperparam_fn=max_depth_num_est_diff,
    diff_clause=diff_clause,
)

# Add the combined "max_depth_num_est" treatment column to every result frame.
max_depth_num_est_list = [
    create_multiple_treatment_column(
        df=frame,
        columns=["max_depth", "num_estimators"],
        treatment_name_output="max_depth_num_est",
        order_strategy="rank",
    )
    for frame in max_depth_num_est_list
]

(11200, 18)

# Plot the deltas for the combined max_depth / num_estimators treatment.
plot_deltas(max_depth_num_est_list, treatment="max_depth_num_est", multiple_of=20, rotation=90)

# Single-factor-model analysis for max_depth_num_est, stored under the current cluster.
# Residual, homoscedasticity and Kruskal-Wallis steps are enabled; ANOVA and the Kruskal post-hoc step are disabled.
all_cluster_results[curr_cluster]["max_depth_num_est"] = perform_individual_sfm_analysis(
    max_depth_num_est_list,
    treatment="max_depth_num_est",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1, highlight_size=30, multiple_of=20, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_6
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 6.459985920537407e-43. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=308.4987603835679, pvalue=0.9997299289361279)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=979.7819367341262, pvalue=1.0145057100151009e-50)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=1925.6932756193842, pvalue=5.38352391645615e-198)

Num Estimators and Learning Rate¶

# Cluster 6 experiments in which num_estimators and learning_rate are treated together.
diff_clause = ("diff_num_est", "diff_lr")
num_est_lr_list = grouped_experiment(
    cluster=cluster_6,
    query_string=num_est_lr_pred,
    name="num_est_lr",
    diff_hyperparam_fn=num_est_lr_diff,
    diff_clause=diff_clause,
)

# Add the combined "num_est_lr" treatment column to every result frame;
# learning_rate is rounded to three decimals before the columns are combined.
num_est_lr_list = [
    create_multiple_treatment_column(
        df=frame.assign(learning_rate=lambda inner: round(inner.learning_rate, 3)),
        columns=["num_estimators", "learning_rate"],
        treatment_name_output="num_est_lr",
        order_strategy="rank",
    )
    for frame in num_est_lr_list
]

(2800, 18)

# Plot the deltas for the combined num_estimators / learning_rate treatment.
plot_deltas(num_est_lr_list, treatment="num_est_lr", multiple_of=15, rotation=90)

# Single-factor-model analysis for num_est_lr, stored under the current cluster.
# Residual, homoscedasticity and Kruskal-Wallis steps are enabled; ANOVA and the Kruskal post-hoc step are disabled.
all_cluster_results[curr_cluster]["num_est_lr"] = perform_individual_sfm_analysis(
    num_est_lr_list,
    treatment="num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.7, highlight_size=50, multiple_of=15, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_6
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 1.401298464324817e-45. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 1.9165559096570523e-41. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=305.84677365628073, pvalue=0.0025754266802920523)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=312.90593258396194, pvalue=0.00107064557981089)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=354.1822196959263, pvalue=2.237852042493134e-06)

Max Depth, Num Estimators and Learning Rate¶

# Cluster 6 experiments in which max_depth, num_estimators and learning_rate are treated together.
diff_clause = ("diff_max_depth", "diff_num_est", "diff_lr")
max_depth_num_est_lr_list = grouped_experiment(
    cluster=cluster_6,
    query_string=max_depth_num_est_lr_pred,
    name="max_depth_num_est_lr",
    diff_hyperparam_fn=max_depth_num_est_lr_diff,
    diff_clause=diff_clause,
)

# Add the combined "max_depth_num_est_lr" treatment column to every result frame;
# learning_rate is rounded to three decimals before the columns are combined.
max_depth_num_est_lr_list = [
    create_multiple_treatment_column(
        df=frame.assign(learning_rate=lambda inner: round(inner.learning_rate, 3)),
        columns=["max_depth", "num_estimators", "learning_rate"],
        treatment_name_output="max_depth_num_est_lr",
        order_strategy="rank",
    )
    for frame in max_depth_num_est_lr_list
]

(56000, 19)

# Plot the deltas for the combined max_depth / num_estimators / learning_rate treatment.
plot_deltas(max_depth_num_est_lr_list, treatment="max_depth_num_est_lr", scale=.7, multiple_of=400, cap_outliers=True, rotation=90, fontsize=15)

# Single-factor-model analysis for max_depth_num_est_lr, stored under the current cluster.
# Residual, homoscedasticity and Kruskal-Wallis steps are enabled; ANOVA and the Kruskal post-hoc step are disabled.
all_cluster_results[curr_cluster]["max_depth_num_est_lr"] = perform_individual_sfm_analysis(
    max_depth_num_est_lr_list,
    treatment="max_depth_num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1.0, highlight_size=30, multiple_of=400, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_6
)

Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target

Shapiro-Wilk p-value = 0.0. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target

Homoscedasticity - Metric: brier_score_evaluator__target

Homoscedasticity - Metric: logloss_evaluator__target

Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=7851.294316177966, pvalue=2.705704100964987e-150)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=4966.176739551213, pvalue=0.06798990455329582)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=7007.331698082554, pvalue=6.851410919346193e-86)

# Drop the cluster 6 frame now that all of its analyses are stored, then force a garbage collection.
# gc.collect() is left as a bare expression, so the notebook displays the value it returns.
del cluster_6
gc.collect()

4991574

Significance proportion¶

# One-off dump of the in-memory results, kept commented out so a re-run does not overwrite the saved file:
# open("all_cluster_results.pkl", "wb").write(cloudpickle.dumps(all_cluster_results))

# Reload the saved results; this replaces whatever all_cluster_results held in memory.
# The context manager closes the file handle as soon as unpickling finishes.
# NOTE(review): unpickling can execute arbitrary code; only load files produced by this notebook.
with open("all_cluster_results.pkl", "rb") as results_file:
    all_cluster_results = cloudpickle.load(results_file)
# Bare expression on purpose: the notebook displays the cluster ids present in the file.
all_cluster_results.keys()

dict_keys([1, 2, 3, 4, 5, 6])

### mapping dicts
# Short display label for every treatment key stored in all_cluster_results.
hyperparam_combination_map = dict(
    num_estimators="NE",
    max_depth="MD",
    learning_rate="LR",
    max_depth_lr="MD, LR",
    num_est_lr="LR, NE",
    max_depth_num_est="MD, NE",
    max_depth_num_est_lr="NE, MD, LR",
)

# Short display label for every performance metric.
perf_metric_map = dict(auc="AUC", brier="Brier", logloss="Logloss")

### ordering dicts
# Rank of each treatment: single hyperparameters first, then pairs, then the triple.
hyperparam_combination_order = dict(
    num_estimators=1,
    max_depth=2,
    learning_rate=3,
    max_depth_lr=4,
    max_depth_num_est=5,
    num_est_lr=6,
    max_depth_num_est_lr=7,
)

# Rank of each metric. Every metric needs a distinct rank: logloss previously shared
# rank 2 with brier, which left their relative order undefined.
perf_metric_order = dict(auc=1, brier=2, logloss=3)

Hyperparameters Significance Proportion¶

def created_filtered_hyperparam_dict(clusters_results_d, key="num_estimators"):
    """
    Keep, for every cluster, only the results of a single treatment
    Parameters
    ----------
    clusters_results_d : dict
        Mapping from cluster id to a dict of per-treatment results
    key : str
        Treatment whose results should be kept

    Returns
    -------
    dict
        Mapping from cluster id to ``{key: results}``. Clusters that have no
        entry for ``key``, or whose entry equals ``[{}, {}, {}]``, are left out.
    """
    # Three empty dicts: presumably one per metric with nothing recorded for any of them (TODO confirm).
    empty_result = [{}, {}, {}]

    results = {}
    for cluster, c_dict in clusters_results_d.items():
        # A cluster analysed without this treatment is skipped instead of raising KeyError.
        if key not in c_dict:
            continue
        treatment_result = c_dict[key]
        if treatment_result != empty_result:
            results[cluster] = {key: treatment_result}
    return results

def count_significance(filt_hp_dict, key="num_estimators", metric=None, normalize_by=1):
    """
    Count how often each metric name appears in the results of one treatment
    Parameters
    ----------
    filt_hp_dict : dict
        Mapping from cluster id to ``{key: [result_dict, ...]}``, as built by
        created_filtered_hyperparam_dict
    key : str
        Treatment whose result dicts are counted
    metric : dict, optional
        Existing counts to keep accumulating into; it is updated in place.
        Any mapping works, not only a defaultdict. A new one is created when omitted.
    normalize_by : int or float
        Divisor applied to every count before it is turned into a percentage

    Returns
    -------
    dict
        Mapping from metric name to ``count / normalize_by * 100``. Result dicts
        without a "metric" entry are counted under the name "NaN".
    """
    if metric is None:
        metric = {}

    for hp_d in filt_hp_dict.values():
        for d in hp_d[key]:
            name = d.get("metric", "NaN")
            # .get keeps this working for plain-dict accumulators, which `+= 1` would reject on a new name.
            metric[name] = metric.get(name, 0) + 1

    return {name: (count / normalize_by) * 100 for name, count in metric.items()}
            
# Single-treatment version of the loop below, kept for reference:
# filt_ne_dict = created_filtered_hyperparam_dict(all_cluster_results, key="num_estimators")
# Each num_est analysis has all three metrics. Dividing the count of each metric by six (the number of clusters)
# gives the percentage of clusters in which that metric was considered significant according to Kruskal-Wallis.
# count_significance(filt_ne_dict, key="num_estimators", normalize_by=6)

# Report the proportions for every treatment found in cluster 1, ordered by the number of underscores in the
# treatment name. The count_significance call is left as a bare expression; the notebook output shows that its
# result is displayed after each header line.
for key in sorted(all_cluster_results[1].keys(), key=lambda x: x.count("_")):
    print(f"-------- {key} by metric proportion ---------")
    count_significance(
        created_filtered_hyperparam_dict(all_cluster_results, key=key),
        key=key, normalize_by=6
    )

-------- num_estimators by metric proportion ---------

{'NaN': 66.66666666666666,
 'logloss_evaluator__target': 50.0,
 'auc_evaluator__target': 16.666666666666664,
 'brier_score_evaluator__target': 16.666666666666664}

-------- max_depth by metric proportion ---------

{'NaN': 66.66666666666666,
 'brier_score_evaluator__target': 83.33333333333334,
 'logloss_evaluator__target': 66.66666666666666,
 'auc_evaluator__target': 33.33333333333333}

-------- learning_rate by metric proportion ---------

{'auc_evaluator__target': 50.0,
 'brier_score_evaluator__target': 50.0,
 'logloss_evaluator__target': 50.0}

-------- max_depth_lr by metric proportion ---------

{'auc_evaluator__target': 83.33333333333334,
 'brier_score_evaluator__target': 100.0,
 'logloss_evaluator__target': 100.0,
 'NaN': 16.666666666666664}

-------- num_est_lr by metric proportion ---------

{'NaN': 33.33333333333333,
 'brier_score_evaluator__target': 100.0,
 'logloss_evaluator__target': 100.0,
 'auc_evaluator__target': 66.66666666666666}

	num_rows	num_features	mean_variance	mean_skewness	num_categorical	sum_cardinality_over_categorical	categorical_ratio	numeric_ratio	boolean_ratio	constant_ratio
did
3	3196.0	37.0	0.187118	0.000000	37.0	1.947368	1.000000	0.000000	0.000000	0.0
31	1000.0	21.0	0.487300	0.920379	14.0	3.733333	0.666667	0.285714	0.047619	0.0
44	4601.0	58.0	0.222751	11.186639	0.0	0.000000	0.000000	0.982759	0.017241	0.0
72	1000000.0	37.0	0.191597	0.000000	37.0	1.947368	1.000000	0.000000	0.000000	0.0
73	1000000.0	17.0	0.301100	0.000000	17.0	2.500000	1.000000	0.000000	0.000000	0.0

	Source	SS	DF	MS	F	p-unc	np2
0	max_depth_num_est	0.033	2179	0.0	2.232	2.9517e-125	0.455
1	Within	0.039	5820	0.0	-	-	-

	Source	SS	DF	MS	F	p-unc	np2
0	max_depth_num_est	0.147	2179	0.0	8.318	0	0.757
1	Within	0.047	5820	0.0	-	-	-

	Source	SS	DF	MS	F	p-unc	np2
0	max_depth_num_est	11.851	2179	0.005	2.631	1.94873e-183	0.496
1	Within	12.031	5820	0.002	-	-	-

	Source	SS	DF	MS	F	p-unc	np2
0	max_depth_num_est_lr	0.709	25059	0.0	1.847	0	0.756
1	Within	0.229	14940	0.0	-	-	-

	learning_rate	seed	nthread	verbose	training_time	auc_evaluator__target	logloss_evaluator__target	brier_score_evaluator__target	type	num_estimators	max_depth	num_leaves
0	0.033333	42	32	-1	4.085 s	0.980014	0.293893	0.071260	train	NaN	NaN	NaN
1	0.033333	42	32	-1	4.085 s	0.980117	0.293825	0.071240	test	NaN	NaN	NaN
2	0.050000	42	32	-1	3.966 s	0.980424	0.228336	0.053087	train	NaN	NaN	NaN
3	0.050000	42	32	-1	3.966 s	0.980523	0.228265	0.053073	test	NaN	NaN	NaN
4	0.100000	42	32	-1	4.455 s	0.985209	0.157624	0.040455	train	NaN	NaN	NaN

	Source	SS	DF	MS	F	p-unc	np2
0	max_depth_num_est_lr	3.111	25059	0.0	5.444	0	0.901
1	Within	0.341	14940	0.0	-	-	-

	Source	SS	DF	MS	F	p-unc	np2
0	max_depth_num_est_lr	130.746	25059	0.005	6.081	0	0.911
1	Within	12.818	14940	0.001	-	-	-

	Source	SS	DF	MS	F	p-unc	np2
0	learning_rate	0.002	4	0.0	9.136	3.35471e-05	0.504
1	Within	0.002	36	0.0	-	-	-

	Source	SS	DF	MS	F	p-unc	np2
0	learning_rate	0.004	4	0.001	21.993	2.96896e-09	0.71
1	Within	0.001	36	0.000	-	-	-

	Source	SS	DF	MS	F	p-unc	np2
0	learning_rate	0.059	4	0.015	16.643	8.28258e-08	0.649
1	Within	0.032	36	0.001	-	-	-

	Source	SS	DF	MS	F	p-unc	np2
0	learning_rate	0.001	22	0.0	27.531	0.0356127	0.997
1	Within	0.000	2	0.0	-	-	-

	Source	SS	DF	MS	F	p-unc	np2
0	learning_rate	0.061	22	0.003	541.364	0.00184533	1
1	Within	0.000	2	0.000	-	-	-

	Source	SS	DF	MS	F	p-unc	np2
0	learning_rate	0.591	22	0.027	191.857	0.00519743	1
1	Within	0.000	2	0.000	-	-	-

	Source	SS	DF	MS	F	p-unc	np2
0	learning_rate	0.020	44	0.0	5.1	0.00066276	0.937
1	Within	0.001	15	0.0	-	-	-

	Source	SS	DF	MS	F	p-unc	np2
0	learning_rate	0.248	44	0.006	6.018	0.000244743	0.946
1	Within	0.014	15	0.001	-	-	-