Imports

In [23]:
from IPython.core.interactiveshell import InteractiveShell #display full output instead of just the last one
InteractiveShell.ast_node_interactivity = "all"
In [24]:
import toolz as fp
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import cloudpickle
import gc
sns.set()
%matplotlib inline
pd.set_option('display.max_rows', 50)
palette = sns.hls_palette(8, h=.7, s=.9, l=0.5)
sns.palplot(palette[::-1])
sns.set_palette(palette[::-1])

gray = "444444"

plt.rcParams['figure.facecolor'] = '1'  # background color
plt.rcParams['axes.facecolor'] = '1'  # plot background color

plt.rcParams['grid.color'] = '0.8'
plt.rcParams['grid.alpha'] = .4

plt.rcParams['axes.edgecolor'] = '1'

plt.rcParams['lines.linewidth'] = 1
plt.rcParams['grid.linestyle'] = '-'

plt.rcParams['axes.axisbelow'] = True
plt.rcParams['axes.labelcolor'] = gray

plt.rcParams['text.color'] = gray

plt.rcParams['xtick.color'] = gray
plt.rcParams['ytick.color'] = gray
sns.set_style("whitegrid")
sns.set_context("notebook")
%config InlineBackend.figure_format = "retina"
In [3]:
import pickle
from pathlib import Path

class dotdict(dict):
    """dot.notation access to dictionary attributes"""

    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__
    
@fp.curry
def open_pickled_object(file, full_path):
    """
    Open a pickled object by specifying the full_path and then the filename
    Parameters
    ----------
    file : str
        String representing the filename
    full_path : int
        String representing the path to the file
    """
    return pickle.load(open(full_path + "/" + file, "rb"))
In [4]:
def open_experiment(did, model_type="BINARY"):
    """
    Open the experiment files from a single dataset by providing the ID
    Parameters
    ----------
    did : int
        the dataset id
    model_type : str
        String representing the model type. Currently only BINARY is supported
    """
    base_path = "../"+model_type
    full_path = base_path + "/" + str(did)
    open_object = open_pickled_object(full_path=full_path)
    if Path(base_path).is_dir() and Path(full_path).is_dir():
        return dotdict(
            dict(
                analyzer_info=open_object("analyzer_info.pkl"),
                df_stats=open_object("df_stats.pkl"),
                final_result=open_object("final_result.pkl"),
                hp_tree=open_object("hp_tree.pkl"),
                openml_object=open_object("openml_object.pkl"),
                shape=open_object("shape.pkl"),
            )
        )
    return {}
In [5]:
def get_all_available_experiments(model_type="BINARY"):
    """
    Return a list of the datasets ID's available by reading the folder
    Parameters
    ----------
    model_type : str
        String representing the model type. Currently only BINARY is supported
    """
    base_path = "../" + model_type
    path = Path(base_path)
    return [int(x.name) for x in path.iterdir() if x.is_dir()]


def new_axis(figsize=(15, 8), title=None):
    """
    Creates a new matplotlib axis and returns it
    Parameters
    ----------
    figsize : Tuple(Numeric, Numeric)
        the figure size
    title : str
        the figure title
    """
    _, ax = plt.subplots(figsize=figsize)
    if title is not None:
        ax.set_title(title, size=17)
    return ax
In [6]:
def build_experiment_stats(did_list, model_type="BINARY"):
    """
    Return the meta-statistics from the provided datasets ids
    Parameters
    ----------
    did_list : List[int]
        list of dataset ids
    model_type : str
        String representing the model type. Currently only BINARY is supported
    """
    base_path = "../" + model_type
    if not Path(base_path).is_dir():
        raise ValueError("Base directory doesn't exist")
    stats_df = pd.DataFrame()
    open_stats = open_pickled_object(file="df_stats.pkl")
    for did in did_list:
        full_path = base_path + "/" + str(did)
        full_path_posix = Path(base_path + "/" + str(did))
        if full_path_posix.is_dir():
            stats_df = pd.concat(
                (stats_df,
                 open_stats(full_path=full_path).T.assign(did=did)
                 )
            )
    return stats_df

Open an individual experiment

A single experiment containts all the results of changing the hyperparameters, along with the meta-statistics for a given dataset. The original dataset and information is stored in the openml_object

In [8]:
experiment = open_experiment(did=72)
experiment.keys()
Out[8]:
dict_keys(['analyzer_info', 'df_stats', 'final_result', 'hp_tree', 'openml_object', 'shape'])
In [9]:
experiment.openml_object.name
Out[9]:
'BNG(kr-vs-kp)'

Important way to select the experiments:

If a given factor is NaN in the dataframe, it means that in that specific experiment run, that parameter was't changed!

I'm analyzing three parameters: max_depth, num_estimators and learning_rate. So if want to get the dataframe of the experiments where the only factor analyzed is the max_depth for example, I get all rows where max_depth is not NaN but num_estimators and learning_rate are.

In [7]:
def build_result_dataframe(log_list):
    """
    Given a log with all experiment results of a given dataset, this function returns the dataset concatenated
    Parameters
    ----------
    log_list : List[dict]
        a list of experiment logs
    """
    all_experiments = []
    for base_result in log_list:
        train_key, test_key = "train_result", "test_result"
        base_df = pd.DataFrame(fp.dissoc(base_result, train_key, test_key), index=range(2))
        metrics_df = pd.concat((pd.DataFrame(base_result[train_key], index=[0]).assign(type="train"),
                                pd.DataFrame(base_result[test_key], index=[0]).assign(type="test"))).reset_index(drop=True)
        all_experiments.append(pd.concat((base_df, metrics_df), axis=1))
    return pd.concat((exp for exp in all_experiments), sort=False, ignore_index=True)
In [11]:
did_3_results = build_result_dataframe(experiment.final_result)
did_3_results.shape
did_3_results.head()
Out[11]:
(5290, 12)
Out[11]:
learning_rate seed nthread verbose training_time auc_evaluator__target logloss_evaluator__target brier_score_evaluator__target type num_estimators max_depth num_leaves
0 0.033333 42 32 -1 4.085 s 0.980014 0.293893 0.071260 train NaN NaN NaN
1 0.033333 42 32 -1 4.085 s 0.980117 0.293825 0.071240 test NaN NaN NaN
2 0.050000 42 32 -1 3.966 s 0.980424 0.228336 0.053087 train NaN NaN NaN
3 0.050000 42 32 -1 3.966 s 0.980523 0.228265 0.053073 test NaN NaN NaN
4 0.100000 42 32 -1 4.455 s 0.985209 0.157624 0.040455 train NaN NaN NaN

Analyze impact of hyperparameters

We can look at each dataset to check the results of different parameters.

Isolated Impact

TIP: Using Panda's .query() method, we can use col == col to filter when col isn't NaN, and col != col to filter when col IS NaN.

Num_estimators

In [489]:
_, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 10))
axes = axes.ravel()
res_num_est = did_3_results.query(
    "(num_estimators == num_estimators) & (max_depth != max_depth) & (learning_rate != learning_rate)"
).assign(num_estimators=lambda df: df.num_estimators.astype(int))
for ax_index, metric in enumerate(
    ("auc_evaluator__target", "brier_score_evaluator__target", "logloss_evaluator__target")
):
    sns.pointplot(x="num_estimators", y=metric, hue="type", data=res_num_est, ax=axes[ax_index])
axes[0].set_title("AUC")
axes[1].set_title("Brier Score")
axes[2].set_title("Logloss")
# ax.set_title("Num_estimators impact on AUC", size=17);
plt.tight_layout();

max_depth

In [474]:
_, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 5))
axes = axes.ravel()
res_max_depth = did_3_results.query("(num_estimators != num_estimators) & (max_depth == max_depth) & (learning_rate != learning_rate)").assign(max_depth=lambda df: df.max_depth.astype(int))
for ax_index, metric in enumerate(("auc_evaluator__target", "brier_score_evaluator__target", "logloss_evaluator__target")):
    sns.pointplot(
        x="max_depth",
        y=metric,
        hue="type",
        data=res_max_depth,
        ax=axes[ax_index]
    )
axes[0].set_title("AUC")
axes[1].set_title("Brier Score")
axes[2].set_title("Logloss")
plt.axis('off')
plt.tight_layout();

learning_rate

In [67]:
_, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 5))
axes = axes.ravel()
res_lr = did_3_results.query("(num_estimators != num_estimators) & (max_depth != max_depth) & (learning_rate == learning_rate)")
for ax_index, metric in enumerate(("auc_evaluator__target", "brier_score_evaluator__target", "logloss_evaluator__target")):
    sns.pointplot(
        x="learning_rate",
        y=metric,
        hue="type",
        data=res_lr,
        ax=axes[ax_index]
    )
axes[0].set_title("AUC")
axes[1].set_title("Brier Score")
axes[2].set_title("Logloss")
plt.axis('off')
plt.tight_layout();

Joint Impact

There are multiple ways to analyze the joint impact of this. In this case I'm showing the full joint distribution, i.e. varing every possible hyperparameter

In [91]:
kk = (
        did_3_results.query(
            "(num_estimators == num_estimators) & (max_depth == max_depth) & (learning_rate == learning_rate) & (type == 'test')"
        ).assign(
            num_estimators=lambda df: df.num_estimators.astype("int"),
            max_depth=lambda df: df.max_depth.astype("int"),
            learning_rate=lambda df: df.learning_rate.round(4),
        ).query("not(learning_rate == 0.3 & max_depth == 11)")
    )
In [22]:
g = sns.catplot(
    x="num_estimators",
    y="auc_evaluator__target",
    col="learning_rate",
    hue="max_depth",
    kind="point",
    data=did_3_results.query(
        "(num_estimators == num_estimators) & (max_depth == max_depth) & (learning_rate == learning_rate) & (type == 'test')"
    ).assign(
        num_estimators=lambda df: df.num_estimators.astype("int"),
        max_depth=lambda df: df.max_depth.astype("int"),
        learning_rate=lambda df: df.learning_rate.round(4),
    ),
    col_wrap=2,
    height=3.8,
    aspect=2,
    sharey=False,
    markers="_",
)
g.fig.canvas.draw()
for ax in g.axes:
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize=18)
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=18)
plt.gcf().suptitle("All Hyperparameters", size=17, y=1.02);
# plt.tight_layout();
In [24]:
g = sns.catplot(
    x="num_estimators",
    y="auc_evaluator__target",
    col="type",
    hue="max_depth",
    kind="point",
    data=(
        did_3_results.query(
            "(num_estimators == num_estimators) & (max_depth == max_depth) & (learning_rate != learning_rate)"
        ).assign(
            num_estimators=lambda df: df.num_estimators.astype("int"),
            max_depth=lambda df: df.max_depth.astype("int")
        )
    ),
    col_wrap=1,
    height=3.8,
    aspect=2,
    sharey=False,
    markers="_",
)
g.fig.canvas.draw()
for ax in g.axes:
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=16)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize=16)
plt.gcf().suptitle("AUC - num_estimators x max_depth", size=17, y=1.02);
In [28]:
g = sns.catplot(x="learning_rate", y="auc_evaluator__target", col="type", hue="max_depth", kind="point",
            data=(did_3_results.query("(num_estimators != num_estimators) & (max_depth == max_depth) & (learning_rate == learning_rate)")
                  .assign(max_depth=lambda df: df.max_depth.astype("int"), learning_rate=lambda df: df.learning_rate.round(4))),
            col_wrap=1, height=3.8, aspect=2, sharey=False, markers="_")
g.fig.canvas.draw()
for ax in g.axes:
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=16)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize=16)
plt.gcf().suptitle("AUC - learning_rate x max_depth", size=17, y=1.02);

All experiments: meta-statistics analysis

The meta-statistics are statistics calculated directly from information about the structure of the dataset and the features. They're not related to a machine learning model, but only to a static dataset. Specifically, they're calculated for each feature of the dataset, and encompass things like variable type, skewness of the data, cardinality of categorical features, etc.

In [8]:
all_dids = get_all_available_experiments()
print(f"Number of available datasets: {len(all_dids)}")
experiment_statistics = build_experiment_stats(all_dids);
Number of available datasets: 70
/Users/juliano.garcia/miniconda3/envs/boosting/lib/python3.6/site-packages/ipykernel_launcher.py:22: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

There's a high number of Categorical features in OPENML datasets. Probably due to the bioinfo datasets which contains a lot of categorical data

In [9]:
fig, ax = plt.subplots(figsize=(17, 10))
ax.set_title("Feature type distribution", fontsize=17)
experiment_statistics.groupby("var_type").count().top.rename("count").sort_values().plot(kind="barh", ax=ax)
ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=17);

Most of the categorical features are binary

In [10]:
cardinality = experiment_statistics[["cardinality"]].query("cardinality != '--'").dropna().astype(int)
fig, ax = plt.subplots(figsize=(17, 10))
ax.set_title("Cardinality count of categorical features", fontsize=17)
cardinality.assign(
    cardinality=lambda df: df.cardinality.clip(upper=np.percentile(df.cardinality, 95))
).cardinality.value_counts().sort_index().plot(kind="bar", ax=ax);
fig.canvas.draw()  # draw the figure before updating the xlabels
ax.set_xticklabels(ax.get_xticklabels(), rotation=0, fontsize=17);

Most of the numerical features have 0 skewness, but a lower percentage have higher skewness

In [11]:
skewness = experiment_statistics[["skewness"]].query("skewness != '--'").dropna().astype(float)
fig, ax = plt.subplots(figsize=(17, 10))
ax.set_title("Skewness distribution")
skewness.assign(skewness=lambda df: df.skewness.clip(upper=np.percentile(df.skewness, 95))).plot.hist(
    ax=ax,
);
fig.canvas.draw()  # draw the figure before updating the xlabels
ax.set_xticklabels(ax.get_xticklabels(), rotation=0, fontsize=17);

Aggregated meta-statistics by dataset

To represent a dataset I need to aggregate the feature-wise meta-statistics for each dataset into "aggregated meta-statistics". This way we can represent a dataset as a point in this high dimensional space, and compare points (datasets) together. The aggregated features are calculated on the did_agg function, and are basically ratio of feature types, number of features, mean of skewness, etc

OBS: variance is defined as: 1 - (v / float(df.shape[0])), where v is the number of rows the most common value of a given feature appears. 0 is usually bad.

This is not the typical statistical variance, but rather a measure of how much diversity of values a categorical feature has in the dataset

In [12]:
def did_agg(did_df):
    d = {}
    did_skewness = did_df.query("skewness != '--'").skewness.dropna().astype(float)
    d["num_rows"] = did_df["count"].iloc[0]
    d["num_features"] = did_df.T.columns.shape[0]
    d["mean_skewness"] = 0.0 if did_skewness.empty else did_skewness.mean()
    d["mean_variance"] = did_df.query("variance != '--'").variance.mean()
    d["num_categorical"] = did_df["var_type"].isin(["Categorical"]).sum()
    d["sum_cardinality_over_categorical"] = did_df.query(
        "cardinality == cardinality & cardinality != '--'"
    ).cardinality.astype(int).sum() / (d["num_categorical"] + 1)
    # feature type count
    d["categorical_ratio"] = d["num_categorical"] / d["num_features"]    
    d["numeric_ratio"] = did_df["var_type"].isin(["Numeric"]).sum() / d["num_features"]    
    d["boolean_ratio"] = did_df["var_type"].isin(["Boolean"]).sum() / d["num_features"]    
    d["constant_ratio"] = did_df["var_type"].isin(["Constant"]).sum() / d["num_features"]    

    return pd.Series(
        d,
        index=[
            "num_rows",
            "num_features",
            "mean_variance",
            "mean_skewness",
            "num_categorical",
            "sum_cardinality_over_categorical",
            "categorical_ratio",
            "numeric_ratio",
            "boolean_ratio",
            "constant_ratio",
        ],
    )

I'll cap the aggregated columns for visualizations purposes.

Example of capper - by the 95th percentile

In [13]:
np.percentile(pd.Series(np.arange(100)), 95)
Out[13]:
94.05

Scatter matrix plot

Checking for interactions between the aggregated meta-statistics

In [14]:
aggregated_stats = experiment_statistics.groupby("did").apply(did_agg)
capped_agg_stats = aggregated_stats.assign(
    **{f"{col}": lambda df, i=col: df[i].clip(upper=np.percentile(df[i], 90)) for col in aggregated_stats.columns}
)

pd.plotting.scatter_matrix(
    capped_agg_stats, ax=new_axis(title="Scatter matrix for aggregated statistics", figsize=(30, 17))
);
/Users/juliano.garcia/miniconda3/envs/boosting/lib/python3.6/site-packages/ipykernel_launcher.py:7: UserWarning: To output multiple subplots, the figure containing the passed axes is being cleared
  import sys
In [15]:
aggregated_stats.shape
aggregated_stats.head()
Out[15]:
(70, 10)
Out[15]:
num_rows num_features mean_variance mean_skewness num_categorical sum_cardinality_over_categorical categorical_ratio numeric_ratio boolean_ratio constant_ratio
did
3 3196.0 37.0 0.187118 0.000000 37.0 1.947368 1.000000 0.000000 0.000000 0.0
31 1000.0 21.0 0.487300 0.920379 14.0 3.733333 0.666667 0.285714 0.047619 0.0
44 4601.0 58.0 0.222751 11.186639 0.0 0.000000 0.000000 0.982759 0.017241 0.0
72 1000000.0 37.0 0.191597 0.000000 37.0 1.947368 1.000000 0.000000 0.000000 0.0
73 1000000.0 17.0 0.301100 0.000000 17.0 2.500000 1.000000 0.000000 0.000000 0.0

Visualizing the datasets in lower dimensions

In [16]:
from sklearn.preprocessing import StandardScaler
scaled_agg_stats = StandardScaler().fit_transform(capped_agg_stats)
scaled_agg_stats.shape
Out[16]:
(70, 10)
In [17]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, verbose=1, random_state=42)
tsne_results = tsne.fit_transform(scaled_agg_stats)
[t-SNE] Computing 69 nearest neighbors...
[t-SNE] Indexed 70 samples in 0.000s...
[t-SNE] Computed neighbors for 70 samples in 0.001s...
[t-SNE] Computed conditional probabilities for sample 70 / 70
[t-SNE] Mean sigma: 2.354606
[t-SNE] KL divergence after 250 iterations with early exaggeration: 50.328487
[t-SNE] KL divergence after 1000 iterations: 0.258376

t-SNE projections of the dataset

In [18]:
np.random.seed(42)
df_subset = pd.DataFrame()
df_subset["tsne-one"] = tsne_results[:, 0]
df_subset["tsne-two"] = tsne_results[:, 1]
sns.scatterplot(x="tsne-one", y="tsne-two", data=df_subset, ax=new_axis(), s=1000);

Simple clustering

To analyze all experiments, first I need to cluster them together into categories that make sense; The initial idea was to cluster them together by hand, manually selecting clusters that have similar meta-statistics distributions; However, this proved to be somewhat difficult when analyzing all meta-statistics together, so I decided to use a clustering approach and check if the selected clusters made sense.

IMPORTANT:

The number of points (datasets) isn't that big, so I can actually check the dendogram and see if the clusters are well behaved.

I used complete linkage and euclidean metric, using the standardized aggregated statistics, as these were the ones that gave the best clustering structure.

The clustering method, linkage and metrics chosen here can highly impact the analysis I did after. The idea is not to have a generic analysis that can work with any dataset, but to actually measure impact of the hyperparameters into specific clusters that follow a somewhat similar distribution of the aggregated statistics

In [58]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
avg_linkage = linkage(scaled_agg_stats, method='centroid', metric='euclidean', optimal_ordering=False)
plt.figure(figsize=(20, 10))
plt.title('Hierarchical Clustering Dendrogram - Euclidean, complete linkage', fontsize=18)
plt.xlabel('Dataset ID')
plt.ylabel('Distance')
dendrogram(avg_linkage, labels=aggregated_stats.index.values, leaf_rotation=90,leaf_font_size=12.0);
In [19]:
from sklearn.cluster import KMeans
kmeans_learner = KMeans(n_clusters=6, n_init=50, random_state=42)
In [20]:
kmeans_clusters = kmeans_learner.fit_predict(scaled_agg_stats)
In [21]:
centroid_projections = TSNE(n_components=2, perplexity=27, verbose=1, random_state=42).fit_transform(np.vstack([scaled_agg_stats, kmeans_learner.cluster_centers_]))
[t-SNE] Computing 75 nearest neighbors...
[t-SNE] Indexed 76 samples in 0.000s...
[t-SNE] Computed neighbors for 76 samples in 0.002s...
[t-SNE] Computed conditional probabilities for sample 76 / 76
[t-SNE] Mean sigma: 2.138719
[t-SNE] KL divergence after 250 iterations with early exaggeration: 53.366253
[t-SNE] KL divergence after 1000 iterations: 0.216709
In [25]:
# calculated_clusters = fcluster(avg_linkage, t=4, criterion="distance")
calculated_clusters = np.concatenate(
    [kmeans_clusters + 1, np.array(["centroid"] * kmeans_learner.cluster_centers_.shape[0])]
)
In [26]:
df_centroids = pd.DataFrame()
df_centroids["tsne-one"] = centroid_projections[:, 0]
df_centroids["tsne-two"] = centroid_projections[:, 1]
df_centroids["kmeans"] = calculated_clusters
df_centroids["type"] = np.where(df_centroids.kmeans == "centroid", "centroid", "dataset")
ax = new_axis(title="t-SNE projection - Clusters and centroids")
sns.scatterplot(
    x="tsne-one",
    y="tsne-two",
    hue="kmeans",
    style="type",
    data=df_centroids,
    palette=sns.color_palette("hls", len(df_centroids.kmeans.unique())),
    ax=ax,
    s=1000
)
Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x123bed9e8>
In [181]:
nclusters = len(np.unique(kmeans_clusters))
print(nclusters)
aggregated_stats["cluster"] = kmeans_clusters + 1
capped_agg_stats["cluster"] = kmeans_clusters + 1
6

Getting a sense of each cluster

Check if the distribution of aggregated meta-statistics over each cluster is different. This is more than a sanity check: It's a way to understand what are the main differences and similarities between the clusters!

In [183]:
import joypy
from matplotlib.colors import ListedColormap


def plot_cluster_joyplot(aggregated_df, column):
    cluster_cmap = ListedColormap(sns.color_palette("hls", nclusters).as_hex())
    df = aggregated_df.assign(**{f"{column}": lambda df: df[column] + np.random.random() * 1e-10})
    # We don't want to plot a distribution when there's only one distinct value
    to_remove = (
        aggregated_df.groupby("cluster")[column]
        .apply(lambda x: len(x.unique()))
        .reset_index()
        .query(f"{column} <= 1")
        .cluster.values
    )
    fig, axes = joypy.joyplot(
        df.query("cluster not in @to_remove"),
        by="cluster",
        column=column,
        figsize=(10, 10),
        colormap=cluster_cmap,
        ylim="own",
        overlap=0,
    )
    fig.suptitle(f"Distribution of {column} by cluster", size=18, y=1.05)
In [184]:
aggregated_stats.cluster.values
Out[184]:
array([2, 5, 6, 3, 3, 3, 3, 3, 3, 3, 6, 6, 1, 5, 2, 6, 5, 1, 1, 1, 1, 1,
       1, 1, 6, 5, 5, 1, 1, 1, 6, 2, 5, 6, 6, 6, 1, 5, 1, 6, 5, 1, 5, 1,
       6, 5, 1, 2, 1, 1, 3, 3, 2, 5, 1, 4, 6, 5, 4, 1, 1, 1, 2, 6, 4, 4,
       4, 6, 1, 6], dtype=int32)
In [185]:
aggregated_stats.cluster.value_counts().sort_index()
Out[185]:
1    23
2     6
3     9
4     5
5    12
6    15
Name: cluster, dtype: int64

When checking the distribution of the engineered variables the clusters seem to separate datasets in a good manner

In [186]:
for metric in aggregated_stats.columns[:-1]:
    plot_cluster_joyplot(capped_agg_stats, column=metric)
/Users/juliano.garcia/miniconda3/envs/boosting/lib/python3.6/site-packages/matplotlib/figure.py:98: MatplotlibDeprecationWarning: 
Adding an axes using the same arguments as a previous axes currently reuses the earlier instance.  In a future version, a new instance will always be created and returned.  Meanwhile, this warning can be suppressed, and the future behavior ensured, by passing a unique label to each axes instance.
  "Adding an axes using the same arguments as a previous axes "

Cluster specific Analysis

Each cluster now has similar datasets, according to the aggregated meta-statistics chosen. The idea is then to analyze, in each cluster, the impact (if there's an impact) of the hyperparameters in the AUC. For this reasons, I'm calculating the delta_auc and other metrics (brier and logloss) from the baseline. The baseline metric for a given dataset and hyperparameter is the metric we would obtain when training a model using LightGBM default parameters (or the closest one to the default parameters).

In [9]:
def calculate_cluster_object(cluster_id, agg_stat_df):
    cluster_dids = agg_stat_df.query(f"cluster == {cluster_id}").index.values
    print(cluster_dids.shape)
    cluster_exps = {}
    for did in cluster_dids:
        cluster_exps[did] = build_result_dataframe(open_experiment(did=did).final_result).assign(did=did)
    print(cluster_exps.keys())
    return pd.concat(cluster_exps.values())
In [10]:
import cloudpickle
import gc
from collections import defaultdict
# for cluster_id in range(1, 7):
#     print(f"\n------- Building cluster {cluster_id} -------\n")
#     curr_cluster = calculate_cluster_object(cluster_id=cluster_id, agg_stat_df=aggregated_stats)
#     open(f"cluster_{cluster_id}.pkl", "wb").write(cloudpickle.dumps(curr_cluster))
#     del curr_cluster
#     gc.collect()

Dataset names for each cluster

In [187]:
for cluster_id in range(1, 7):
    cluster_dids = aggregated_stats.query(f"cluster == {cluster_id}").index.values
    print(f"------- {cluster_id} -------")
    for did in cluster_dids:
        print(f"\t• {open_experiment(did=did).openml_object.name}")

gc.collect()
------- 1 -------
	• visualizing_soil
	• mfeat-morphological
	• analcatdata_halloffame
	• mfeat-fourier
	• JapaneseVowels
	• letter
	• mfeat-factors
	• waveform-5000
	• mfeat-zernike
	• pendigits
	• mfeat-karhunen
	• musk
	• MagicTelescope
	• hill-valley
	• creditcard
	• CreditCardSubset
	• higgs
	• numerai28.6
	• churn
	• Satellite
	• Speech
	• Run_or_walk_information
	• USPS
------- 2 -------
	• kr-vs-kp
	• splice
	• mfeat-pixel
	• PhishingWebsites
	• 20_newsgroups.drift
	• Internet-Advertisements
------- 3 -------
	• BNG(kr-vs-kp)
	• BNG(labor,nominal,1000000)
	• BNG(breast-cancer,nominal,1000000)
	• BNG(mushroom)
	• BNG(colic.ORIG,nominal,1000000)
	• BNG(credit-a,nominal,1000000)
	• BNG(credit-g,nominal,1000000)
	• BNG(credit-g)
	• BNG(spambase)
------- 4 -------
	• solar-flare
	• dis
	• jungle_chess_2pcs_endgame_elephant_elephant
	• jungle_chess_2pcs_endgame_rat_rat
	• jungle_chess_2pcs_endgame_lion_lion
------- 5 -------
	• credit-g
	• socmob
	• nursery
	• cmc
	• car
	• ada_prior
	• adult-census
	• bank-marketing
	• adult
	• Amazon_employee_access
	• mofn-3-7-10
	• parity5_plus_5
------- 6 -------
	• spambase
	• cpu_small
	• elevators
	• segment
	• optdigits
	• page-blocks
	• kc1
	• pc1
	• pc2
	• autoUniv-au1-1000
	• Bioresponse
	• Titanic
	• wilt
	• Sick_numeric
	• telco-customer-churn
Out[187]:
0

Toggle figure saving

In [11]:
SAVE_PLOTS_GLOBAL = True

General utility functions

In [12]:
def update_pointplot_xlabels(multiple_of=5, rotation=0, fontsize=20, xticks=True):
    """update the xlabels and xticks of the last axis passed (using plt.gca).
        with this function, it can plot xticks/labels with different periods, using the
        multiple_of parameter
    """
    new_labels, new_xticks = [], []
    if xticks:
        for i, label in enumerate(plt.gca().get_xticklabels()):
            if i % multiple_of == 0:
                new_labels.append(label)
        for i, tick in enumerate(plt.gca().get_xticks()):
            if i % multiple_of == 0:
                new_xticks.append(tick)
        plt.gca().set_xticks(new_xticks)
    else:
        for i, label in enumerate(plt.gca().get_xticklabels()):
            new_labels.append(label if i % multiple_of == 0 else "")
    plt.gca().set_xticklabels(new_labels, rotation=rotation, fontsize=fontsize)
In [13]:
# Plot the deltas calculated from a given analysis
def plot_deltas(df_list, treatment, y="delta_metric", scale=1.0, cap_outliers=False, save_plot=False, loc=0, **kwargs):
    for df in df_list:
        metric = df.metric.unique()[0]
        if df[treatment].dtype == float:
            df[treatment] = df[treatment].round(5)
        temp_df = df.query(test_pred)
        if cap_outliers:
            temp_df = df.assign(
                **{f"{y}": lambda df: df[y].clip(lower=np.percentile(df[y], 5), upper=np.percentile(df[y], 95))}
            )
        sns.pointplot(
            x=treatment,
            y=y,
            data=temp_df,
            hue="did",
            linestyles="",
            ci=None,
            ax=new_axis(title=f"Delta {metric} (NEW - BASELINE) from LightGBM baseline - Test set", figsize=(22, 10)),
            scale=scale,
        )
        update_pointplot_xlabels(**kwargs)
        plt.legend(loc=loc)
        if save_plot or SAVE_PLOTS_GLOBAL:
            plt.savefig(
                "delta_" + metric[: metric.find("_")] + "_cluster" + str(curr_cluster) + "_" + treatment + ".png",
                bbox_inches="tight",
                pad_inches=0,
            )
        plt.show()
In [14]:
train_pred = "type == 'train'"
test_pred = "type == 'test'"

@fp.curry
def calculate_delta_baseline(did_df, metric="auc_evaluator__target", diff_clause=None):
    diff_clause = ["diff_num_est"] if diff_clause is None else diff_clause
    if isinstance(diff_clause, tuple):
        diff_clause = list(diff_clause)
    """For a given dataframe, calculate the delta of the metric related to the baseline metric,
       which is the metric where diff_column is closer to 0.
       The dataframe is considered as a group from groupby, where each dataframe contains experiment for a given
       dataset id"""
    return did_df.assign(
        baseline_metric=lambda df: df[metric].loc[df.query(test_pred).sort_values(by=diff_clause).index[0]],
        delta_metric=lambda df: df[metric] - df.baseline_metric,
        metric=metric
    ).reset_index(drop=True)
In [15]:
# this is used for multiple experiments
def grouped_experiment(cluster, query_string, name, diff_hyperparam_fn, diff_clause=None):
    diff_cols= (f"diff_{name}") if diff_clause is None else diff_clause
    if isinstance(diff_cols, list):
        diff_cols = tuple(diff_cols)
    delta_baseline_auc_fn = calculate_delta_baseline(metric="auc_evaluator__target", diff_clause=diff_cols)
    delta_baseline_brier_fn = calculate_delta_baseline(metric="brier_score_evaluator__target", diff_clause=diff_cols)
    delta_baseline_logloss_fn = calculate_delta_baseline(metric="logloss_evaluator__target", diff_clause=diff_cols)
    grouped_all_dids = (
        cluster.query(query_string)
        .pipe(diff_hyperparam_fn)
        .groupby("did")
    )

    auc_deltas = grouped_all_dids.apply(delta_baseline_auc_fn)
    brier_deltas = grouped_all_dids.apply(delta_baseline_brier_fn)
    logloss_deltas = grouped_all_dids.apply(delta_baseline_logloss_fn)
    deltas_df_list = [auc_deltas, brier_deltas, logloss_deltas]
    print(auc_deltas.shape)
    return deltas_df_list
In [16]:
# used to convert a dictionary with the p-values into a dataframe
def build_statistics_df(statistical_dict):
    overall_statistics_df = pd.DataFrame()
    for treatment, metric_d in statistical_dict.items():
        treatment_df = pd.DataFrame()
        for metric, statistical_results_d in metric_d.items():
            treatment_df = pd.concat(
                (
                    treatment_df,
                    pd.DataFrame(statistical_results_d, index=[0]).assign(
                        metric=metric[: metric.find("_evaluator__target")]
                    ),
                ),
                ignore_index=True,
            )
        overall_statistics_df = pd.concat(
            (overall_statistics_df, treatment_df.assign(treatment=treatment)), ignore_index=True
        )
    return overall_statistics_df
In [17]:
# used to color red the pvalues < 0.05
def color_negative_red(val):
    color = 'red' if (not isinstance(val, str) and val < 0.05) else 'black'
    return 'color: %s' % color

Analysis of variance - Kruskal-Wallis

Before checking the impact of the hyperparameters directly, we'll first test of there's a statistically significant difference in them.

Basically the steps done to check and analyze the data are:

  • fit a single-factor model on the data, considering the treatment as the hyperparameter being changed. The value is the delta_metric
  • Plot the residuals of the fitted model in a probability plot (Q-Q plot)
  • Check for homoscedasticity using Levene's test

    "If your data are heteroscedastic, Kruskal–Wallis is no better than one-way anova, and may be worse" - http://www.biostathandbook.com/kruskalwallis.html

  • Test for normality of the residuals using Shapiro-Wilk test
  • Perform an ANOVA test. This result is only useful if we guarantee that the ANOVA assumptions HOLD
  • Perform a non-parametric Kruskal-Wallis test. Since I saw that almost none of the residuals are normality distributed (I don't have enough evidence to say they are), this is the main test to check difference in the median of the rankings of delta_metrics.
  • If the kruskal-wallis returned a significant p-value, I proceed to run a Conover pairwise posthoc test for the rank significance between the groups themselves. This procedure here is to check for significant disparities between hyperparameter values
In [18]:
def fit_single_factor_model(df, treatment, value_col="value"):
    """ Fit a single-factor model where:
            y_ij = mu + t_i + eps_ij
        The estimates done are:
            mu_pred = mean(Y)
            t_i_pred = mean(Y_i) - mean(Y), for i = 1, 2, ...n
        Intuitively, the overall mean is estimated by the grand average of the observations and
        that any treatment effect is just the difference between the treamtment average and the grand average.
    """
    # treatment must be the the index of the dataframe
    mu_pred = df[value_col].values.mean()
    output_cols = ["treatment_mean", "mu_pred", "tau_i_pred"]
    return (
        df.assign(
            treatment_mean=lambda df: df.groupby(treatment).transform(np.mean)[value_col],
            tau_i_pred=lambda df: df.treatment_mean - mu_pred,
            mu_pred=mu_pred,
        )
        .groupby(treatment)[output_cols]
        .agg("first")
    )
In [19]:
import scipy.stats as stats
import pingouin as pg


def plot_residuals(df, sfm_params, treatment, value_col="value", normalize=True, scipy_plot=False):
    residuals_df = df.merge(right=sfm_params, on=treatment).assign(
        y_treatment_pred=lambda df: df.mu_pred + df.tau_i_pred, residual=lambda df: df[value_col] - df.y_treatment_pred
    )
    if residuals_df.residual.sum() == 0.0:
        print("------ Treatment has no residuals -  Cannot apply Single-factor model -------")
        return residuals_df
    residuals_vector = residuals_df.residual.values    
    if normalize:
        res = (residuals_vector - np.mean(residuals_vector)) / np.std(residuals_vector)
    else:
        res = residuals_vector
    ax = new_axis()
    if scipy_plot:
        stats.probplot(res, plot=ax)
    else:
        pg.qqplot(res, dist="norm", ax=ax)
    metric = df.metric.unique()[0]
    if SAVE_PLOTS_GLOBAL:
        plt.savefig(
                "qqplot_" + metric[: metric.find("_")] + "_cluster" + str(curr_cluster) + "_" + treatment + ".png",
                bbox_inches="tight",
                pad_inches=0,
            )
    return residuals_df
In [20]:
def plot_sfm_model(
    sfm_model,
    treatment,
    metric,
    residuals_width=3,
    num_points_to_highlight=None,
    highlight_size=100,
    lower_is_better=True,
    **kwargs,
):

    fig, ax = plt.subplots(figsize=(20, 10))
    curr_df = sfm_model.reset_index().sort_values(treatment)
    neg_df = curr_df.query("treatment_mean-mu_pred <= 0")
    pos_df = curr_df.query("treatment_mean-mu_pred > 0")
    # the num of points is equivalent of taking the mean of points in each category and getting 25% of the mean number
    num_points_to_highlight = (
        int((pos_df.shape[0] + neg_df.shape[0]) * 0.12) if num_points_to_highlight is None else num_points_to_highlight
    )
    num_points_to_highlight = 5 if num_points_to_highlight <= 0 else num_points_to_highlight
    c1, c2, c3, c4, c5 = sns.color_palette("deep")[:5]
    # categorical data needs to be treated differently
    if pd.api.types.is_categorical_dtype(curr_df[treatment]):
        ax.plot(
            list(curr_df[treatment].values),
            list(curr_df["mu_pred"].values),
            color=c1,
            linewidth=3,
            label="overall mean",
        )
    else:
        ax.plot(curr_df[treatment], curr_df["mu_pred"], color=c1, linewidth=3, label="overall mean")
    ax.bar(
        pos_df[treatment],
        height=pos_df["treatment_mean"] - pos_df["mu_pred"].iloc[0],
        bottom=pos_df["mu_pred"].iloc[0],
        width=residuals_width,
        color=c2,
        edgecolor=None,
        linewidth=0.1,
    )
    ax.bar(
        neg_df[treatment],
        height=neg_df["tau_i_pred"].abs(),
        bottom=neg_df["treatment_mean"],
        width=residuals_width,
        color=c3,
        edgecolor=None,
        linewidth=0.1,
    )
    top_treatment_means = pos_df.sort_values(by="treatment_mean", ascending=False).iloc[:num_points_to_highlight]
    worst_treatment_means = neg_df.sort_values(by="treatment_mean").iloc[:num_points_to_highlight]
    if not top_treatment_means.empty:
        ax.scatter(
            top_treatment_means[treatment],
            top_treatment_means["treatment_mean"],
            s=highlight_size,
            c=np.array(c4)[:, None].T if lower_is_better else np.array(c5)[:, None].T,
            zorder=120,
            label="worst treatments" if lower_is_better else "best treatments",
        )
    if not worst_treatment_means.empty:
        ax.scatter(
            worst_treatment_means[treatment],
            worst_treatment_means["treatment_mean"],
            s=highlight_size,
            c=np.array(c5)[:, None].T if lower_is_better else np.array(c4)[:, None].T,
            zorder=120,
            label="best treatments" if lower_is_better else "worst treatments",
        )
    ax.legend()
    ax.set_xlabel(treatment), ax.set_ylabel("delta_metric"), ax.set_title(f"Single-factor model - {metric}", size=18)
    if kwargs:
        fig.canvas.draw()  # draw the figure before updating the xlabels
        update_pointplot_xlabels(**kwargs)
        if "plot_original" in kwargs:
            ax.set_xticks(curr_df[treatment].values)
            ax.set_xticklabels(curr_df[treatment].values)
    if SAVE_PLOTS_GLOBAL:
        plt.savefig(
                "sfm_" + metric[: metric.find("_")] + "_cluster" + str(curr_cluster) + "_" + treatment + ".png",
                bbox_inches="tight",
                pad_inches=0,
            ) 
    return dict(metric=metric, sfm_df=curr_df)
In [21]:
from IPython.display import display
import scikit_posthocs as sp  # perform posthoc tests


def perform_individual_sfm_analysis(
    df_list,
    treatment,
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=True,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    run_sfm_plot=True,
    sfm_plot_args=None,
    statistical_test_dict=None
):
    def index_dataframe(df):
        return (
            df.query(test_pred)
            .reset_index(drop=True)
            .assign(
                delta_metric=lambda df: df.delta_metric.clip(
                    lower=np.percentile(df.delta_metric, 5), upper=np.percentile(df.delta_metric, 95)
                )
            )
        )

    df_list = [index_dataframe(df) for df in df_list]
    sfm_plot_args = {} if sfm_plot_args is None else sfm_plot_args

    print(
        "Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution\n"
    )
    print(
        "Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal"
    )
    sp_heatmap_args = {
        "linewidths": 0.25,
        "linecolor": "0.5",
        "clip_on": False,
        "square": True,
        "cbar_ax_bbox": [0.80, 0.35, 0.04, 0.3],
    }
    sfm_dict = {}

    def loop_over_experiments(function):
        return_results = []
        for df in df_list:
            res = function(df)
            if res is not None:
                return_results.append(res)
        return return_results

    # define analysis to be performed for each dataset
    def residuals(df):
        print(f"Residuals probplot: {df.metric.unique()[0]}")
        res = plot_residuals(
            df=df,
            sfm_params=fit_single_factor_model(df, treatment=treatment, value_col="delta_metric"),
            treatment=treatment,
            value_col="delta_metric",
        )
        plt.show()
        residuals_shapiro = res.residual.values
        if residuals_shapiro.shape[0] > 5000:
            residuals_shapiro = res.residual.sample(n=5000, random_state=42)
        p_shapiro = stats.shapiro(residuals_shapiro)[1]
        if statistical_test_dict is not None:
            statistical_test_dict[treatment][df.metric.unique()[0]] = {"shapiro": p_shapiro}
        print(f"\nShapiro-Wilk p-value = {p_shapiro}. {'REJECT NORMALITY' if p_shapiro <= 0.05 else 'CANNOT REJECT'}")

    def homoscedasticity(df):
        print(f"\nHomoscedasticity - Metric: {df.metric.unique()[0]}")
        hmscd = pg.homoscedasticity(data=df, dv="delta_metric", group=treatment)
        if statistical_test_dict is not None:
            statistical_test_dict[treatment][df.metric.unique()[0]] = fp.merge(
                statistical_test_dict[treatment][df.metric.unique()[0]],
                {"levene": hmscd.pval.unique()[0]}
            )
        display(hmscd)

    def anova(df):
        print(f"\nANOVA test - Metric: {df.metric.unique()[0]}")
        display(df.anova(dv="delta_metric", between=treatment, detailed=True))

    def kruskal(df):
        print(f"\nKruskal-wallis test - Metric: {df.metric.unique()[0]}")
        sample = []
        for d_group_idx in df.groupby(treatment).groups.values():
            if len(df.loc[d_group_idx].delta_metric.values) < 2:
                continue
            sample.append(df.loc[d_group_idx].delta_metric.values)
        if len(sample) < 2 or any((len(treatment_group) < 2 for treatment_group in sample)):
              print("--- Not enough Samples in treatment group ---")
              return {}
        print(stats.kruskal(*sample))
        _, kruskal_p = stats.kruskal(*sample)

        if statistical_test_dict is not None:
            statistical_test_dict[treatment][df.metric.unique()[0]] = fp.merge(
                statistical_test_dict[treatment][df.metric.unique()[0]],
                {"kruskal": kruskal_p}
            )
        # specific threshold when the group is statistically significant at 95%
        if kruskal_p < 0.05:
            if run_kruskal_posthoc:
                ph_conover = sp.posthoc_conover(df, val_col="delta_metric", group_col=treatment, p_adjust="holm")
                sp.sign_plot(
                    ph_conover,
                    ax=new_axis(figsize=(10, 8), title=f"Conover p-values for {df.metric.unique()[0]}"),
                    **sp_heatmap_args,
                )
            if run_sfm_plot:
                delta_lower_is_better = dict(
                    brier_score_evaluator__target=True, logloss_evaluator__target=True, auc_evaluator__target=False
                )
                updated_sfm_plot_args = fp.merge(
                    sfm_plot_args, dict(lower_is_better=delta_lower_is_better[df.metric.unique()[0]])
                )
                curr_sfm_dict = plot_sfm_model(
                    sfm_model=fit_single_factor_model(df, treatment=treatment, value_col="delta_metric").reset_index(),
                    treatment=treatment,
                    metric=df.metric.unique()[0],
                    **updated_sfm_plot_args,
                )
                return curr_sfm_dict
        return {}

    # apply each analysis to every dataset
    if run_residuals:
        loop_over_experiments(residuals)
    if run_homoscedasticity:
        loop_over_experiments(homoscedasticity)
    if run_anova:
        loop_over_experiments(anova)
    if run_kruskal:
        all_sfm_models = loop_over_experiments(kruskal)
        return all_sfm_models
    return None
In [22]:
from collections import defaultdict
all_cluster_results = defaultdict(dict)

Cluster 1

In [31]:
cluster_1 = pickle.load(open("cluster_1.pkl", "rb"))
cluster_1.shape
print("Number of datasets:", len(cluster_1.did.unique()))
curr_cluster = 1
Out[31]:
(121670, 13)
Number of datasets: 23
In [32]:
to_remove_1 = [1597, 4154, 40922]
cluster_1 = cluster_1.query("did not in @to_remove_1")
In [33]:
st_tests_1 = defaultdict(dict)

Individual Treatments

Num estimators

In [34]:
individual_num_est_pred = (
    "(num_estimators == num_estimators) & (max_depth != max_depth) & (learning_rate != learning_rate)"
)


def individual_num_est_diff(df):
    return df.assign(
        # default is 100
        num_estimators=lambda df: df.num_estimators.astype(int),
        diff_num_est=lambda df: abs(df.num_estimators - 100),
    )


num_est_df_list = grouped_experiment(
    cluster=cluster_1, query_string=individual_num_est_pred, name="num_est", diff_hyperparam_fn=individual_num_est_diff
)
(800, 17)
In [35]:
plot_deltas(num_est_df_list, treatment="num_estimators")

When changing num_estimators individually, Kruskal-Willis doesn't provide enough evidence to reject null hypothesis

In [36]:
all_cluster_results[curr_cluster]["num_estimators"] = perform_individual_sfm_analysis(
    num_est_df_list,
    treatment="num_estimators",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    statistical_test_dict=st_tests_1
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 4.965993800110704e-19. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 3.7109030667859604e-24. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 6.203025343564139e-23. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.744 0.963061 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.474 0.999994 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 0.704 0.982854 True
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=8.597988809017151, pvalue=0.9951831715199703)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=10.657224403362845, pvalue=0.9793032581227489)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=51.4394967779547, pvalue=0.0003733918867836595)

Max depth

In [37]:
individual_max_depth_pred = (
    "(num_estimators != num_estimators) & (max_depth == max_depth) & (learning_rate != learning_rate)"
)


def individual_max_depth_diff(df):
    return df.assign(
        max_depth=lambda df: df.max_depth.astype(int),
        diff_max_depth=lambda df: abs(df.max_depth - 5),  # default is max_depth = 5 because n_leaves = 31
    )


max_depth_df_list = grouped_experiment(
    cluster=cluster_1,
    query_string=individual_max_depth_pred,
    name="max_depth",
    diff_hyperparam_fn=individual_max_depth_diff,
)
(800, 17)
In [38]:
plot_deltas(max_depth_df_list, treatment="max_depth", multiple_of=1)
In [39]:
all_cluster_results[curr_cluster]["max_depth"] = perform_individual_sfm_analysis(
    max_depth_df_list,
    treatment="max_depth",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=0.3, highlight_size=400),
    statistical_test_dict=st_tests_1
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 3.388218626825648e-21. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 8.838670219986008e-15. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 9.113384688824028e-14. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.659 0.859025 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 1.155 0.294177 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 1.399 0.123112 True
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=21.882460441553246, pvalue=0.290145041334565)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=46.52563871369858, pvalue=0.00041740480967144655)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=35.40856429204412, pvalue=0.012465099807488124)

Learning Rate

In [40]:
individual_lr_pred = "(num_estimators != num_estimators) & (max_depth != max_depth) & (learning_rate == learning_rate)"


def individual_lr_diff(df):
    return df.assign(
        learning_rate=lambda df: df.learning_rate.astype(float),
        diff_lr=lambda df: abs(df.learning_rate - 0.1),  # default lr is 0.1
    )


lr_df_list = grouped_experiment(
    cluster=cluster_1, query_string=individual_lr_pred, name="lr", diff_hyperparam_fn=individual_lr_diff
)
(200, 17)
In [41]:
plot_deltas(lr_df_list, treatment="learning_rate", multiple_of=3, rotation=45)
In [42]:
all_cluster_results[curr_cluster]["learning_rate"] = perform_individual_sfm_analysis(
    lr_df_list,
    treatment="learning_rate",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.002, highlight_size=70, multiple_of=1),
    statistical_test_dict=st_tests_1
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
------ Treatment has no residuals -  Cannot apply Single-factor model -------

Shapiro-Wilk p-value = 1.0133217642760428e-07. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 5.6172714456249537e-14. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 7.958838801761306e-14. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.552 0.981121 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 2.399 0.001409 False
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 2.465 0.00105 False
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=22.877959309494454, pvalue=0.0287800782055724)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=54.38738027864609, pvalue=2.3299816669512344e-07)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=53.11633590474057, pvalue=3.9276239350538573e-07)

Multiple Treatments

When analyzing multiple parameters being run together, the baseline is determined to be the hyperparameter combination with the lowest delta of both of them. Basically, get all the experiments where the diff between hyperparameter a is the minimum, and from those, get the experiment where the hyperparameter b is the minimum, and so on.

Multiple treatments - Utility function

In [43]:
from functools import reduce

def create_multiple_treatment_column(df, columns, treatment_name_output, order_strategy="rank"):
    """When dealing with multiple treatments, this function will create an ordered categorical column,
       which represents a given experiment combination o hyperparameters.
       The ordering can be done by either passing:
           - order_strategy = "rank",
               which means that the ordering is based on the mean of the individual ranks
               of each hyperparameter
           - order_strategy = "column"
               which means that the ordering is based on the sort_values result of the
               'columns' argument.            
       """
    def create_categorical_col(df):
        all_str_cols = [df[col].map(str) for col in columns]
        return df.assign(
            **{
                f"{treatment_name_output}": pd.Series(
                    reduce(lambda x, y: x + " - " + y, all_str_cols), dtype="category"
                )
            }
        )

    def calc_ranked_mean(df):
        # calculate the mean of the ranks of all treatment columns
        all_ranks = [df[col].rank() for col in columns]
        return df.assign(ranked_mean=reduce(lambda x, y: x + y, all_ranks) / len(all_ranks))

    def set_cat_orders(df, ordering_from, categorical_col):
        # set the categories order sorting the dataframe by a given 'ordering_from' column
        ordering_from = list(ordering_from) if isinstance(ordering_from, tuple) else ordering_from
        ord_categories = df.sort_values(by=ordering_from)[categorical_col].unique().astype(str)

        return df.assign(
            **{
                f"{categorical_col}": lambda df: df[categorical_col].cat.reorder_categories(
                    ord_categories, ordered=True
                )
            }
        )

    # the categorical treatment is just a concatenation of all the treatments applied

    if order_strategy is "rank":
        orders_col = "ranked_mean"
        return (
            df.pipe(create_categorical_col)
            .pipe(calc_ranked_mean)
            .pipe(set_cat_orders, ordering_from=orders_col, categorical_col=treatment_name_output)
        ).drop(columns=orders_col)
    elif order_strategy is "column":
        orders_col = tuple(columns)
        return df.pipe(create_categorical_col).pipe(
            set_cat_orders, ordering_from=orders_col, categorical_col=treatment_name_output
        )
    else:
        raise ValueError("Invalid ordering strategy! Needs to be 'rank' or 'column'")

Max Depth and Learning Rate

In [44]:
max_depth_lr_pred = "(num_estimators != num_estimators) & (max_depth == max_depth) & (learning_rate == learning_rate)"
# minimize the difference of max_depth then the learning_rate
diff_clause = ("diff_max_depth", "diff_lr")


def max_depth_lr_diff(df):
    return df.assign(
        learning_rate=lambda df: df.learning_rate.astype(float),
        max_depth=lambda df: df.max_depth.astype(int),
        diff_lr=lambda df: abs(df.learning_rate - 0.1),  # default lr is 0.1
        diff_max_depth=lambda df: abs(df.max_depth - 5),  # default is max_depth = 5 because n_leaves = 31
    )


max_depth_lr_list = grouped_experiment(
    cluster=cluster_1,
    query_string=max_depth_lr_pred,
    name="max_depth_lr",
    diff_hyperparam_fn=max_depth_lr_diff,
    diff_clause=diff_clause,
)

# create new treatment for all results
max_depth_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["max_depth", "learning_rate"],
            treatment_name_output="max_depth_lr",
            order_strategy="rank",
        ),
        max_depth_lr_list,
    )
)
(4000, 18)
In [45]:
plot_deltas(max_depth_lr_list, treatment="max_depth_lr", multiple_of=20, rotation=90)
In [46]:
all_cluster_results[curr_cluster]["max_depth_lr"] = perform_individual_sfm_analysis(
    max_depth_lr_list,
    treatment="max_depth_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1, highlight_size=30, multiple_of=20, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_1
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 2.4018849621728907e-32. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 1.1770907100328463e-43. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 6.678448351125646e-41. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.741 0.999998 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 1.027 0.336134 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 1.745 9.901324e-19 False
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=470.54805773830856, pvalue=6.661712194766211e-08)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=1127.244254064324, pvalue=1.0673657558692864e-90)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=1127.9791459804142, pvalue=8.193801894234736e-91)

Max Depth and Num Estimators

In [47]:
max_depth_num_est_pred = "(num_estimators == num_estimators) & (max_depth == max_depth) & (learning_rate != learning_rate)"
# minimize the difference of max_depth then the num_estimators
diff_clause = ("diff_max_depth", "diff_num_est")


def max_depth_num_est_diff(df):
    return df.assign(
        max_depth=lambda df: df.max_depth.astype(int),
        num_estimators=lambda df: df.num_estimators.astype(int),
        diff_max_depth=lambda df: abs(df.max_depth - 5),  # default is max_depth = 5 because n_leaves = 31
        diff_num_est=lambda df: abs(df.num_estimators - 100), # default num_estimators is 100
    )


max_depth_num_est_list = grouped_experiment(
    cluster=cluster_1,
    query_string=max_depth_num_est_pred,
    name="max_depth_num_est",
    diff_hyperparam_fn=max_depth_num_est_diff,
    diff_clause=diff_clause,
)

# create new treatment for all results
max_depth_num_est_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df,
            columns=["max_depth", "num_estimators"],
            treatment_name_output="max_depth_num_est",
            order_strategy="rank",
        ),
        max_depth_num_est_list,
    )
)
(16000, 18)
In [48]:
plot_deltas(max_depth_num_est_list, treatment="max_depth_num_est", multiple_of=50, rotation=90, loc=1, scale=.8)
In [49]:
all_cluster_results[curr_cluster]["max_depth_num_est"] = perform_individual_sfm_analysis(
    max_depth_num_est_list,
    treatment="max_depth_num_est",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=True,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1, highlight_size=30, multiple_of=50, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_1
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.367 1.0 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.487 1.0 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 0.588 1.0 True
ANOVA test - Metric: auc_evaluator__target
Source SS DF MS F p-unc np2
0 max_depth_num_est 0.033 2179 0.0 2.232 2.9517e-125 0.455
1 Within 0.039 5820 0.0 - - -
ANOVA test - Metric: brier_score_evaluator__target
Source SS DF MS F p-unc np2
0 max_depth_num_est 0.147 2179 0.0 8.318 0 0.757
1 Within 0.047 5820 0.0 - - -
ANOVA test - Metric: logloss_evaluator__target
Source SS DF MS F p-unc np2
0 max_depth_num_est 11.851 2179 0.005 2.631 1.94873e-183 0.496
1 Within 12.031 5820 0.002 - - -
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=307.1018212402173, pvalue=0.9999999934016638)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=202.3588068667664, pvalue=1.0)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=580.0950520045608, pvalue=0.00010256012318249144)

Num Estimators and Learning Rate

In [50]:
num_est_lr_pred = "(num_estimators == num_estimators) & (max_depth != max_depth) & (learning_rate == learning_rate)"
# minimize the difference of max_depth then the learning_rate
diff_clause = ("diff_num_est", "diff_lr")


def num_est_lr_diff(df):
    return df.assign(
        num_estimators=lambda df: df.num_estimators.astype(int),
        learning_rate=lambda df: df.learning_rate.astype(float),
        diff_num_est=lambda df: abs(df.num_estimators - 100), # default num_estimators is 100
        diff_lr=lambda df: abs(df.learning_rate - 0.1),  # default lr is 0.1
    )


num_est_lr_list = grouped_experiment(
    cluster=cluster_1,
    query_string=num_est_lr_pred,
    name="num_est_lr",
    diff_hyperparam_fn=num_est_lr_diff,
    diff_clause=diff_clause,
)

# create new treatment for all results
num_est_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["num_estimators", "learning_rate"],
            treatment_name_output="num_est_lr",
            order_strategy="rank",
        ),
        num_est_lr_list,
    )
)
(4000, 18)
In [51]:
plot_deltas(num_est_lr_list, treatment="num_est_lr", multiple_of=50, rotation=90)
In [52]:
all_cluster_results[curr_cluster]["num_est_lr"] = perform_individual_sfm_analysis(
    num_est_lr_list,
    treatment="num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=2.5, highlight_size=30, multiple_of=50, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_1
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.98 0.624096 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.717 1.0 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 0.471 1.0 True
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=309.33390710727934, pvalue=0.2852538604668392)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=545.8637960194865, pvalue=4.590394996634476e-17)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=691.5428909449221, pvalue=1.0885943750452042e-33)

Max Depth, Num Estimators and Learning Rate

In [58]:
max_depth_num_est_lr_pred = "(num_estimators == num_estimators) & (max_depth == max_depth) & (learning_rate == learning_rate)"

diff_clause = ("diff_max_depth", "diff_num_est", "diff_lr")


def max_depth_num_est_lr_diff(df):
    return df.assign(
        max_depth=lambda df: df.max_depth.astype(int),
        num_estimators=lambda df: df.num_estimators.astype(int),
        learning_rate=lambda df: df.learning_rate.astype(float),
        diff_max_depth=lambda df: abs(df.max_depth - 5),  # default is max_depth = 5 because n_leaves = 31
        diff_num_est=lambda df: abs(df.num_estimators - 100), # default num_estimators is 100
        diff_lr=lambda df: abs(df.learning_rate - 0.1),  # default lr is 0.1
    )


max_depth_num_est_lr_list = grouped_experiment(
    cluster=cluster_1,
    query_string=max_depth_num_est_lr_pred,
    name="max_depth_num_est_lr",
    diff_hyperparam_fn=max_depth_num_est_lr_diff,
    diff_clause=diff_clause,
)

# create new treatment for all results
max_depth_num_est_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["max_depth", "num_estimators", "learning_rate"],
            treatment_name_output="max_depth_num_est_lr",
            order_strategy="rank",
        ),
        max_depth_num_est_lr_list,
    )
)
(80000, 19)
In [59]:
plot_deltas(max_depth_num_est_lr_list, treatment="max_depth_num_est_lr", scale=.7, multiple_of=300, rotation=90, fontsize=15)
In [60]:
all_cluster_results[curr_cluster]["max_depth_num_est_lr"] = perform_individual_sfm_analysis(
    max_depth_num_est_lr_list,
    treatment="max_depth_num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=True,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1.0, highlight_size=30, multiple_of=400, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_1
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.906 1.0 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.634 1.0 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 0.498 1.0 True
ANOVA test - Metric: auc_evaluator__target
Source SS DF MS F p-unc np2
0 max_depth_num_est_lr 0.709 25059 0.0 1.847 0 0.756
1 Within 0.229 14940 0.0 - - -
ANOVA test - Metric: brier_score_evaluator__target
Source SS DF MS F p-unc np2
0 max_depth_num_est_lr 3.111 25059 0.0 5.444 0 0.901
1 Within 0.341 14940 0.0 - - -
ANOVA test - Metric: logloss_evaluator__target
Source SS DF MS F p-unc np2
0 max_depth_num_est_lr 130.746 25059 0.005 6.081 0 0.911
1 Within 12.818 14940 0.001 - - -
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=6657.281135550946, pvalue=1.0890241542298173e-10)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=10858.797788216523, pvalue=6.880061494337646e-293)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=13186.28648910838, pvalue=0.0)
In [61]:
del cluster_1
gc.collect()
Out[61]:
8536061

Cluster 2

In [62]:
st_tests_2 = defaultdict(dict)
In [63]:
cluster_2 = pickle.load(open("cluster_2.pkl", "rb"))
cluster_2.shape
print("Number of datasets:", len(cluster_2.did.unique()))
curr_cluster = 2
Out[63]:
(31740, 13)
Number of datasets: 6
In [64]:
to_remove_2 = [40517]
cluster_2 = cluster_2.query("did not in @to_remove_2")

Individual treatments

Num estimators

In [65]:
num_est_df_list = grouped_experiment(
    cluster=cluster_2, query_string=individual_num_est_pred, name="num_est", diff_hyperparam_fn=individual_num_est_diff
)
(200, 17)
In [66]:
plot_deltas(num_est_df_list, treatment="num_estimators", multiple_of=3, save_plot=True)
In [67]:
all_cluster_results[curr_cluster]["num_estimators"] = perform_individual_sfm_analysis(
    num_est_df_list,
    treatment="num_estimators",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    statistical_test_dict=st_tests_2
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 4.0599200445967654e-08. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 5.942564257566119e-06. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 2.502680399629753e-05. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.423 0.996812 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 1.257 0.210545 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 1.959 0.009636 False
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=4.679600286941011, pvalue=0.9996505615150002)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=6.4674407373969265, pvalue=0.9965571834951797)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=40.48131129638574, pvalue=0.002827827988640876)

Max depth

In [68]:
max_depth_df_list = grouped_experiment(
    cluster=cluster_2,
    query_string=individual_max_depth_pred,
    name="max_depth",
    diff_hyperparam_fn=individual_max_depth_diff,
)
(200, 17)
In [69]:
plot_deltas(max_depth_df_list, treatment="max_depth", multiple_of=1)
In [70]:
all_cluster_results[curr_cluster]["max_depth"] = perform_individual_sfm_analysis(
    max_depth_df_list,
    treatment="max_depth",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=0.3, highlight_size=400),
    statistical_test_dict=st_tests_2
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 2.536182097667705e-10. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 3.926060799130937e-06. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 2.5221535906894132e-05. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.563 0.921002 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 1.202 0.277463 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 1.449 0.128518 True
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=33.954404013077685, pvalue=0.01860650187092958)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=33.19283069085734, pvalue=0.02283746743609913)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=34.1495018211372, pvalue=0.0176453054611716)

Learning Rate

In [71]:
lr_df_list = grouped_experiment(
    cluster=cluster_2, query_string=individual_lr_pred, name="lr", diff_hyperparam_fn=individual_lr_diff
)
(50, 17)
In [72]:
plot_deltas(lr_df_list, treatment="learning_rate", multiple_of=1, rotation=45)
In [73]:
all_cluster_results[curr_cluster]["learning_rate"] = perform_individual_sfm_analysis(
    lr_df_list,
    treatment="learning_rate",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=True,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.005, highlight_size=200, multiple_of=1),
    statistical_test_dict=st_tests_2
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 2.714980462670269e-09. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 9.808684531265044e-09. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 1.2135402405188955e-10. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
/Users/juliano.garcia/miniconda3/envs/boosting/lib/python3.6/site-packages/scipy/stats/morestats.py:2352: RuntimeWarning: invalid value encountered in double_scalars
  W = numer / denom
W pval equal_var
levene NaN NaN False
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene NaN NaN False
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene NaN NaN False
ANOVA test - Metric: auc_evaluator__target
/Users/juliano.garcia/miniconda3/envs/boosting/lib/python3.6/site-packages/pingouin/parametric.py:993: RuntimeWarning: invalid value encountered in double_scalars
  mserror = sserror / ddof2
Source SS DF MS F p-unc np2
0 learning_rate 0.001 24 0 - - 1
1 Within 0.000 0 - - - -
ANOVA test - Metric: brier_score_evaluator__target
Source SS DF MS F p-unc np2
0 learning_rate 0.033 24 0.001 - - 1
1 Within 0.000 0 - - - -
ANOVA test - Metric: logloss_evaluator__target
Source SS DF MS F p-unc np2
0 learning_rate 0.338 24 0.014 - - 1
1 Within 0.000 0 - - - -
Kruskal-wallis test - Metric: auc_evaluator__target
--- Not enough Samples in treatment group ---

Kruskal-wallis test - Metric: brier_score_evaluator__target
--- Not enough Samples in treatment group ---

Kruskal-wallis test - Metric: logloss_evaluator__target
--- Not enough Samples in treatment group ---

Multiple Treatments

Max Depth and Learning Rate

In [74]:
diff_clause = ("diff_max_depth", "diff_lr")
max_depth_lr_list = grouped_experiment(
    cluster=cluster_2,
    query_string=max_depth_lr_pred,
    name="max_depth_lr",
    diff_hyperparam_fn=max_depth_lr_diff,
    diff_clause=diff_clause,
)

# create new treatment for all results
max_depth_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["max_depth", "learning_rate"],
            treatment_name_output="max_depth_lr",
            order_strategy="rank",
        ),
        max_depth_lr_list,
    )
)
(1000, 18)
In [75]:
plot_deltas(max_depth_lr_list, treatment="max_depth_lr", multiple_of=10, rotation=90)
In [76]:
all_cluster_results[curr_cluster]["max_depth_lr"] = perform_individual_sfm_analysis(
    max_depth_lr_list,
    treatment="max_depth_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.5, highlight_size=50, multiple_of=10, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_2
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 1.406702218796685e-20. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 1.3399940012604214e-23. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 1.334959806507663e-23. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.563 0.999994 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.202 1.0 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 0.183 1.0 True
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=155.85228427854736, pvalue=0.013169797926850337)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=255.09139169126348, pvalue=6.235464053873864e-12)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=226.75563438842394, pvalue=1.005024961982302e-08)

Max Depth and Num Estimators

In [77]:
diff_clause = ("diff_max_depth", "diff_num_est")
max_depth_num_est_list = grouped_experiment(
    cluster=cluster_2,
    query_string=max_depth_num_est_pred,
    name="max_depth_num_est",
    diff_hyperparam_fn=max_depth_num_est_diff,
    diff_clause=diff_clause,
)

# create new treatment for all results
max_depth_num_est_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df,
            columns=["max_depth", "num_estimators"],
            treatment_name_output="max_depth_num_est",
            order_strategy="rank",
        ),
        max_depth_num_est_list,
    )
)
(4000, 18)
In [78]:
plot_deltas(max_depth_num_est_list, treatment="max_depth_num_est", multiple_of=25, rotation=90)
In [79]:
all_cluster_results[curr_cluster]["max_depth_num_est"] = perform_individual_sfm_analysis(
    max_depth_num_est_list,
    treatment="max_depth_num_est",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1, highlight_size=30, multiple_of=25, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_2    
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 2.548461728664323e-22. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 6.49431955042634e-28. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 3.9740927216797714e-21. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.474 1.0 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.834 0.996988 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 1.886 2.655356e-23 False
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=240.12035494493966, pvalue=0.9999999999778888)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=358.61014295798583, pvalue=0.9274815591224476)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=731.9774069460274, pvalue=6.191509692035732e-22)

Num Estimators and Learning Rate

In [80]:
diff_clause = ("diff_num_est", "diff_lr")
num_est_lr_list = grouped_experiment(
    cluster=cluster_2,
    query_string=num_est_lr_pred,
    name="num_est_lr",
    diff_hyperparam_fn=num_est_lr_diff,
    diff_clause=diff_clause,
)

# create new treatment for all results
num_est_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["num_estimators", "learning_rate"],
            treatment_name_output="num_est_lr",
            order_strategy="rank",
        ),
        num_est_lr_list,
    )
)
(1000, 18)
In [81]:
plot_deltas(num_est_lr_list, treatment="num_est_lr", multiple_of=15, rotation=90)
In [82]:
all_cluster_results[curr_cluster]["num_est_lr"] = perform_individual_sfm_analysis(
    num_est_lr_list,
    treatment="num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.7, highlight_size=50, multiple_of=15, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_2
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 9.960932013158139e-27. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 2.7090848584434127e-26. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 2.452305074789782e-24. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.25 1.0 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.517 1.0 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 0.499 1.0 True
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=179.62919106428052, pvalue=3.3063320135159447e-06)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=204.76897786258806, pvalue=7.035798060397642e-09)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=172.40807068535076, pvalue=1.6503038404775902e-05)

Max Depth, Num Estimators and Learning Rate

In [83]:
diff_clause = ("diff_max_depth", "diff_num_est", "diff_lr")
max_depth_num_est_lr_list = grouped_experiment(
    cluster=cluster_2,
    query_string=max_depth_num_est_lr_pred,
    name="max_depth_num_est_lr",
    diff_hyperparam_fn=max_depth_num_est_lr_diff,
    diff_clause=diff_clause,
)

# create new treatment for all results
max_depth_num_est_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["max_depth", "num_estimators", "learning_rate"],
            treatment_name_output="max_depth_num_est_lr",
            order_strategy="rank",
        ),
        max_depth_num_est_lr_list,
    )
)
(20000, 19)
In [84]:
plot_deltas(max_depth_num_est_lr_list, treatment="max_depth_num_est_lr", scale=.7, multiple_of=100, cap_outliers=True, rotation=90, fontsize=15)
In [85]:
all_cluster_results[curr_cluster]["max_depth_num_est_lr"] = perform_individual_sfm_analysis(
    max_depth_num_est_lr_list,
    treatment="max_depth_num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1.0, highlight_size=30, multiple_of=100, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_2
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.463 1.0 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.402 1.0 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 0.706 1.0 True
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=3339.498345738567, pvalue=2.9041723270259444e-64)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=3844.687582875137, pvalue=4.013154847570665e-111)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=3405.4581385816687, pvalue=7.298815542361538e-70)
In [86]:
del cluster_2
gc.collect()
Out[86]:
2437679

Cluster 3

In [87]:
cluster_3 = pickle.load(open("cluster_3.pkl", "rb"))
cluster_3.shape
print("Number of datasets:", len(cluster_3.did.unique()))
curr_cluster = 3
Out[87]:
(37122, 13)
Number of datasets: 9
In [88]:
st_tests_3 = defaultdict(dict)

Individual treatments

Num estimators

In [89]:
num_est_df_list = grouped_experiment(
    cluster=cluster_3, query_string=individual_num_est_pred, name="num_est", diff_hyperparam_fn=individual_num_est_diff
)
(288, 17)
In [90]:
plot_deltas(num_est_df_list, treatment="num_estimators", multiple_of=1)
In [91]:
all_cluster_results[curr_cluster]["num_estimators"] = perform_individual_sfm_analysis(
    num_est_df_list,
    treatment="num_estimators",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    statistical_test_dict=st_tests_3
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 5.611897678025159e-16. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 2.0502217978890513e-10. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 8.273422225091309e-11. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.278 0.998826 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.701 0.812424 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 0.861 0.630281 True
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=26.583542474712893, pvalue=0.11473383075040258)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=27.469630833080444, pvalue=0.09418066432195568)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=22.36772110694076, pvalue=0.2663393451829624)

Max depth

In [92]:
max_depth_df_list = grouped_experiment(
    cluster=cluster_3,
    query_string=individual_max_depth_pred,
    name="max_depth",
    diff_hyperparam_fn=individual_max_depth_diff,
)
(284, 17)
In [93]:
plot_deltas(max_depth_df_list, treatment="max_depth", multiple_of=1)
In [94]:
all_cluster_results[curr_cluster]["max_depth"] = perform_individual_sfm_analysis(
    max_depth_df_list,
    treatment="max_depth",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=0.3, highlight_size=400),
    statistical_test_dict=st_tests_3
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 3.1393963695336424e-07. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 3.959431705879979e-05. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 0.0006005177274346352. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 1.829 0.026452 False
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 1.767 0.034193 False
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 1.758 0.035519 False
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=54.72922010202657, pvalue=2.5569975651630184e-05)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=59.7876087606679, pvalue=4.180864080550115e-06)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=60.98327620987113, pvalue=2.7021742961013124e-06)

Learning Rate

In [95]:
lr_df_list = grouped_experiment(
    cluster=cluster_3, query_string=individual_lr_pred, name="lr", diff_hyperparam_fn=individual_lr_diff
)
(82, 17)
In [96]:
plot_deltas(lr_df_list, treatment="learning_rate", multiple_of=1, rotation=45)
In [97]:
all_cluster_results[curr_cluster]["learning_rate"] = perform_individual_sfm_analysis(
    lr_df_list,
    treatment="learning_rate",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=True,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.005, highlight_size=200, multiple_of=1),
    statistical_test_dict=st_tests_3

)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 0.36764296889305115. CANNOT REJECT
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 0.005871765315532684. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 0.019388489425182343. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 1.742 0.162112 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 6.179 0.000683 False
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 10.879 0.000007 False
ANOVA test - Metric: auc_evaluator__target
Source SS DF MS F p-unc np2
0 learning_rate 0.002 4 0.0 9.136 3.35471e-05 0.504
1 Within 0.002 36 0.0 - - -
ANOVA test - Metric: brier_score_evaluator__target
Source SS DF MS F p-unc np2
0 learning_rate 0.004 4 0.001 21.993 2.96896e-09 0.71
1 Within 0.001 36 0.000 - - -
ANOVA test - Metric: logloss_evaluator__target
Source SS DF MS F p-unc np2
0 learning_rate 0.059 4 0.015 16.643 8.28258e-08 0.649
1 Within 0.032 36 0.001 - - -
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=32.16855005115718, pvalue=1.767195777354127e-06)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=32.44497030597138, pvalue=1.5515294888437792e-06)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=32.38209555818483, pvalue=1.598157733407212e-06)

Multiple Treatments

Max Depth and Learning Rate

In [98]:
diff_clause = ("diff_max_depth", "diff_lr")
max_depth_lr_list = grouped_experiment(
    cluster=cluster_3,
    query_string=max_depth_lr_pred,
    name="max_depth_lr",
    diff_hyperparam_fn=max_depth_lr_diff,
    diff_clause=diff_clause,
)

# create new treatment for all results
max_depth_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["max_depth", "learning_rate"],
            treatment_name_output="max_depth_lr",
            order_strategy="rank",
        ),
        max_depth_lr_list,
    )
)
(1412, 18)
In [99]:
plot_deltas(max_depth_lr_list, treatment="max_depth_lr", multiple_of=5, rotation=90)
In [100]:
all_cluster_results[curr_cluster]["max_depth_lr"] = perform_individual_sfm_analysis(
    max_depth_lr_list,
    treatment="max_depth_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.5, highlight_size=50, multiple_of=5, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_3
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 8.839740869631338e-14. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 5.927691972829052e-07. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 3.8707453908770617e-10. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.975 0.549628 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.911 0.713235 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 1.299 0.036468 False
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=287.8379836594564, pvalue=2.5342539943254588e-20)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=459.3187148304025, pvalue=8.655282647472749e-48)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=481.2119106630078, pvalue=1.441530528707626e-51)

Max Depth and Num Estimators

In [101]:
diff_clause = ("diff_max_depth", "diff_num_est")
max_depth_num_est_list = grouped_experiment(
    cluster=cluster_3,
    query_string=max_depth_num_est_pred,
    name="max_depth_num_est",
    diff_hyperparam_fn=max_depth_num_est_diff,
    diff_clause=diff_clause,
)

# create new treatment for all results
max_depth_num_est_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df,
            columns=["max_depth", "num_estimators"],
            treatment_name_output="max_depth_num_est",
            order_strategy="rank",
        ),
        max_depth_num_est_list,
    )
)
(5608, 18)
In [102]:
plot_deltas(max_depth_num_est_list, treatment="max_depth_num_est", multiple_of=10, rotation=90)
In [103]:
all_cluster_results[curr_cluster]["max_depth_num_est"] = perform_individual_sfm_analysis(
    max_depth_num_est_list,
    treatment="max_depth_num_est",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1, highlight_size=30, multiple_of=10, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_3
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 3.996993674716892e-40. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 2.490148176916481e-35. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 2.679124678037066e-35. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 1.221 0.003525 False
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 1.151 0.029585 False
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 5.865 1.031149e-169 False
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=761.3073330765131, pvalue=6.177611536713735e-25)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=731.4999998339297, pvalue=6.91110841192281e-22)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=1328.3887078634368, pvalue=3.001167872734788e-100)

Num Estimators and Learning Rate

In [104]:
diff_clause = ("diff_num_est", "diff_lr")
num_est_lr_list = grouped_experiment(
    cluster=cluster_3,
    query_string=num_est_lr_pred,
    name="num_est_lr",
    diff_hyperparam_fn=num_est_lr_diff,
    diff_clause=diff_clause,
)

# create new treatment for all results
num_est_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["num_estimators", "learning_rate"],
            treatment_name_output="num_est_lr",
            order_strategy="rank",
        ),
        num_est_lr_list,
    )
)
(1424, 18)
In [105]:
plot_deltas(num_est_lr_list, treatment="num_est_lr", multiple_of=5, rotation=90)
In [106]:
all_cluster_results[curr_cluster]["num_est_lr"] = perform_individual_sfm_analysis(
    num_est_lr_list,
    treatment="num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.7, highlight_size=50, multiple_of=2, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_3
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 8.22391056663302e-28. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 4.965566242157312e-27. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 1.4469581259610162e-34. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 2.868 3.639094e-15 False
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 2.166 1.425324e-08 False
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 2.432 5.364655e-11 False
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=279.4335305709853, pvalue=4.0847323931351827e-19)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=305.4636680654049, pvalue=6.548453243989413e-23)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=317.38372404030315, pvalue=1.0634544914481051e-24)

Max Depth, Num Estimators and Learning Rate

In [107]:
diff_clause = ("diff_max_depth", "diff_num_est", "diff_lr")
max_depth_num_est_lr_list = grouped_experiment(
    cluster=cluster_3,
    query_string=max_depth_num_est_lr_pred,
    name="max_depth_num_est_lr",
    diff_hyperparam_fn=max_depth_num_est_lr_diff,
    diff_clause=diff_clause,
)

# create new treatment for all results
max_depth_num_est_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["max_depth", "num_estimators", "learning_rate"],
            treatment_name_output="max_depth_num_est_lr",
            order_strategy="rank",
        ),
        max_depth_num_est_lr_list,
    )
)
(28024, 19)
In [108]:
plot_deltas(max_depth_num_est_lr_list, treatment="max_depth_num_est_lr", scale=.7, multiple_of=100, cap_outliers=True, rotation=90, fontsize=15)
In [109]:
all_cluster_results[curr_cluster]["max_depth_num_est_lr"] = perform_individual_sfm_analysis(
    max_depth_num_est_lr_list,
    treatment="max_depth_num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1.0, highlight_size=30, multiple_of=100, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_3
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 1.401298464324817e-45. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 1.401298464324817e-45. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 1.384 2.067193e-23 False
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 1.218 1.522520e-09 False
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 6.582 0.0 False
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=3880.881000478823, pvalue=2.8474863833491625e-123)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=3903.9984289943445, pvalue=1.0169721378366844e-125)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=6768.312744198603, pvalue=0.0)
In [110]:
del cluster_3
gc.collect()
Out[110]:
688354

Cluster 4

In [111]:
cluster_4 = pickle.load(open("cluster_4.pkl", "rb"))
cluster_4.shape
print("Number of datasets:", len(cluster_4.did.unique()))
curr_cluster = 4
Out[111]:
(26450, 13)
Number of datasets: 5
In [112]:
st_tests_4 = defaultdict(dict)

Individual treatments

Num estimators

In [113]:
num_est_df_list = grouped_experiment(
    cluster=cluster_4, query_string=individual_num_est_pred, name="num_est", diff_hyperparam_fn=individual_num_est_diff
)
(200, 17)
In [114]:
plot_deltas(num_est_df_list, treatment="num_estimators", multiple_of=1)
In [115]:
all_cluster_results[curr_cluster]["num_estimators"] = perform_individual_sfm_analysis(
    num_est_df_list,
    treatment="num_estimators",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    statistical_test_dict=st_tests_4
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 3.616501087315427e-13. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 2.482371946888051e-13. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 1.3367034909506081e-11. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.396 0.987383 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.293 0.998055 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 0.55 0.929287 True
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=23.620332212345332, pvalue=0.2111212183222945)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=6.3121786636248896, pvalue=0.9970747844229544)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=8.771339376838563, pvalue=0.9770873121481394)

Max depth

In [116]:
max_depth_df_list = grouped_experiment(
    cluster=cluster_4,
    query_string=individual_max_depth_pred,
    name="max_depth",
    diff_hyperparam_fn=individual_max_depth_diff,
)
(200, 17)
In [117]:
plot_deltas(max_depth_df_list, treatment="max_depth", multiple_of=1)
In [118]:
all_cluster_results[curr_cluster]["max_depth"] = perform_individual_sfm_analysis(
    max_depth_df_list,
    treatment="max_depth",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=0.3, highlight_size=400),
    statistical_test_dict=st_tests_4
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 2.5372391687650264e-11. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 1.0192414947596262e-06. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 6.850467970664909e-10. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.288 0.998278 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.321 0.996463 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 0.201 0.999868 True
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=8.58273429135731, pvalue=0.9797862535391266)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=6.669608838751563, pvalue=0.9957777576843745)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=3.894313374853578, pvalue=0.9999135832022281)

Learning Rate

In [119]:
lr_df_list = grouped_experiment(
    cluster=cluster_4, query_string=individual_lr_pred, name="lr", diff_hyperparam_fn=individual_lr_diff
)
(50, 17)
In [120]:
plot_deltas(lr_df_list, treatment="learning_rate", multiple_of=1, rotation=45)
In [121]:
all_cluster_results[curr_cluster]["learning_rate"] = perform_individual_sfm_analysis(
    lr_df_list,
    treatment="learning_rate",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=True,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.005, highlight_size=200, multiple_of=1),
    statistical_test_dict=st_tests_4

)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
------ Treatment has no residuals -  Cannot apply Single-factor model -------

Shapiro-Wilk p-value = 1.831652696182573e-07. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 3.6365166433682816e-09. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 3.6365166433682816e-09. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
/Users/juliano.garcia/miniconda3/envs/boosting/lib/python3.6/site-packages/scipy/stats/morestats.py:2352: RuntimeWarning: divide by zero encountered in double_scalars
  W = numer / denom
W pval equal_var
levene inf 0.0 False
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene inf 0.0 False
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 1.519516e+28 6.581044e-29 False
ANOVA test - Metric: auc_evaluator__target
Source SS DF MS F p-unc np2
0 learning_rate 0.001 22 0.0 27.531 0.0356127 0.997
1 Within 0.000 2 0.0 - - -
ANOVA test - Metric: brier_score_evaluator__target
Source SS DF MS F p-unc np2
0 learning_rate 0.061 22 0.003 541.364 0.00184533 1
1 Within 0.000 2 0.000 - - -
ANOVA test - Metric: logloss_evaluator__target
Source SS DF MS F p-unc np2
0 learning_rate 0.591 22 0.027 191.857 0.00519743 1
1 Within 0.000 2 0.000 - - -
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=0.0, pvalue=1.0)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=2.666666666666665, pvalue=0.10247043485974916)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=2.666666666666665, pvalue=0.10247043485974916)

Multiple Treatments

Max Depth and Learning Rate

In [122]:
diff_clause = ("diff_max_depth", "diff_lr")
max_depth_lr_list = grouped_experiment(
    cluster=cluster_4,
    query_string=max_depth_lr_pred,
    name="max_depth_lr",
    diff_hyperparam_fn=max_depth_lr_diff,
    diff_clause=diff_clause,
)

# create new treatment for all results
max_depth_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["max_depth", "learning_rate"],
            treatment_name_output="max_depth_lr",
            order_strategy="rank",
        ),
        max_depth_lr_list,
    )
)
(1000, 18)
In [123]:
plot_deltas(max_depth_lr_list, treatment="max_depth_lr", multiple_of=15, rotation=90)
In [124]:
all_cluster_results[curr_cluster]["max_depth_lr"] = perform_individual_sfm_analysis(
    max_depth_lr_list,
    treatment="max_depth_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.5, highlight_size=50, multiple_of=15, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_4
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 9.831066270845763e-33. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 9.176373442839894e-28. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 2.758600865786558e-30. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.573 0.999962 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.419 1.0 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 0.588 0.999916 True
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=90.78502869442515, pvalue=0.7098868380095777)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=212.43190290430337, pvalue=2.899170199132208e-10)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=201.48393260616163, pvalue=5.5482956205515515e-09)

Max Depth and Num Estimators

In [125]:
diff_clause = ("diff_max_depth", "diff_num_est")
max_depth_num_est_list = grouped_experiment(
    cluster=cluster_4,
    query_string=max_depth_num_est_pred,
    name="max_depth_num_est",
    diff_hyperparam_fn=max_depth_num_est_diff,
    diff_clause=diff_clause,
)

# create new treatment for all results
max_depth_num_est_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df,
            columns=["max_depth", "num_estimators"],
            treatment_name_output="max_depth_num_est",
            order_strategy="rank",
        ),
        max_depth_num_est_list,
    )
)
(4000, 18)
In [126]:
plot_deltas(max_depth_num_est_list, treatment="max_depth_num_est", multiple_of=10, rotation=90)
In [127]:
all_cluster_results[curr_cluster]["max_depth_num_est"] = perform_individual_sfm_analysis(
    max_depth_num_est_list,
    treatment="max_depth_num_est",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1, highlight_size=30, multiple_of=10, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_4
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 3.372528836164431e-39. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 4.203895392974451e-45. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.435 1.0 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.208 1.0 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 0.486 1.0 True
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=339.2970243268268, pvalue=0.986281599326002)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=97.55001096543852, pvalue=1.0)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=154.25820639226663, pvalue=1.0)

Num Estimators and Learning Rate

In [128]:
diff_clause = ("diff_num_est", "diff_lr")
num_est_lr_list = grouped_experiment(
    cluster=cluster_4,
    query_string=num_est_lr_pred,
    name="num_est_lr",
    diff_hyperparam_fn=num_est_lr_diff,
    diff_clause=diff_clause,
)

# create new treatment for all results
num_est_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["num_estimators", "learning_rate"],
            treatment_name_output="num_est_lr",
            order_strategy="rank",
        ),
        num_est_lr_list,
    )
)
(1000, 18)
In [129]:
plot_deltas(num_est_lr_list, treatment="num_est_lr", multiple_of=20, rotation=90)
In [130]:
all_cluster_results[curr_cluster]["num_est_lr"] = perform_individual_sfm_analysis(
    num_est_lr_list,
    treatment="num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.7, highlight_size=50, multiple_of=20, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_4
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 1.4393615174123851e-30. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 1.2179191268417695e-29. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 2.4045491119171254e-26. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.627 0.999498 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.658 0.998411 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 0.751 0.977231 True
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=24.00130169163542, pvalue=0.9999999999999996)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=198.99178651367794, pvalue=1.0663779734024478e-08)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=202.30317593403936, pvalue=4.4689893467934435e-09)

Max Depth, Num Estimators and Learning Rate

In [131]:
diff_clause = ("diff_max_depth", "diff_num_est", "diff_lr")
max_depth_num_est_lr_list = grouped_experiment(
    cluster=cluster_4,
    query_string=max_depth_num_est_lr_pred,
    name="max_depth_num_est_lr",
    diff_hyperparam_fn=max_depth_num_est_lr_diff,
    diff_clause=diff_clause,
)

# create new treatment for all results
max_depth_num_est_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["max_depth", "num_estimators", "learning_rate"],
            treatment_name_output="max_depth_num_est_lr",
            order_strategy="rank",
        ),
        max_depth_num_est_lr_list,
    )
)
(20000, 19)
In [132]:
plot_deltas(max_depth_num_est_lr_list, treatment="max_depth_num_est_lr", scale=.7, multiple_of=200, cap_outliers=True, rotation=90, fontsize=15)
In [133]:
all_cluster_results[curr_cluster]["max_depth_num_est_lr"] = perform_individual_sfm_analysis(
    max_depth_num_est_lr_list,
    treatment="max_depth_num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1.0, highlight_size=30, multiple_of=200, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_4
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.608 1.0 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.662 1.0 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 0.725 1.0 True
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=520.7314796215171, pvalue=1.0)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=4002.1631696522404, pvalue=2.812654964692408e-136)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=4155.8783421483795, pvalue=2.5010470783739023e-153)
In [134]:
del cluster_4
gc.collect()
Out[134]:
1729047

Cluster 5

In [135]:
cluster_5 = pickle.load(open("cluster_5.pkl", "rb"))
cluster_5.shape
print("Number of datasets:", len(cluster_5.did.unique()))
curr_cluster = 5
Out[135]:
(63480, 13)
Number of datasets: 12
In [136]:
st_tests_5 = defaultdict(dict)

Individual treatments

Num estimators

In [137]:
num_est_df_list = grouped_experiment(
    cluster=cluster_5, query_string=individual_num_est_pred, name="num_est", diff_hyperparam_fn=individual_num_est_diff
)
(480, 17)
In [138]:
plot_deltas(num_est_df_list, treatment="num_estimators", multiple_of=10, rotation=90)
In [139]:
all_cluster_results[curr_cluster]["num_estimators"] = perform_individual_sfm_analysis(
    num_est_df_list,
    treatment="num_estimators",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    statistical_test_dict=st_tests_5
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 2.727655883279223e-13. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 2.3163783391400017e-11. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 5.203109609830392e-10. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.407 0.999992 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.992 0.508515 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 1.61 0.005822 False
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=23.034257944485756, pvalue=0.813848923944241)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=7.55816240299047, pvalue=0.9999895813836994)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=5.545052419855335, pvalue=0.9999997464184317)

Max depth

In [140]:
max_depth_df_list = grouped_experiment(
    cluster=cluster_5,
    query_string=individual_max_depth_pred,
    name="max_depth",
    diff_hyperparam_fn=individual_max_depth_diff,
)
(480, 17)
In [141]:
plot_deltas(max_depth_df_list, treatment="max_depth", multiple_of=1)
In [142]:
all_cluster_results[curr_cluster]["max_depth"] = perform_individual_sfm_analysis(
    max_depth_df_list,
    treatment="max_depth",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=0.3, highlight_size=400),
    statistical_test_dict=st_tests_5
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 1.0566748492011704e-27. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 5.6355290918896e-28. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 1.1003471403235982e-27. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.191 0.999937 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.185 0.999951 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 0.202 0.999904 True
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=21.125166655182287, pvalue=0.3299261830545254)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=50.18929982045118, pvalue=0.00012291164120722053)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=35.32664373843109, pvalue=0.01275361758982555)

Learning Rate

In [143]:
lr_df_list = grouped_experiment(
    cluster=cluster_5, query_string=individual_lr_pred, name="lr", diff_hyperparam_fn=individual_lr_diff
)
(120, 17)
In [144]:
plot_deltas(lr_df_list, treatment="learning_rate", multiple_of=5, rotation=45)
In [145]:
all_cluster_results[curr_cluster]["learning_rate"] = perform_individual_sfm_analysis(
    lr_df_list,
    treatment="learning_rate",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=True,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.003, highlight_size=200, multiple_of=5),
    statistical_test_dict=st_tests_5

)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
------ Treatment has no residuals -  Cannot apply Single-factor model -------

Shapiro-Wilk p-value = 6.518039688714616e-13. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 3.448694644745884e-14. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 1.1609338707377348e-13. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.247 0.999857 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.321 0.998301 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 0.264 0.999716 True
ANOVA test - Metric: auc_evaluator__target
Source SS DF MS F p-unc np2
0 learning_rate 0.020 44 0.0 5.1 0.00066276 0.937
1 Within 0.001 15 0.0 - - -
ANOVA test - Metric: brier_score_evaluator__target
Source SS DF MS F p-unc np2
0 learning_rate 0.033 44 0.001 28.117 7.06987e-09 0.988
1 Within 0.000 15 0.000 - - -
ANOVA test - Metric: logloss_evaluator__target
Source SS DF MS F p-unc np2
0 learning_rate 0.248 44 0.006 6.018 0.000244743 0.946
1 Within 0.014 15 0.001 - - -
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=12.001207729468597, pvalue=0.017342286509862016)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=16.94166666666667, pvalue=0.0019840472937602757)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=16.76893939393939, pvalue=0.0021432893857816653)

Multiple Treatments

Max Depth and Learning Rate

In [146]:
diff_clause = ("diff_max_depth", "diff_lr")
max_depth_lr_list = grouped_experiment(
    cluster=cluster_5,
    query_string=max_depth_lr_pred,
    name="max_depth_lr",
    diff_hyperparam_fn=max_depth_lr_diff,
    diff_clause=diff_clause,
)

# create new treatment for all results
max_depth_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["max_depth", "learning_rate"],
            treatment_name_output="max_depth_lr",
            order_strategy="rank",
        ),
        max_depth_lr_list,
    )
)
(2400, 18)
In [147]:
plot_deltas(max_depth_lr_list, treatment="max_depth_lr", multiple_of=50, rotation=90)
In [148]:
all_cluster_results[curr_cluster]["max_depth_lr"] = perform_individual_sfm_analysis(
    max_depth_lr_list,
    treatment="max_depth_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.5, highlight_size=50, multiple_of=50, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_5
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 2.3715745229534633e-30. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 1.0799779238582079e-39. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 3.393346335573849e-36. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.224 1.0 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.569 1.0 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 0.486 1.0 True
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=563.1634405126533, pvalue=3.2814490033108474e-28)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=359.70394069600377, pvalue=7.005511102826335e-07)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=357.0345473723235, pvalue=1.1157110518754798e-06)

Max Depth and Num Estimators

In [149]:
diff_clause = ("diff_max_depth", "diff_num_est")
max_depth_num_est_list = grouped_experiment(
    cluster=cluster_5,
    query_string=max_depth_num_est_pred,
    name="max_depth_num_est",
    diff_hyperparam_fn=max_depth_num_est_diff,
    diff_clause=diff_clause,
)

# create new treatment for all results
max_depth_num_est_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df,
            columns=["max_depth", "num_estimators"],
            treatment_name_output="max_depth_num_est",
            order_strategy="rank",
        ),
        max_depth_num_est_list,
    )
)
(9600, 18)
In [150]:
plot_deltas(max_depth_num_est_list, treatment="max_depth_num_est", multiple_of=25, rotation=90)
In [151]:
all_cluster_results[curr_cluster]["max_depth_num_est"] = perform_individual_sfm_analysis(
    max_depth_num_est_list,
    treatment="max_depth_num_est",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1, highlight_size=30, multiple_of=25, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_5
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 2.802596928649634e-45. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 4.049752561898721e-43. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.411 1.0 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.881 0.998137 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 2.263 6.341819e-85 False
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=308.1723963216093, pvalue=1.0)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=270.7711345687177, pvalue=1.0)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=291.8148922312212, pvalue=1.0)

Num Estimators and Learning Rate

In [152]:
diff_clause = ("diff_num_est", "diff_lr")
num_est_lr_list = grouped_experiment(
    cluster=cluster_5,
    query_string=num_est_lr_pred,
    name="num_est_lr",
    diff_hyperparam_fn=num_est_lr_diff,
    diff_clause=diff_clause,
)

# create new treatment for all results
num_est_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["num_estimators", "learning_rate"],
            treatment_name_output="num_est_lr",
            order_strategy="rank",
        ),
        num_est_lr_list,
    )
)
(2400, 18)
In [153]:
plot_deltas(num_est_lr_list, treatment="num_est_lr", multiple_of=50, rotation=90)
In [154]:
all_cluster_results[curr_cluster]["num_est_lr"] = perform_individual_sfm_analysis(
    num_est_lr_list,
    treatment="num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.7, highlight_size=50, multiple_of=50, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_5
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 1.401298464324817e-45. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.156 1.0 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.565 1.0 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 0.567 1.0 True
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=335.8833313447127, pvalue=1.6632132589091608e-08)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=409.0136802114733, pvalue=7.634533258312381e-16)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=405.9054563728135, pvalue=1.6836721419703596e-15)

Max Depth, Num Estimators and Learning Rate

In [155]:
diff_clause = ("diff_max_depth", "diff_num_est", "diff_lr")
max_depth_num_est_lr_list = grouped_experiment(
    cluster=cluster_5,
    query_string=max_depth_num_est_lr_pred,
    name="max_depth_num_est_lr",
    diff_hyperparam_fn=max_depth_num_est_lr_diff,
    diff_clause=diff_clause,
)

# create new treatment for all results
max_depth_num_est_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["max_depth", "num_estimators", "learning_rate"],
            treatment_name_output="max_depth_num_est_lr",
            order_strategy="rank",
        ),
        max_depth_num_est_lr_list,
    )
)
(48000, 19)
In [156]:
plot_deltas(max_depth_num_est_lr_list, treatment="max_depth_num_est_lr", scale=.7, multiple_of=400, cap_outliers=True, rotation=90, fontsize=15)
In [157]:
all_cluster_results[curr_cluster]["max_depth_num_est_lr"] = perform_individual_sfm_analysis(
    max_depth_num_est_lr_list,
    treatment="max_depth_num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1.0, highlight_size=30, multiple_of=450, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_5
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.154 1.0 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.564 1.0 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 0.792 1.0 True
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=6230.126585556639, pvalue=1.2520961615352599e-92)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=7971.974493586129, pvalue=1.0950526833361547e-251)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=8169.19716833423, pvalue=8.787394265494155e-273)
In [158]:
del cluster_5
gc.collect()
Out[158]:
6168473

Cluster 6

In [296]:
cluster_6 = pickle.load(open("cluster_6.pkl", "rb"))
cluster_6.shape
print("Number of datasets:", len(cluster_6.did.unique()))
curr_cluster = 6
Out[296]:
(79350, 13)
Number of datasets: 15
In [297]:
to_remove_6 = [1069]
cluster_6 = cluster_6.query("did not in @to_remove_6")
In [298]:
st_tests_6 = defaultdict(dict)

Individual treatments

Num estimators

In [162]:
num_est_df_list = grouped_experiment(
    cluster=cluster_6, query_string=individual_num_est_pred, name="num_est", diff_hyperparam_fn=individual_num_est_diff
)
(560, 17)
In [163]:
plot_deltas(num_est_df_list, treatment="num_estimators", multiple_of=3)
In [164]:
all_cluster_results[curr_cluster]["num_estimators"] = perform_individual_sfm_analysis(
    num_est_df_list,
    treatment="num_estimators",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    statistical_test_dict=st_tests_6
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 5.555299465885219e-15. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
------ Treatment has no residuals -  Cannot apply Single-factor model -------

Shapiro-Wilk p-value = 4.446019561624853e-13. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 5.699214175271987e-15. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 1.392 0.073311 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 1.619 0.016954 False
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 2.245 0.00013 False
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=50.85164651209711, pvalue=9.810169022173583e-05)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=68.8504365366667, pvalue=1.426687289385142e-07)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=124.9472351169389, pvalue=1.3096694700133999e-17)

Max depth

In [165]:
max_depth_df_list = grouped_experiment(
    cluster=cluster_6,
    query_string=individual_max_depth_pred,
    name="max_depth",
    diff_hyperparam_fn=individual_max_depth_diff,
)
(560, 17)
In [166]:
plot_deltas(max_depth_df_list, treatment="max_depth", multiple_of=1)
In [167]:
all_cluster_results[curr_cluster]["max_depth"] = perform_individual_sfm_analysis(
    max_depth_df_list,
    treatment="max_depth",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=0.3, highlight_size=400),
    statistical_test_dict=st_tests_6
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 8.323186449279008e-18. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 8.587675795781614e-18. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 8.382107814712916e-16. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.489 0.965933 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.632 0.880817 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 0.781 0.729366 True
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=23.225483683108557, pvalue=0.22755618937135969)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=31.308793668543096, pvalue=0.03733077394442036)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=24.609169025374687, pvalue=0.1738129806878052)

Learning Rate

In [168]:
lr_df_list = grouped_experiment(
    cluster=cluster_6, query_string=individual_lr_pred, name="lr", diff_hyperparam_fn=individual_lr_diff
)
(140, 17)
In [169]:
plot_deltas(lr_df_list, treatment="learning_rate", multiple_of=5, rotation=45)
In [170]:
all_cluster_results[curr_cluster]["learning_rate"] = perform_individual_sfm_analysis(
    lr_df_list,
    treatment="learning_rate",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=True,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.005, highlight_size=200, multiple_of=1),
    statistical_test_dict=st_tests_6

)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 8.634466590619656e-17. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
------ Treatment has no residuals -  Cannot apply Single-factor model -------

Shapiro-Wilk p-value = 1.0. CANNOT REJECT
Residuals probplot: logloss_evaluator__target
/Users/juliano.garcia/miniconda3/envs/boosting/lib/python3.6/site-packages/scipy/stats/morestats.py:1657: UserWarning: Input data for shapiro has range zero. The results may not be accurate.
  warnings.warn("Input data for shapiro has range zero. The results "
Shapiro-Wilk p-value = 5.394156741626009e-16. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene NaN NaN False
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene NaN NaN False
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene NaN NaN False
ANOVA test - Metric: auc_evaluator__target
Source SS DF MS F p-unc np2
0 learning_rate 0.004 69 0 - - 1
1 Within 0.000 0 - - - -
ANOVA test - Metric: brier_score_evaluator__target
Source SS DF MS F p-unc np2
0 learning_rate 0.017 69 0 - - 1
1 Within 0.000 0 - - - -
ANOVA test - Metric: logloss_evaluator__target
Source SS DF MS F p-unc np2
0 learning_rate 0.168 69 0.002 - - 1
1 Within 0.000 0 - - - -
Kruskal-wallis test - Metric: auc_evaluator__target
--- Not enough Samples in treatment group ---

Kruskal-wallis test - Metric: brier_score_evaluator__target
--- Not enough Samples in treatment group ---

Kruskal-wallis test - Metric: logloss_evaluator__target
--- Not enough Samples in treatment group ---

Multiple Treatments

Max Depth and Learning Rate

In [171]:
diff_clause = ("diff_max_depth", "diff_lr")
max_depth_lr_list = grouped_experiment(
    cluster=cluster_6,
    query_string=max_depth_lr_pred,
    name="max_depth_lr",
    diff_hyperparam_fn=max_depth_lr_diff,
    diff_clause=diff_clause,
)

# create new treatment for all results
max_depth_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["max_depth", "learning_rate"],
            treatment_name_output="max_depth_lr",
            order_strategy="rank",
        ),
        max_depth_lr_list,
    )
)
(2800, 18)
In [172]:
plot_deltas(max_depth_lr_list, treatment="max_depth_lr", multiple_of=20, rotation=90)
In [173]:
all_cluster_results[curr_cluster]["max_depth_lr"] = perform_individual_sfm_analysis(
    max_depth_lr_list,
    treatment="max_depth_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.5, highlight_size=50, multiple_of=15, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_6
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 8.691273465281813e-41. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 7.470919066461402e-39. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 1.5974802493302915e-43. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.762 0.99905 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 0.538 1.0 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 0.672 0.999998 True
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=343.5288011629896, pvalue=0.0003351862674642278)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=444.4501908648193, pvalue=6.020948338601525e-12)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=480.2402467089576, pvalue=1.947958201110195e-15)

Max Depth and Num Estimators

In [174]:
diff_clause = ("diff_max_depth", "diff_num_est")
max_depth_num_est_list = grouped_experiment(
    cluster=cluster_6,
    query_string=max_depth_num_est_pred,
    name="max_depth_num_est",
    diff_hyperparam_fn=max_depth_num_est_diff,
    diff_clause=diff_clause,
)

# create new treatment for all results
max_depth_num_est_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df,
            columns=["max_depth", "num_estimators"],
            treatment_name_output="max_depth_num_est",
            order_strategy="rank",
        ),
        max_depth_num_est_list,
    )
)
(11200, 18)
In [175]:
plot_deltas(max_depth_num_est_list, treatment="max_depth_num_est", multiple_of=20, rotation=90)
In [176]:
all_cluster_results[curr_cluster]["max_depth_num_est"] = perform_individual_sfm_analysis(
    max_depth_num_est_list,
    treatment="max_depth_num_est",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1, highlight_size=30, multiple_of=20, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_6
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 6.459985920537407e-43. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.982 0.624875 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 2.4 9.233229e-71 False
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 3.428 3.935870e-148 False
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=308.4987603835679, pvalue=0.9997299289361279)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=979.7819367341262, pvalue=1.0145057100151009e-50)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=1925.6932756193842, pvalue=5.38352391645615e-198)

Num Estimators and Learning Rate

In [177]:
diff_clause = ("diff_num_est", "diff_lr")
num_est_lr_list = grouped_experiment(
    cluster=cluster_6,
    query_string=num_est_lr_pred,
    name="num_est_lr",
    diff_hyperparam_fn=num_est_lr_diff,
    diff_clause=diff_clause,
)

# create new treatment for all results
num_est_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["num_estimators", "learning_rate"],
            treatment_name_output="num_est_lr",
            order_strategy="rank",
        ),
        num_est_lr_list,
    )
)
(2800, 18)
In [178]:
plot_deltas(num_est_lr_list, treatment="num_est_lr", multiple_of=15, rotation=90)
In [179]:
all_cluster_results[curr_cluster]["num_est_lr"] = perform_individual_sfm_analysis(
    num_est_lr_list,
    treatment="num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=.7, highlight_size=50, multiple_of=15, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_6
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 1.401298464324817e-45. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 1.9165559096570523e-41. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.668 0.999997 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 1.055 0.286383 True
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 2.225 6.187968e-16 False
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=305.84677365628073, pvalue=0.0025754266802920523)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=312.90593258396194, pvalue=0.00107064557981089)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=354.1822196959263, pvalue=2.237852042493134e-06)

Max Depth, Num Estimators and Learning Rate

In [299]:
diff_clause = ("diff_max_depth", "diff_num_est", "diff_lr")
max_depth_num_est_lr_list = grouped_experiment(
    cluster=cluster_6,
    query_string=max_depth_num_est_lr_pred,
    name="max_depth_num_est_lr",
    diff_hyperparam_fn=max_depth_num_est_lr_diff,
    diff_clause=diff_clause,
)

# create new treatment for all results
max_depth_num_est_lr_list = list(
    map(
        lambda df: create_multiple_treatment_column(
            df=df.assign(learning_rate=lambda df: round(df.learning_rate, 3)),
            columns=["max_depth", "num_estimators", "learning_rate"],
            treatment_name_output="max_depth_num_est_lr",
            order_strategy="rank",
        ),
        max_depth_num_est_lr_list,
    )
)
(56000, 19)
In [300]:
plot_deltas(max_depth_num_est_lr_list, treatment="max_depth_num_est_lr", scale=.7, multiple_of=400, cap_outliers=True, rotation=90, fontsize=15)
In [301]:
all_cluster_results[curr_cluster]["max_depth_num_est_lr"] = perform_individual_sfm_analysis(
    max_depth_num_est_lr_list,
    treatment="max_depth_num_est_lr",
    run_residuals=True,
    run_homoscedasticity=True,
    run_anova=False,
    run_kruskal=True,
    run_kruskal_posthoc=False,
    sfm_plot_args=dict(residuals_width=1.0, highlight_size=30, multiple_of=400, fontsize=15, rotation=90),
    statistical_test_dict=st_tests_6
)
Shapiro-Wilk test for normality - A p-value less than 0.05 indicates that the sample is unlikely to come from a normal distribution

Kruskal-Wallis test for treatment mean equality - With p-value less than 0.05 we reject the hypothesis that the means are equal
Residuals probplot: auc_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: brier_score_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY
Residuals probplot: logloss_evaluator__target
Shapiro-Wilk p-value = 0.0. REJECT NORMALITY

Homoscedasticity - Metric: auc_evaluator__target
W pval equal_var
levene 0.351 1.0 True
Homoscedasticity - Metric: brier_score_evaluator__target
W pval equal_var
levene 1.222 5.205595e-22 False
Homoscedasticity - Metric: logloss_evaluator__target
W pval equal_var
levene 5.606 0.0 False
Kruskal-wallis test - Metric: auc_evaluator__target
KruskalResult(statistic=7851.294316177966, pvalue=2.705704100964987e-150)

Kruskal-wallis test - Metric: brier_score_evaluator__target
KruskalResult(statistic=4966.176739551213, pvalue=0.06798990455329582)

Kruskal-wallis test - Metric: logloss_evaluator__target
KruskalResult(statistic=7007.331698082554, pvalue=6.851410919346193e-86)
In [302]:
del cluster_6
gc.collect()
Out[302]:
4991574

Significance proportion

In [193]:
# open("all_cluster_results.pkl", "wb").write(cloudpickle.dumps(all_cluster_results))
all_cluster_results = cloudpickle.load(open("all_cluster_results.pkl", "rb"))
all_cluster_results.keys()
Out[193]:
dict_keys([1, 2, 3, 4, 5, 6])
In [274]:
### mapping dicts
hyperparam_combination_map = dict(
    num_estimators="NE",
    max_depth="MD",
    learning_rate="LR",
    max_depth_lr="MD, LR",
    num_est_lr="LR, NE",
    max_depth_num_est="MD, NE",
    max_depth_num_est_lr="NE, MD, LR",
)

perf_metric_map = dict(auc="AUC", brier="Brier", logloss="Logloss")

### ordering dicts
hyperparam_combination_order = dict(
    num_estimators=1,
    max_depth=2,
    learning_rate=3,
    max_depth_lr=4,
    max_depth_num_est=5,
    num_est_lr=6,
    max_depth_num_est_lr=7,
)

perf_metric_order = dict(auc=1, brier=2, logloss=2)

Hyperparmeters Significance Proportion

In [198]:
def created_filtered_hyperparam_dict(clusters_results_d, key="num_estimators"):
    results = {}
    for cluster, c_dict in clusters_results_d.items():    
        curr_filt_dict = fp.keyfilter(lambda el: el == key, c_dict)
        if curr_filt_dict[key] !=  [{}, {}, {}]:
            results[cluster] = curr_filt_dict
    return results

def count_significance(filt_hp_dict, key="num_estimators", metric=None, normalize_by=1):
    if metric is None:
        metric = defaultdict(int)
    for hp_d in filt_hp_dict.values():
        for d in hp_d[key]:
            metric[d.get("metric", "NaN")] += 1
    
    return fp.valmap(lambda v: (v/normalize_by)*100, metric)
            
# filt_ne_dict = created_filtered_hyperparam_dict(all_cluster_results, key="num_estimators")
# each num_est analysis have all three metrics. dividing the count of each metric by six will give us the proportion of
# how much that metric was considered significant according to kruskll-wallis
# count_significance(filt_ne_dict, key="num_estimators", normalize_by=6)
In [199]:
for key in sorted(all_cluster_results[1].keys(), key=lambda x: x.count("_")):
    print(f"-------- {key} by metric proportion ---------")
    count_significance(
        created_filtered_hyperparam_dict(all_cluster_results, key=key),
        key=key, normalize_by=6
    )
-------- num_estimators by metric proportion ---------
Out[199]:
{'NaN': 66.66666666666666,
 'logloss_evaluator__target': 50.0,
 'auc_evaluator__target': 16.666666666666664,
 'brier_score_evaluator__target': 16.666666666666664}
-------- max_depth by metric proportion ---------
Out[199]:
{'NaN': 66.66666666666666,
 'brier_score_evaluator__target': 83.33333333333334,
 'logloss_evaluator__target': 66.66666666666666,
 'auc_evaluator__target': 33.33333333333333}
-------- learning_rate by metric proportion ---------
Out[199]:
{'auc_evaluator__target': 50.0,
 'brier_score_evaluator__target': 50.0,
 'logloss_evaluator__target': 50.0}
-------- max_depth_lr by metric proportion ---------
Out[199]:
{'auc_evaluator__target': 83.33333333333334,
 'brier_score_evaluator__target': 100.0,
 'logloss_evaluator__target': 100.0,
 'NaN': 16.666666666666664}
-------- num_est_lr by metric proportion ---------
Out[199]:
{'NaN': 33.33333333333333,
 'brier_score_evaluator__target': 100.0,
 'logloss_evaluator__target': 100.0,
 'auc_evaluator__target': 66.66666666666666}
-------- max_depth_num_est by metric proportion ---------
Out[199]:
{'NaN': 83.33333333333334,
 'logloss_evaluator__target': 66.66666666666666,
 'auc_evaluator__target': 16.666666666666664,
 'brier_score_evaluator__target': 33.33333333333333}
-------- max_depth_num_est_lr by metric proportion ---------
Out[199]:
{'auc_evaluator__target': 83.33333333333334,
 'brier_score_evaluator__target': 83.33333333333334,
 'logloss_evaluator__target': 100.0,
 'NaN': 33.33333333333333}