Продемонстрируем неустойчивость решения, получаемого при помощи деревьев решений, на примере датасета Iris (ирисы Фишера).

Ирисы Фишера состоят из данных о 150 экземплярах ириса, по 50 экземпляров трёх видов: Ирис щетинистый (Iris setosa), Ирис виргинский (Iris virginica) и Ирис разноцветный (Iris versicolor).

Для каждого экземпляра измерялись четыре характеристики (в сантиметрах):

Длина наружной доли околоцветника (англ. sepal length);
Ширина наружной доли околоцветника (англ. sepal width);
Длина внутренней доли околоцветника (англ. petal length);
Ширина внутренней доли околоцветника (англ. petal width).


from sklearn.datasets import load_iris
import pandas as pd

dataset = load_iris()
df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
df["target"] = dataset.target != 1  # 0 for setosa, 1 - versicolor, 2 - virginica


from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn import tree

# first set of points
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    df[dataset.feature_names], df["target"], random_state=0
)
clf1 = DecisionTreeClassifier(max_depth=3)
clf1.fit(x_train1, y_train1)

# second set of points
x_train2, x_test2, y_train2, y_test2 = train_test_split(
    df[dataset.feature_names], df["target"], random_state=42
)
clf2 = DecisionTreeClassifier(max_depth=3)
clf2.fit(x_train2, y_train2)

fn = ["sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)"]
cn = ["setosa", "versicolor", "virginica"]
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), dpi=100)
tree.plot_tree(clf1, feature_names=fn, class_names=cn, filled=True, ax=axes[0])
tree.plot_tree(clf2, feature_names=fn, class_names=cn, filled=True, ax=axes[1])
plt.show()


# first set of points
clf1 = DecisionTreeClassifier(max_depth=10, random_state=0)
clf1.fit(x_train1, y_train1)

# second set of points
clf2 = DecisionTreeClassifier(max_depth=10, random_state=42)
clf2.fit(x_train2, y_train2)


fn = ["sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)"]
cn = ["setosa", "versicolor", "virginica"]
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5), dpi=100)
tree.plot_tree(clf1, feature_names=fn, class_names=cn, filled=True, ax=axes[0])
tree.plot_tree(clf2, feature_names=fn, class_names=cn, filled=True, ax=axes[1])
plt.show()


# handson-ml
import numpy as np
from matplotlib.colors import ListedColormap


def plot_decision_boundary(
    clf, x, y, axes=[-1.5, 2.5, -1, 1.5], alpha=0.85, contour=True, bolded=False
):
    x1s = np.linspace(axes[0], axes[1], 100)
    x2s = np.linspace(axes[2], axes[3], 100)
    x1, x2 = np.meshgrid(x1s, x2s)
    x_new = np.c_[x1.ravel(), x2.ravel()]
    y_pred = clf.predict(x_new).reshape(x1.shape)
    custom_cmap = ListedColormap(["#FEE7D0", "#bea6ff", "#B8E1EC"])
    plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap)
    if contour:
        custom_cmap2 = ListedColormap(["#FEE7D0", "#5D5DA6", "#B8E1EC"])
        if bolded:
            custom_cmap2 = ListedColormap(["#FEE7D0", "#5D5DA6", "#000000"])
        plt.contour(x1, x2, y_pred, cmap=custom_cmap2)
    plt.plot(x[:, 0][y == 0], x[:, 1][y == 0], "D", c="#F9B041", alpha=alpha)
    plt.plot(x[:, 0][y == 1], x[:, 1][y == 1], "o", c="#2DA9E1", alpha=alpha)
    plt.axis(axes)
    plt.xlabel(r"$x_1$", fontsize=18)
    plt.ylabel(r"$x_2$", fontsize=18, rotation=0)


import sklearn

x, y = sklearn.datasets.make_moons(n_samples=500, noise=0.30, random_state=42)

plt.figure(figsize=(8, 6))
plt.plot(x[:, 0][y == 0], x[:, 1][y == 0], "D", c="#F9B041")
plt.plot(x[:, 0][y == 1], x[:, 1][y == 1], "o", c="#2DA9E1")
plt.show()


x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

plt.figure(figsize=(8, 6))
clf = DecisionTreeClassifier(max_depth=20, random_state=42)
clf.fit(x_train, y_train)
plot_decision_boundary(clf, x, y)
plt.title("Decision border", fontsize=14)
plt.show()


# first set of points
x_train1, x_test1, y_train1, y_test1 = train_test_split(x, y, random_state=1)
clf1 = DecisionTreeClassifier(max_depth=20, random_state=42)
clf1.fit(x_train1, y_train1)

# second set of points
x_train2, x_test2, y_train2, y_test2 = train_test_split(x, y, random_state=2)
clf2 = DecisionTreeClassifier(max_depth=20, random_state=42)
clf2.fit(x_train2, y_train2)

plt.figure(figsize=(16, 6))
plt.subplot(121)
plot_decision_boundary(clf1, x, y)
plt.title("Decision border 1", fontsize=14)
plt.subplot(122)
plot_decision_boundary(clf2, x, y)
plt.title("Decision border 2", fontsize=14)
plt.show()


# first set of points
clf1 = DecisionTreeClassifier(max_depth=1, random_state=42)
clf1.fit(x_train1, y_train1)

# second set of points
clf2 = DecisionTreeClassifier(max_depth=1, random_state=42)
clf2.fit(x_train2, y_train2)

plt.figure(figsize=(16, 6))
plt.subplot(121)
plot_decision_boundary(clf1, x, y)
plt.title("Decision border 1", fontsize=14)
plt.subplot(122)
plot_decision_boundary(clf2, x, y)
plt.title("Decision border 2", fontsize=14)
plt.show()


# first set of points
clf1 = DecisionTreeClassifier(max_depth=2, random_state=42)
clf1.fit(x_train1, y_train1)

# second set of points
clf2 = DecisionTreeClassifier(max_depth=2, random_state=42)
clf2.fit(x_train2, y_train2)

plt.figure(figsize=(16, 6))
plt.subplot(121)
plot_decision_boundary(clf1, x, y)
plt.title("Decision border 1", fontsize=14)
plt.subplot(122)
plot_decision_boundary(clf2, x, y)
plt.title("Decision border 2", fontsize=14)
plt.show()


x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

clf1 = DecisionTreeClassifier(max_depth=2, random_state=42)
clf1.fit(x_train, y_train)

clf2 = DecisionTreeClassifier(max_depth=20, random_state=42)
clf2.fit(x_train, y_train)

plt.figure(figsize=(16, 6))
plt.subplot(121)
plot_decision_boundary(clf1, x_train, y_train)
plt.title("Decision border, depth=2, train only", fontsize=14)
plt.subplot(122)
plot_decision_boundary(clf2, x_train, y_train)
plt.title("Decision border, depth=20, train only", fontsize=14)
plt.show()


np.random.seed(42)

num_points = 300
num_grid = 500
x_max = 3.14
plt.figure(figsize=(10, 6))


def get_sample(num_points, x_max, std=0.3, x_sample=None):
    if x_sample is None:
        x_sample = (np.random.rand(num_points) - 0.5) * 2 * x_max
    y_sample = np.cos(x_sample.flatten()) + np.random.randn(x_sample.shape[0]) * std
    return x_sample.reshape(-1, 1), y_sample


x_grid = np.linspace(-x_max, x_max, num_grid).reshape(-1, 1)
x_sample, y_sample = get_sample(num_points=num_points, x_max=x_max)
_, y_true = get_sample(num_points=num_points, x_max=x_max, std=0, x_sample=x_grid)

plt.scatter(x_sample, y_sample, c="#bea6ff", label="Noise")
plt.plot(x_grid, y_true, "b--", linewidth=4, label="Real func")
plt.xlabel("X")
plt.ylabel("Y")
plt.legend()
plt.show()


from sklearn.neighbors import KNeighborsRegressor

np.random.seed(42)

num_points = 30
num_models = 3
plt.figure(figsize=(24, 6))

model = KNeighborsRegressor(n_neighbors=1)
y_pred = np.zeros((num_models, num_grid))
sample_color = ["#00E134", "#FF9100", "#FF00B3"]
for model_num in range(num_models):
    x_sample, y_sample = get_sample(num_points=num_points, x_max=x_max)
    model.fit(x_sample, y_sample)
    y_pred[model_num] = model.predict(x_grid)
    _, y_true = get_sample(num_points=num_points, x_max=x_max, std=0, x_sample=x_grid)

    plt.subplot(1, 3, model_num + 1)
    plt.scatter(
        x_sample, y_sample, c=sample_color[model_num], label=f"sample {model_num+1}"
    )
    plt.plot(
        x_grid,
        y_pred[model_num],
        c=sample_color[model_num],
        alpha=0.8,
        label=f"model trained on sample {model_num+1}",
    )
    plt.plot(x_grid, y_true, "b--", linewidth=4, label="real mean")
    plt.xlabel("X")
    plt.ylabel("Y")
    plt.ylim(-1.5, 1.8)
    plt.legend(loc="lower center")


import matplotlib.gridspec as gridspec

num_models = 1000

for n_neighbors in [1, 3, 25]:
    model = KNeighborsRegressor(n_neighbors=n_neighbors)

    y_pred = np.zeros((num_models, num_grid))

    plt.figure(figsize=(10, 4))
    gs = gridspec.GridSpec(1, 2, width_ratios=[2, 1])
    plt.subplot(gs[0])

    for model_num in range(num_models):
        x_sample, y_sample = get_sample(num_points=num_points, x_max=x_max)
        model.fit(x_sample, y_sample)
        y_pred[model_num] = model.predict(x_grid)
        plt.plot(x_grid, y_pred[model_num], alpha=0.01, c="g", linewidth=5)

    _, y_true = get_sample(num_points=num_points, x_max=x_max, std=0, x_sample=x_grid)
    plt.plot(x_grid, y_true, c="b", linewidth=3, label="real mean")
    plt.axvline(x=x_grid[num_grid // 2], c="r", linewidth=1, label="X text point")
    plt.xlim((-x_max, x_max))
    plt.ylim((-1, 2))
    plt.xlabel("X")
    plt.ylabel("Y")
    plt.gca().set_title(f"{num_models} models: {n_neighbors} nearest neighbours")
    plt.legend(loc="upper right")

    plt.subplot(gs[1])
    var = y_pred[:, num_grid // 2].var()
    bias = np.abs(y_true[num_grid // 2] - y_pred[:, num_grid // 2].mean())
    plt.hist(
        y_pred[:, num_grid // 2],
        bins=15,
        color="g",
        alpha=0.5,
        orientation="horizontal",
        label=f"predictions: \nvar = {var:.2f}\nbias = {bias:.2f}",
    )
    plt.axhline(y=y_true[num_grid // 2], c="b", linewidth=3, label="real mean")
    plt.ylim((-1, 2))
    plt.xlabel("hist counts")
    plt.ylabel("Y")
    plt.gca().set_title(f"predictions at test point")
    plt.legend(loc="upper left")
    plt.tight_layout()
    plt.show()


plt.figure(figsize=(8, 6))

for i in range(1, 101):
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=i)
    clf = DecisionTreeClassifier(max_depth=20, random_state=0)
    clf.fit(x_train, y_train)
    plot_decision_boundary(clf, x, y, alpha=0.02, contour=False)
plt.show()


import pandas as pd
from sklearn.datasets import load_iris

dataset = load_iris()
df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
df["target"] = dataset.target != 1


import scipy.stats

cor_value, pval = scipy.stats.pearsonr(df["sepal length (cm)"], df["petal length (cm)"])
print(f"Correlation coefficient is {cor_value:.3f}")
print(f"P-value is {pval:.2e}")

Correlation coefficient is 0.872
P-value is 1.04e-47


?scipy.stats.pearsonr


import numpy as np


def pearsonr_ci(x, y, alpha=0.05):
    """calculate Pearson correlation along with the confidence interval using scipy and numpy
    Parameters
    ----------
    x, y : iterable object such as a list or np.array
      Input for correlation calculation
    alpha : float
      Significance level. 0.05 by default
    Returns
    -------
    r : float
      Pearson's correlation coefficient
    pval : float
      The corresponding p value
    lo, hi : float
      The lower and upper bound of confidence intervals
    """

    r, p = scipy.stats.pearsonr(x, y)
    r_z = np.arctanh(r)
    se = 1 / np.sqrt(x.size - 3)
    z = scipy.stats.norm.ppf(1 - alpha / 2)
    lo_z, hi_z = r_z - z * se, r_z + z * se
    lo, hi = np.tanh((lo_z, hi_z))
    return lo, hi


lo, hi = pearsonr_ci(df["sepal length (cm)"], df["petal length (cm)"])

print(f"Lower bound of confidence interval: {lo:.3f}")
print(f"Upper bound of confidence interval: {hi:.3f}")

Lower bound of confidence interval: 0.827
Upper bound of confidence interval: 0.906


def bootstrap_metric(x, y, metric_fn, samples_cnt=1000, random_state=42):
    np.random.seed(random_state)
    b_metric = np.zeros(samples_cnt)
    for it in range(samples_cnt):
        poses = np.random.choice(x.shape[0], size=x.shape[0], replace=True)

        x_boot = x[poses]
        y_boot = y[poses]
        m_val = metric_fn(x_boot, y_boot)
        b_metric[it] = m_val

    return b_metric


import matplotlib.pyplot as plt
import seaborn as sns

boot_cor = bootstrap_metric(
    x=df["sepal length (cm)"],
    y=df["petal length (cm)"],
    metric_fn=lambda x, y: scipy.stats.pearsonr(x, y)[0],
)

# plot histogram of the obtained values:
plt.figure(figsize=(10, 6))
sns.histplot(boot_cor)
plt.show()


alpha = 0.05
lo_2, hi_2 = np.quantile(boot_cor, q=[alpha / 2, 1 - alpha / 2])

print(f"Lower bound of confidence interval: {lo:.3f}, bootstap: {lo_2:.3f}")
print(f"Upper bound of confidence interval: {hi:.3f}, bootstap: {hi_2:.3f}")

Lower bound of confidence interval: 0.827, bootstap: 0.836
Upper bound of confidence interval: 0.906, bootstap: 0.903


heart_dataset = pd.read_csv(
    "https://edunet.kea.su/repo/EduNet-web_dependencies/datasets/heart.csv"
)


lo, hi = pearsonr_ci(heart_dataset["age"], heart_dataset["chol"])

print(f"Lower bound of confidence interval: {lo:.3f}")
print(f"Upper bound of confidence interval: {hi:.3f}")

Lower bound of confidence interval: 0.103
Upper bound of confidence interval: 0.319


cor_value, p_value = scipy.stats.pearsonr(heart_dataset["age"], heart_dataset["chol"])

print(f"Correlation coefficient is: {cor_value:.3f}")
print(f"P-value: {p_value:.2e}")

Correlation coefficient is: 0.214
P-value: 1.79e-04


boot_cor = bootstrap_metric(
    x=heart_dataset["age"],
    y=heart_dataset["chol"],
    metric_fn=lambda x, y: scipy.stats.pearsonr(x, y)[0],
)
lo_2, hi_2 = np.quantile(boot_cor, q=[alpha / 2, 1 - alpha / 2])

print(f"Lower bound of confidence interval(bootstap): {lo_2:.3f}")
print(f"Upper bound of confidence interval(bootstap): {hi_2:.3f}")

Lower bound of confidence interval(bootstap): 0.101
Upper bound of confidence interval(bootstap): 0.313


size = 1500
y = np.random.choice([0, 1], size=size, replace=True)
print(f"shape y: {y.shape}")
print(f"First 10 values: {y[0:10]}")

shape y: (1500,)
First 10 values: [0 0 0 0 0 1 1 1 1 0]


def guess_model(y_real, p):
    guessed = np.random.choice([True, False], size=size, replace=True, p=[p, 1 - p])
    y_pred = np.zeros_like(y_real)
    y_pred[guessed] = y_real[guessed]
    y_pred[~guessed] = 1 - y_real[~guessed]
    return y_pred


model1 = lambda y: guess_model(y, p=0.7)
model2 = lambda y: guess_model(y, p=0.7)
model3 = lambda y: guess_model(y, p=0.75)

np.random.seed(42)
y_pred1 = model1(y)
y_pred2 = model2(y)
y_pred3 = model3(y)


from sklearn.metrics import f1_score

qual1 = f1_score(y_true=y, y_pred=y_pred1)
qual2 = f1_score(y_true=y, y_pred=y_pred2)
qual3 = f1_score(y_true=y, y_pred=y_pred3)

print(f" qual1: {qual1:.3f}\n qual2: {qual2:.3f}\n qual3: {qual3:.3f}")

 qual1: 0.694
 qual2: 0.698
 qual3: 0.765


print(f"qual2 - qual1: {(qual2 - qual1):.3f}\nqual3 - qual2: {(qual3 - qual2):.3f}")

qual2 - qual1: 0.004
qual3 - qual2: 0.067


boot_f1score_m1 = bootstrap_metric(
    y, y_pred1, metric_fn=lambda x, y: f1_score(y_true=x, y_pred=y)
)
boot_f1score_m2 = bootstrap_metric(
    y, y_pred2, metric_fn=lambda x, y: f1_score(y_true=x, y_pred=y)
)
boot_f1score_m3 = bootstrap_metric(
    y, y_pred3, metric_fn=lambda x, y: f1_score(y_true=x, y_pred=y)
)


alpha = 0.10
print(
    "F1 score for the 1st model: ",
    np.quantile(boot_f1score_m1, q=[alpha / 2, 1 - alpha / 2]),
)
print(
    "F1 score for the 2st model: ",
    np.quantile(boot_f1score_m2, q=[alpha / 2, 1 - alpha / 2]),
)
print(
    "F1 score for the 3st model: ",
    np.quantile(boot_f1score_m3, q=[alpha / 2, 1 - alpha / 2]),
)

F1 score for the 1st model:  [0.6706605  0.71665785]
F1 score for the 2st model:  [0.67613252 0.71965134]
F1 score for the 3st model:  [0.74469469 0.78326985]


plt.figure(figsize=(16, 6))
sns.boxplot(
    y=np.concatenate([boot_f1score_m1, boot_f1score_m2, boot_f1score_m3]),
    x=["model1"] * 1000 + ["model2"] * 1000 + ["model3"] * 1000,
)
plt.ylabel("f1 score", size=20)
plt.tick_params(axis="both", which="major", labelsize=14)
plt.show()


from sklearn.model_selection import train_test_split

x = heart_dataset.drop("target", axis=1)
y = heart_dataset["target"] > 0
x_train, x_test, y_train, y_test = train_test_split(x, y.values, random_state=42)


from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

svc_model = GridSearchCV(
    SVC(), {"kernel": ("linear", "rbf"), "C": [0.01, 0.1, 1, 10]}
).fit(x_train, y_train)

logr_model = GridSearchCV(
    LogisticRegression(solver="liblinear", max_iter=100000),
    {"penalty": ("l1", "l2"), "C": [0.01, 0.1, 1, 10, 100]},
).fit(x_train, y_train)

# few objects in the leaf - poor estimates of class probabilities - the model is overfitting
dt_model = GridSearchCV(
    DecisionTreeClassifier(),
    {"max_depth": [1, 3, 5, 7, 10], "min_samples_leaf": [1, 3, 5, 10]},
).fit(x_train, y_train)


from sklearn.metrics import average_precision_score  # PR-AUC

y_pred1 = svc_model.decision_function(
    x_test
)  # by default, SVM gives score to each object instead of probabilities
y_pred2 = logr_model.predict_proba(x_test)[:, 1]
y_pred3 = dt_model.predict_proba(x_test)[:, 1]

qual1 = average_precision_score(y_true=y_test, y_score=y_pred1)
qual2 = average_precision_score(y_true=y_test, y_score=y_pred2)
qual3 = average_precision_score(y_true=y_test, y_score=y_pred3)


print(f"Logistic regression pr-auc: {qual1:.03f}")
print(f"SVC pr-auc: {qual2:.03f}")
print(f"DecisionTreeClassifier pr-auc: {qual3:.03f}")

Logistic regression pr-auc: 0.893
SVC pr-auc: 0.902
DecisionTreeClassifier pr-auc: 0.801


boot_score_logreg = bootstrap_metric(
    y_test, y_pred1, metric_fn=lambda x, y: average_precision_score(y_true=x, y_score=y)
)
boot_score_svc = bootstrap_metric(
    y_test, y_pred2, metric_fn=lambda x, y: average_precision_score(y_true=x, y_score=y)
)
boot_score_dt = bootstrap_metric(
    y_test, y_pred3, metric_fn=lambda x, y: average_precision_score(y_true=x, y_score=y)
)

alpha = 0.10
print(
    "Logistic regression pr-auc 90%-ci: ",
    np.quantile(boot_score_logreg, q=[alpha / 2, 1 - alpha / 2]),
)
print("SVC pr-auc 90%-ci:", np.quantile(boot_score_svc, q=[alpha / 2, 1 - alpha / 2]))
print(
    "DecisionTreeClassifier pr-auc 90%-ci:",
    np.quantile(boot_score_dt, q=[alpha / 2, 1 - alpha / 2]),
)

Logistic regression pr-auc 90%-ci:  [0.80793003 0.97363727]
SVC pr-auc 90%-ci: [0.8276533  0.97115084]
DecisionTreeClassifier pr-auc 90%-ci: [0.70049707 0.90513571]


plt.figure(figsize=(16, 6))
sns.boxplot(
    y=np.concatenate([boot_score_logreg, boot_score_svc, boot_score_dt]),
    x=["Log-reg"] * 1000 + ["SVC"] * 1000 + ["DT"] * 1000,
)
plt.ylabel("PR-AUC", size=20)
plt.xlabel("Base models", size=20)
plt.tick_params(axis="both", which="major", labelsize=14)
plt.show()


import numpy as np


def get_signal(size, random_state=42):
    signal = np.random.choice([0, 1], size, replace=True)
    return signal


def compare_signs(sig1, sig2):
    return (sig1 != sig2).sum()


def add_noise(sig, noise_p=0.20):
    sig = sig.copy()
    changed = np.random.choice(
        [True, False], sig.shape[0], replace=True, p=[noise_p, 1 - noise_p]
    )
    sig[changed] = 1 - sig[changed]
    return sig


def average_signals(sigs):
    sig = np.mean(sigs, axis=0)
    sig = np.round(sig, 0)
    return sig


def send_signal(signal, tries):
    passed_sigs = [add_noise(signal) for _ in range(tries)]
    fin_signal = average_signals(passed_sigs)
    return fin_signal


import matplotlib.pyplot as plt

np.random.seed(42)
repeats = 1000
signals_cnt_rng = range(1, 30, 2)

signal = get_signal(10)
mistakes = np.zeros((repeats, len(signals_cnt_rng)))

for j, sig_cnt in enumerate(signals_cnt_rng):
    for i in range(repeats):
        rec_sig = send_signal(signal, sig_cnt)
        mistakes[i, j] = compare_signs(rec_sig, signal)


mn = mistakes.mean(axis=0)
sd = mistakes.std(axis=0)
plt.figure(figsize=(10, 5))
plt.title("Number of error in signal", fontsize=14)
plt.ylabel("Number of errors", fontsize=14)
plt.xlabel("Number of signals passed at once", fontsize=14)
plt.plot(signals_cnt_rng, mn)
plt.fill_between(signals_cnt_rng, mn - sd, mn + sd, facecolor="blue", alpha=0.1)
plt.show()


def get_predictions(y_real, p, cnt):
    size = y_real.shape[0]
    guessed = np.random.choice([True, False], (cnt, size), p=[p, 1 - p])
    y = np.repeat(y_real.reshape(1, -1), cnt, axis=0)
    y[~guessed] = 1 - y[~guessed]
    return y


import pandas as pd
import seaborn as sns

size = 1000
reps = 10

cnt_base_predictors = [1] + list(range(5, 105, 5))
single_qual = [0.45, 0.5, 0.51, 0.55, 0.6, 0.75, 0.9]

dt = {"cnt": [], "single_qual": [], "accuracy": []}

for i in range(reps):
    y_real = np.random.choice([0, 1], size)
    for cnt in cnt_base_predictors:
        for p in single_qual:
            preds = get_predictions(y_real, p, cnt)
            voting = np.round(preds.mean(axis=0))
            accuracy = (y_real == voting).mean()
            dt["cnt"].append(cnt)
            dt["single_qual"].append(f"{p:.02}")
            dt["accuracy"].append(accuracy)

results = pd.DataFrame(dt)

plt.figure(figsize=(16, 6))

sns.lineplot(data=results, x="cnt", y="accuracy", hue="single_qual", lw=3, alpha=0.5)
plt.xlabel("Number of base classifiers", size=20)
plt.ylabel("Accuracy", size=20)
plt.legend(loc="best", fontsize=12, title="Single classifier quality")
plt.show()


import scipy


def get_correlated_predictions(y_real, p, cnt, r):
    size = y_real.shape[0]
    x1 = np.random.uniform(0, 1, size)
    x2 = np.random.uniform(0, 1, (cnt, size))
    q = np.sqrt(r)
    y = (
        q * x1 + (1 - q**2) ** 0.5 * x2
    )  # y variables now correlated with correlation=r
    y_mod = np.zeros_like(y)
    for i in range(y.shape[0]):
        y_mod[i] = scipy.stats.rankdata(y[i])

    y = y_mod / size  # back to uniform, slightly affects correlations

    y_pred = np.repeat(y_real.reshape(1, -1), cnt, axis=0)
    y_pred[y < 1 - p] = 1 - y_pred[y < 1 - p]  # to predictions, affects correlations
    return y_pred


np.random.seed(42)
x = np.arange(0, 1, 0.05)
accuracy = np.zeros_like(x)
p = 0.7
cnt = 100
for ind, r in enumerate(x):
    preds = get_correlated_predictions(y_real, p, cnt, r)
    voting = np.round(preds.mean(axis=0))
    accuracy[ind] = (y_real == voting).mean()

plt.figure(figsize=(16, 6))
plt.title(f"Accuracy of {cnt} classifiers ensemble", size=20)
plt.xlabel("Correlation among classifiers", size=20)
plt.ylabel("Accuracy", size=20)
plt.axhline(y=p, color="red", lw=5, ls="--", label="Single classifier")
sns.lineplot(x=x, y=accuracy, lw=5, label="Ensemble")
plt.legend(fontsize=20)
plt.show()


import sklearn


def get_bootstrap_sample(x, y):
    size = x.shape[0]
    poses = np.random.choice(size, size=size, replace=True)
    x_boot = x[poses]
    y_boot = y[poses]
    return x_boot, y_boot


class BaggingBinaryClassifierEnsemble:
    def __init__(self, base_classifier, ensemble_size, random_state=42):
        self.base_classifier = base_classifier
        self.ensemble_size = ensemble_size
        self.random_state = random_state
        self.ensemble = []

    def fit(self, x, y):
        np.random.seed(self.random_state)
        for est_id in range(self.ensemble_size):
            x_boot, y_boot = get_bootstrap_sample(x, y)
            model = sklearn.clone(self.base_classifier)  # create new base model
            model.fit(x_boot, y_boot)
            self.ensemble.append(model)

    def predict_proba(self, x):
        if not self.ensemble:
            raise Exception("Unfitted model")

        y_pred = 0
        for est in self.ensemble:
            y_pred += est.predict(x)
        y_pred = y_pred / self.ensemble_size
        return y_pred

    def predict(self, x):
        y_proba = self.predict_proba(x)
        y_pred = np.round(y_proba)
        return y_pred


heart_dataset = pd.read_csv(
    "https://edunet.kea.su/repo/EduNet-web_dependencies/datasets/heart.csv"
)


from sklearn.model_selection import train_test_split

x = heart_dataset.drop("target", axis=1)
y = heart_dataset["target"] > 0
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)


from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

svc_model = GridSearchCV(
    SVC(), {"kernel": ("linear", "rbf"), "C": [0.01, 0.1, 1, 10]}
).fit(x_train, y_train)

logr_model = GridSearchCV(
    LogisticRegression(solver="liblinear", max_iter=100000),
    {"penalty": ("l1", "l2"), "C": [0.01, 0.1, 1, 10, 100]},
).fit(x_train, y_train)

# few objects in the leaf - poor estimates of class probabilities - the model is overtraining
dt_model = GridSearchCV(
    DecisionTreeClassifier(),
    {"max_depth": [1, 3, 5, 7, 10], "min_samples_leaf": [1, 3, 5, 10]},
).fit(x_train, y_train)


bagging_dt = BaggingBinaryClassifierEnsemble(
    dt_model.best_estimator_, ensemble_size=100
)

bagging_logreg = BaggingBinaryClassifierEnsemble(
    logr_model.best_estimator_, ensemble_size=100
)

bagging_svc = BaggingBinaryClassifierEnsemble(
    svc_model.best_estimator_, ensemble_size=100
)


bagging_dt.fit(x_train.values, y_train.values)
bagging_logreg.fit(x_train.values, y_train.values)
bagging_svc.fit(x_train.values, y_train.values)


y_pred_blr = bagging_logreg.predict_proba(x_test.values)
y_pred_bsvc = bagging_svc.predict_proba(x_test.values)
y_pred_bdt = bagging_dt.predict_proba(x_test.values)


from sklearn.metrics import average_precision_score  # PR-AUC

qual_blr = average_precision_score(y_true=y_test, y_score=y_pred_blr)
qual_bsvc = average_precision_score(y_true=y_test, y_score=y_pred_bsvc)
qual_bdt = average_precision_score(y_true=y_test, y_score=y_pred_bdt)
print(f"Bagged Logistic regression pr-auc: {qual_blr:.03f}")
print(f"Bagged SVC pr-auc: {qual_bsvc :.03f}")
print(f"Bagged DecisionTreeClassifier pr-auc: {qual_bdt:.03f}")

Bagged Logistic regression pr-auc: 0.872
Bagged SVC pr-auc: 0.900
Bagged DecisionTreeClassifier pr-auc: 0.898


def bootstrap_metric(x, y, metric_fn, samples_cnt=1000, alpha=0.05, random_state=42):
    size = len(x)
    np.random.seed(random_state)
    b_metric = np.zeros(samples_cnt)
    for it in range(samples_cnt):
        poses = np.random.choice(x.shape[0], size=x.shape[0], replace=True)

        x_boot = x[poses]
        y_boot = y[poses]

        m_val = metric_fn(x_boot, y_boot)
        b_metric[it] = m_val

    return b_metric


boot_score_blogreg = bootstrap_metric(
    y_test.values,
    y_pred_blr,
    metric_fn=lambda x, y: average_precision_score(y_true=x, y_score=y),
)
boot_score_bsvc = bootstrap_metric(
    y_test.values,
    y_pred_bsvc,
    metric_fn=lambda x, y: average_precision_score(y_true=x, y_score=y),
)
boot_score_bdt = bootstrap_metric(
    y_test.values,
    y_pred_bdt,
    metric_fn=lambda x, y: average_precision_score(y_true=x, y_score=y),
)
alpha = 0.10
print(
    "Bagged Logistic regression pr-auc 90%-ci: ",
    np.quantile(boot_score_blogreg, q=[alpha / 2, 1 - alpha / 2]),
)
print(
    "Bagged SVC pr-auc 90%-ci:",
    np.quantile(boot_score_bsvc, q=[alpha / 2, 1 - alpha / 2]),
)
print(
    "Bagged DecisionTreeClassifier pr-auc 90%-ci:",
    np.quantile(boot_score_bdt, q=[alpha / 2, 1 - alpha / 2]),
)

Bagged Logistic regression pr-auc 90%-ci:  [0.78262498 0.95069418]
Bagged SVC pr-auc 90%-ci: [0.81964811 0.9761168 ]
Bagged DecisionTreeClassifier pr-auc 90%-ci: [0.83053272 0.95963424]


y_pred1 = svc_model.decision_function(
    x_test
)  # by default, SVM gives score to each object instead of probabilities
y_pred2 = logr_model.predict_proba(x_test)[:, 1]
y_pred3 = dt_model.predict_proba(x_test)[:, 1]

qual1 = average_precision_score(y_true=y_test, y_score=y_pred1)
qual2 = average_precision_score(y_true=y_test, y_score=y_pred2)
qual3 = average_precision_score(y_true=y_test, y_score=y_pred3)


boot_score_logreg = bootstrap_metric(
    y_test.values,
    y_pred1,
    metric_fn=lambda x, y: average_precision_score(y_true=x, y_score=y),
)
boot_score_svc = bootstrap_metric(
    y_test.values,
    y_pred2,
    metric_fn=lambda x, y: average_precision_score(y_true=x, y_score=y),
)
boot_score_dt = bootstrap_metric(
    y_test.values,
    y_pred3,
    metric_fn=lambda x, y: average_precision_score(y_true=x, y_score=y),
)


plt.figure(figsize=(11, 6))

result_arrays = [
    boot_score_logreg,
    boot_score_svc,
    boot_score_dt,
    boot_score_blogreg,
    boot_score_bsvc,
    boot_score_bdt,
]
base_models = ["Log-Reg", "SVC", "DT"] * 2
ensemble_types = ["Single"] * 3 + ["Bagged"] * 3

dfs = []
for i, res in enumerate(result_arrays):
    df = pd.DataFrame(res, columns=["pr_auc"])
    df["base_model"] = base_models[i]
    df["ensemble_method"] = ensemble_types[i]
    dfs.append(df)

sns.boxplot(data=pd.concat(dfs), y="pr_auc", x="base_model", hue="ensemble_method")
plt.xlabel("Base models", size=20)
plt.ylabel("PR-AUC", size=20)
plt.legend(fontsize=20)
plt.tick_params(axis="both", which="major", labelsize=14)
plt.show()


from sklearn import datasets

x, y = sklearn.datasets.make_moons(n_samples=500, noise=0.30, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)


from matplotlib.colors import ListedColormap


def plot_decision_boundary(
    clf, x, y, axes=[-1.5, 2.5, -1, 1.5], alpha=0.85, contour=True, bolded=False
):
    x1s = np.linspace(axes[0], axes[1], 100)
    x2s = np.linspace(axes[2], axes[3], 100)
    x1, x2 = np.meshgrid(x1s, x2s)
    x_new = np.c_[x1.ravel(), x2.ravel()]
    y_pred = clf.predict(x_new).reshape(x1.shape)
    custom_cmap = ListedColormap(["#FEE7D0", "#bea6ff", "#B8E1EC"])
    plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap)
    if contour:
        custom_cmap2 = ListedColormap(["#FEE7D0", "#5D5DA6", "#B8E1EC"])
        if bolded:
            custom_cmap2 = ListedColormap(["#FEE7D0", "#5D5DA6", "#000000"])
        plt.contour(x1, x2, y_pred, cmap=custom_cmap2)
    plt.plot(x[:, 0][y == 0], x[:, 1][y == 0], "D", c="#F9B041", alpha=alpha)
    plt.plot(x[:, 0][y == 1], x[:, 1][y == 1], "o", c="#2DA9E1", alpha=alpha)
    plt.axis(axes)
    plt.xlabel(r"$x_1$", fontsize=18)
    plt.ylabel(r"$x_2$", fontsize=18, rotation=0)


plt.figure(figsize=(16, 6))
plt.subplot(121)
clf = DecisionTreeClassifier(max_depth=10, random_state=42)
clf.fit(x_train, y_train)
plot_decision_boundary(clf, x, y)
plt.title("Single Decision Tree", fontsize=14)

plt.subplot(122)
bagging_dt = BaggingBinaryClassifierEnsemble(
    DecisionTreeClassifier(max_depth=10), ensemble_size=100
)
bagging_dt.fit(x_train, y_train)
plot_decision_boundary(bagging_dt, x, y)
plt.title("Bagged Decision Tree", fontsize=14)
plt.show()


plt.figure(figsize=(16, 6))
plt.subplot(121)
plt.title("Decision borders of DT trained on different train datasets", fontsize=14)
for i in range(100):
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=i)
    clf = DecisionTreeClassifier(max_depth=20, random_state=42)
    clf.fit(x_train, y_train)
    plot_decision_boundary(clf, x, y, alpha=0.02, contour=False)

plt.subplot(122)
plt.title(
    "Decision borders of DT trained on different  bootstrap datasets", fontsize=14
)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
bagging_dt = BaggingBinaryClassifierEnsemble(
    DecisionTreeClassifier(max_depth=10), ensemble_size=100, random_state=42
)

bagging_dt.fit(x_train, y_train)

for base_dt in bagging_dt.ensemble:
    plot_decision_boundary(base_dt, x, y, alpha=0.02, contour=False)


def get_rsm_sample(x, y, f_num=None):
    size = x.shape[1]
    f_num = f_num or int(np.sqrt(size)) + 1
    f_num = min(size, f_num)

    f_poses = np.random.choice(size, size=f_num, replace=False)
    x_rsm = x[:, f_poses]
    y_rsm = y.copy()
    return x_rsm, y_rsm, f_poses


class RSMBinaryClassifierEnsemble:
    def __init__(
        self, base_classifier, ensemble_size, random_state=42, max_features=None
    ):
        self.base_classifier = base_classifier
        self.ensemble_size = ensemble_size
        self.random_state = random_state
        self.max_features = max_features

        self.ensemble = []
        self.feature_poses = []
        # we had to keep track of features selected. In sklearn Random Forest, discussed below,
        # another, more stable implementation is used.
        # they use `f_num` random features but in case no good split found, they try other features too.

    def fit(self, x, y):
        np.random.seed(self.random_state)
        for est_id in range(self.ensemble_size):
            x_boot, y_boot, f_poses = get_rsm_sample(x, y, f_num=self.max_features)
            self.feature_poses.append(f_poses)
            model = sklearn.clone(self.base_classifier)  # create new base model
            model.fit(x_boot, y_boot)
            self.ensemble.append(model)

    def predict_proba(self, x):
        if not self.ensemble:
            raise Exception("Unfitted model")

        y_pred = 0
        for ind, est in enumerate(self.ensemble):
            y_pred += est.predict(x[:, self.feature_poses[ind]])
        y_pred = y_pred / self.ensemble_size
        return y_pred

    def predict(self, x):
        y_proba = self.predict_proba(x)
        y_pred = np.round(y_proba)
        return y_pred


x = heart_dataset.drop("target", axis=1)
y = heart_dataset["target"] > 0
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

rsm_dt = RSMBinaryClassifierEnsemble(dt_model.best_estimator_, ensemble_size=100)

rsm_logreg = RSMBinaryClassifierEnsemble(logr_model.best_estimator_, ensemble_size=100)

rsm_svc = RSMBinaryClassifierEnsemble(svc_model.best_estimator_, ensemble_size=100)


rsm_dt.fit(x_train.values, y_train.values)
rsm_logreg.fit(x_train.values, y_train.values)
rsm_svc.fit(x_train.values, y_train.values)


y_pred_rlr = rsm_logreg.predict_proba(x_test.values)
y_pred_rsvc = rsm_svc.predict_proba(x_test.values)
y_pred_rdt = rsm_dt.predict_proba(x_test.values)


boot_score_rlogreg = bootstrap_metric(
    y_test.values,
    y_pred_rlr,
    metric_fn=lambda x, y: average_precision_score(y_true=x, y_score=y),
)

boot_score_rsvc = bootstrap_metric(
    y_test.values,
    y_pred_rsvc,
    metric_fn=lambda x, y: average_precision_score(y_true=x, y_score=y),
)

boot_score_rdt = bootstrap_metric(
    y_test.values,
    y_pred_rdt,
    metric_fn=lambda x, y: average_precision_score(y_true=x, y_score=y),
)

alpha = 0.10
print(
    "RSM Logistic regression pr-auc 90%-ci: ",
    np.quantile(boot_score_blogreg, q=[alpha / 2, 1 - alpha / 2]),
)
print(
    "RSM SVC pr-auc 90%-ci:", np.quantile(boot_score_bsvc, q=[alpha / 2, 1 - alpha / 2])
)
print(
    "RSM DecisionTreeClassifier pr-auc 90%-ci:",
    np.quantile(boot_score_bdt, q=[alpha / 2, 1 - alpha / 2]),
)

RSM Logistic regression pr-auc 90%-ci:  [0.78262498 0.95069418]
RSM SVC pr-auc 90%-ci: [0.81964811 0.9761168 ]
RSM DecisionTreeClassifier pr-auc 90%-ci: [0.83053272 0.95963424]


plt.figure(figsize=(14, 6))

result_arrays = [
    boot_score_logreg,
    boot_score_svc,
    boot_score_dt,
    boot_score_blogreg,
    boot_score_bsvc,
    boot_score_bdt,
    boot_score_rlogreg,
    boot_score_rsvc,
    boot_score_rdt,
]
base_models = ["Log-Reg", "SVC", "DT"] * 3
ensemble_types = ["Single"] * 3 + ["Bagged"] * 3 + ["RSM"] * 3

dfs = []
for i, res in enumerate(result_arrays):
    df = pd.DataFrame(res, columns=["pr_auc"])
    df["base_model"] = base_models[i]
    df["ensemble_method"] = ensemble_types[i]
    dfs.append(df)

sns.boxplot(data=pd.concat(dfs), y="pr_auc", x="base_model", hue="ensemble_method")
plt.xlabel("Base model", size=20)
plt.ylabel("PR-AUC", size=20)
plt.legend(fontsize=20)
plt.tick_params(axis="both", which="major", labelsize=14)
plt.show()


from sklearn.ensemble import BaggingClassifier

models = {}

logr_model = GridSearchCV(
    LogisticRegression(solver="liblinear", max_iter=100000),
    {"penalty": ("l1", "l2"), "C": [0.01, 0.1, 1, 10, 100]},
).fit(x_train, y_train)
models["LogReg"] = logr_model

svc_model = GridSearchCV(
    SVC(), {"kernel": ("linear", "rbf"), "C": [0.01, 0.1, 1, 10]}
).fit(x_train, y_train)
models["SVC"] = svc_model

# few objects in the leaf - poor estimates of class probabilities - the model is overtraining
dt_model = GridSearchCV(
    DecisionTreeClassifier(),
    {"max_depth": [1, 3, 5, 7, 10], "min_samples_leaf": [1, 3, 5, 10]},
).fit(x_train, y_train)
models["DT"] = dt_model

bagging_logr = BaggingClassifier(
    logr_model.best_estimator_, n_estimators=100, random_state=42
)
models["Bagging LogReg"] = bagging_logr

bagging_svc = BaggingClassifier(
    svc_model.best_estimator_, n_estimators=100, random_state=42
)
models["Bagging SVC"] = bagging_svc

bagging_dt = BaggingClassifier(
    dt_model.best_estimator_, n_estimators=100, random_state=42
)
models["Bagging DT"] = bagging_dt

sqrt_features = int(np.sqrt(x.shape[1])) + 1

rsm_logreg = BaggingClassifier(
    logr_model.best_estimator_,
    n_estimators=100,
    bootstrap=False,
    max_features=sqrt_features,
    random_state=42,
)
models["RSM LogReg"] = rsm_logreg

rsm_svc = BaggingClassifier(
    svc_model.best_estimator_,
    n_estimators=100,
    bootstrap=False,
    max_features=sqrt_features,
    random_state=42,
)
models["RSM SVC"] = rsm_svc

rsm_dt = BaggingClassifier(
    dt_model.best_estimator_,
    n_estimators=100,
    bootstrap=False,
    max_features=sqrt_features,
    random_state=42,
)
models["RSM DT"] = rsm_dt

# Both Bagging and RSM
bag_rsm_logreg = BaggingClassifier(
    logr_model.best_estimator_,
    n_estimators=100,
    bootstrap=True,
    max_features=sqrt_features,
    random_state=42,
)
models["BagRSM LogReg"] = bag_rsm_logreg


bag_rsm_svc = BaggingClassifier(
    svc_model.best_estimator_,
    n_estimators=100,
    bootstrap=True,
    max_features=sqrt_features,
    random_state=42,
)
models["BagRSM SVC"] = bag_rsm_svc

bag_rsm_dt = BaggingClassifier(
    dt_model.best_estimator_,
    n_estimators=100,
    bootstrap=True,
    max_features=sqrt_features,
    random_state=42,
)
models["BagRSM DT"] = bag_rsm_dt


for name, model in models.items():
    model.fit(x_train, y_train)


predictions = {}

for name, model in models.items():
    if name != "SVC":
        y_pred = model.predict_proba(x_test)[:, 1]
    else:
        y_pred = model.decision_function(x_test)
    predictions[name] = y_pred


boot_scores = {}

for name, y_pred in predictions.items():
    boot_score = bootstrap_metric(
        y_test.values,
        y_pred,
        metric_fn=lambda x, y: average_precision_score(y_true=x, y_score=y),
    )
    boot_scores[name] = boot_score


plt.figure(figsize=(16, 6))

base_models = ["Log-Reg", "SVC", "DT"] * 4
ensemble_types = ["Single"] * 3 + ["Bagged"] * 3 + ["RSM"] * 3 + ["BagRSM"] * 3

dfs = []
for i, model_name in enumerate(boot_scores):
    df = pd.DataFrame(boot_scores[model_name], columns=["pr_auc"])
    df["base_model"] = base_models[i]
    df["ensemble_method"] = ensemble_types[i]
    dfs.append(df)

sns.boxplot(data=pd.concat(dfs), y="pr_auc", x="base_model", hue="ensemble_method")
plt.xlabel("Base model", size=20)
plt.ylabel("PR-AUC", size=20)
plt.legend(fontsize=20, loc="lower right")
plt.tick_params(axis="both", which="major", labelsize=14)
plt.show()


import itertools


def base_model_pair_correlation(ensemble, x):
    corrs = []
    for (i, est1), (j, est2) in itertools.combinations(
        enumerate(ensemble.estimators_), 2
    ):
        xi_test = x.values[:, ensemble.estimators_features_[i]]
        xj_test = x.values[:, ensemble.estimators_features_[j]]

        if not isinstance(est1, sklearn.svm.SVC):
            y_pred_t1 = est1.predict_proba(xi_test)[:, 1]

            y_pred_t2 = est2.predict_proba(xj_test)[:, 1]
        else:
            y_pred_t1 = est1.decision_function(xi_test)
            xj_test = x_test.values[:, ensemble.estimators_features_[j]]
            y_pred_t2 = est2.decision_function(xj_test)
        corrs.append(scipy.stats.pearsonr(y_pred_t1, y_pred_t2)[0])
    return np.array(corrs)


pair_correlations = {}
for name, model in models.items():
    if not "Bagging" in name and not "RSM" in name:
        continue
    pair_correlations[name] = base_model_pair_correlation(model, x_test)


cor_res = pd.DataFrame(pair_correlations)
cor_res = cor_res.melt(
    value_vars=cor_res.columns, value_name="paircor", var_name="model"
)


# get base models and ensembling methods from names
def read_base(s):
    if "dt" in s.lower():
        return "DT"
    elif "svc" in s.lower():
        return "SVC"
    else:
        return "Log-Reg"


def read_ensemble(s):
    bag, rsm = False, False
    if "bag" in s.lower():
        bag = True
    if "rsm" in s.lower():
        rsm = True
    if bag and rsm:
        return "BagRSM"
    if bag:
        return "Bagged"
    if rsm:
        return "RSM"
    return "Single"


cor_res["base_model"] = cor_res["model"].apply(read_base)
cor_res["ensemble_method"] = cor_res["model"].apply(read_ensemble)


def base_model_prauc(ensemble, x, y):
    qual = np.zeros(ensemble.n_estimators)
    for ind, est in enumerate(ensemble.estimators_):
        x_test = x.values[:, ensemble.estimators_features_[i]]
        if not isinstance(est, sklearn.svm.SVC):
            y_pred = est.predict_proba(x_test)[:, 1]
        else:
            y_pred = est.decision_function(x_test)
        qual[ind] = average_precision_score(y_score=y_pred, y_true=y)
    return qual


base_prauc = {}
for name, model in models.items():
    if not "Bagging" in name and not "RSM" in name:
        continue
    base_prauc[name] = base_model_prauc(model, x_test, y_test)


base_prauc_res = pd.DataFrame(base_prauc)
base_prauc_res = base_prauc_res.melt(
    value_vars=base_prauc_res.columns, value_name="pr_auc", var_name="model"
)
base_prauc_res["base_model"] = base_prauc_res["model"].apply(read_base)
base_prauc_res["ensemble_method"] = base_prauc_res["model"].apply(read_ensemble)


plt.figure(figsize=(16, 12))
plt.subplot(211)
sns.boxplot(data=cor_res, y="paircor", x="base_model", hue="ensemble_method")
plt.title("Pairwise correlations in ensembles", size=25)
plt.xlabel("", size=20)
plt.ylabel("Pairwise correlation", size=20)
plt.legend(fontsize=20, loc="lower left")
plt.tick_params(axis="both", which="major", labelsize=14)

plt.subplot(212)
sns.boxplot(data=base_prauc_res, y="pr_auc", x="base_model", hue="ensemble_method")
plt.title("Base model quality", size=25)
plt.xlabel("", size=20)
plt.ylabel("PR-AUC", size=20)
plt.subplots_adjust()
plt.legend(fontsize=20, loc="lower left")
plt.tick_params(axis="both", which="major", labelsize=14)
plt.show()


calif_housing = sklearn.datasets.fetch_california_housing()
x = calif_housing.data
y = calif_housing.target
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)


from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

models_rf = {}

# add single decision tree for comparison
models_rf["DT"] = GridSearchCV(
    DecisionTreeRegressor(),
    {"max_depth": [1, 3, 5, 7, 10], "min_samples_leaf": [1, 3, 5, 10]},
)

# this can be done faster, see warm_start parameter for this
# (https://stackoverflow.com/questions/42757892/how-to-use-warm-start)
for n_estimators in [3, 5, 10, 50, 100, 150, 250]:
    models_rf[f"RF{n_estimators}"] = RandomForestRegressor(
        n_estimators=n_estimators, random_state=42, n_jobs=-1
    )  # run in parallel


from sklearn.metrics import mean_squared_error


def train_and_test_regressor(models, x_train, y_train, x_test, y_test, verb=True):
    boot_scores = {}
    for name, model in models.items():
        model.fit(x_train, y_train)  # train the model
        y_pred = model.predict(x_test)  # get predictions
        boot_scores[name] = bootstrap_metric(  # calculate bootstrap score
            y_test,
            y_pred,
            metric_fn=lambda x, y: mean_squared_error(y_true=x, y_pred=y),
        )
        if verb:
            print(f"Fitted {name} with bootstrap score {boot_scores[name].mean():.3f}")

    results = pd.DataFrame(boot_scores)
    # cast to long format https://pandas.pydata.org/docs/reference/api/pandas.melt.html
    results = results.melt(
        value_vars=results.columns, value_name="mse", var_name="model"
    )
    return results


results_rf = train_and_test_regressor(models_rf, x_train, y_train, x_test, y_test)

Fitted DT with bootstrap score 0.395
Fitted RF3 with bootstrap score 0.338
Fitted RF5 with bootstrap score 0.302
Fitted RF10 with bootstrap score 0.277
Fitted RF50 with bootstrap score 0.257
Fitted RF100 with bootstrap score 0.254
Fitted RF150 with bootstrap score 0.252
Fitted RF250 with bootstrap score 0.252


plt.figure(figsize=(16, 6))
sns.boxplot(data=results_rf, y="mse", x="model")
plt.ylabel("MSE", size=20)
plt.xlabel("Models", size=20)
plt.title("Number of estimators vs MSE", size=20)
plt.tick_params(axis="both", which="major", labelsize=14)
plt.show()


dt_depth = {}
rf_depth = {}

for depth in range(1, 20, 2):
    dt_depth[depth] = DecisionTreeRegressor(max_depth=depth, random_state=42)

    rf_depth[depth] = RandomForestRegressor(
        n_estimators=100, max_depth=depth, random_state=42, n_jobs=-1
    )  # run in parallel

dt_res = train_and_test_regressor(
    dt_depth, x_train, y_train, x_test, y_test, verb=False
)
rf_res = train_and_test_regressor(
    rf_depth, x_train, y_train, x_test, y_test, verb=False
)

dt_res = dt_res.rename(columns={"model": "tree_depth"})
rf_res = rf_res.rename(columns={"model": "tree_depth"})
dt_res["model"] = "DT"
rf_res["model"] = "RF"
depth_res = pd.concat((dt_res, rf_res))


plt.figure(figsize=(12, 8))
sns.boxplot(data=depth_res, x="tree_depth", y="mse", hue="model")
plt.xlabel("Tree depth", size=20)
plt.ylabel("MSE", size=20)
plt.title("Tree depth vs MSE", size=20)
plt.legend(fontsize=20)
plt.tick_params(axis="both", which="major", labelsize=14)
plt.show()


dt_models_min_samples = {}
rf_models_min_samples = {}

for mn_sm in [1, 3, 5, 7, 10]:
    dt_models_min_samples[mn_sm] = DecisionTreeRegressor(
        max_depth=None, min_samples_leaf=mn_sm, random_state=42
    )

    rf_models_min_samples[mn_sm] = RandomForestRegressor(
        n_estimators=100,
        max_depth=None,
        min_samples_leaf=mn_sm,
        random_state=42,
        n_jobs=-1,
    )  # run in parallel

dt_results_mn_samples = train_and_test_regressor(
    dt_models_min_samples, x_train, y_train, x_test, y_test
)
rf_results_mn_samples = train_and_test_regressor(
    rf_models_min_samples, x_train, y_train, x_test, y_test
)

dt_results_mn_samples = dt_results_mn_samples.rename(columns={"model": "min_samples"})
rf_results_mn_samples = rf_results_mn_samples.rename(columns={"model": "min_samples"})
dt_results_mn_samples["model"] = "DT"
rf_results_mn_samples["model"] = "RF"
leaf_res = pd.concat((dt_results_mn_samples, rf_results_mn_samples))

Fitted 1 with bootstrap score 0.528
Fitted 3 with bootstrap score 0.446
Fitted 5 with bootstrap score 0.428
Fitted 7 with bootstrap score 0.405
Fitted 10 with bootstrap score 0.388
Fitted 1 with bootstrap score 0.254
Fitted 3 with bootstrap score 0.256
Fitted 5 with bootstrap score 0.260
Fitted 7 with bootstrap score 0.266
Fitted 10 with bootstrap score 0.274


plt.figure(figsize=(12, 8))
sns.boxplot(data=leaf_res, x="min_samples", y="mse", hue="model")
plt.xlabel("Min samples in leaf", size=20)
plt.ylabel("MSE", size=20)
plt.title("Min samples in leaf vs MSE", size=20)
plt.legend(fontsize=20)
plt.tick_params(axis="both", which="major", labelsize=14)
plt.show()


from sklearn.ensemble import RandomForestClassifier

x, y = sklearn.datasets.make_moons(n_samples=500, noise=0.30, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
plt.figure(figsize=(16, 6))
plt.subplot(121)

clf = DecisionTreeClassifier(max_depth=10, random_state=42)
clf.fit(x_train, y_train)
plot_decision_boundary(clf, x, y)
plt.title("Single Decision Tree", fontsize=14)

plt.subplot(122)
rf = RandomForestClassifier(n_estimators=1000)
rf.fit(x_train, y_train)
plot_decision_boundary(rf, x, y)
plt.title("Random forest", fontsize=14)
plt.show()


plt.figure(figsize=(16, 6))

plt.subplot(121)
rf1 = RandomForestClassifier(n_estimators=1000)
rf1.fit(x_train, y_train)
plot_decision_boundary(rf1, x, y)
plt.title("Random forest", fontsize=14)

plt.subplot(122)
rf2 = RandomForestClassifier(n_estimators=1000, min_samples_leaf=5)
rf2.fit(x_train, y_train)
plot_decision_boundary(rf2, x, y)
plt.title("Random forest, min_samples_leaf=5", fontsize=14)
plt.show()


y_score = rf1.predict(x_train)
q = average_precision_score(y_true=y_train, y_score=y_score)
print(f"RF1 Train: {q:.02}")
y_score = rf1.predict(x_test)
q = average_precision_score(y_true=y_test, y_score=y_score)
print(f"RF1 Test: {q:.02}")

y_score = rf2.predict(x_train)
q = average_precision_score(y_true=y_train, y_score=y_score)
print(f"RF2 Train: {q:.02}")
y_score = rf2.predict(x_test)
q = average_precision_score(y_true=y_test, y_score=y_score)
print(f"RF2 Test: {q:.02}")

RF1 Train: 1.0
RF1 Test: 0.87
RF2 Train: 0.91
RF2 Test: 0.91


from sklearn.model_selection import cross_validate

rf_oob = RandomForestClassifier(n_estimators=1000, min_samples_leaf=5, oob_score=True)
rf_oob.fit(x, y)
print(f"RF OOB score: {rf_oob.oob_score_:.02}")

scores = cross_validate(rf_oob, x, y, cv=5)
print(f"RF CV score: {scores['test_score'].mean():.02}")

rf_oob.fit(x_train, y_train)
print(f"RF Test set score: {rf_oob.score(x_test, y_test):.02}")

RF OOB score: 0.92
RF CV score: 0.91
RF Test set score: 0.92


plt.figure(figsize=(16, 8))

# Decision trees
for i, max_depth in enumerate([1, 3, 5, 12]):
    plt.subplot(241 + i)
    dt = DecisionTreeClassifier(max_depth=max_depth)
    dt.fit(x_train, y_train)
    plot_decision_boundary(dt, x, y, alpha=0.5, bolded=True)
    plt.xticks([], [])
    plt.yticks([], [])
    plt.title(f"Decision tree, max_depth={max_depth}", fontsize=14)
    plt.xlabel("")
    plt.ylabel("")
    plt.xticks([], [])
    plt.yticks([], [])

# Random forests
for i, max_depth in enumerate([1, 3, 5, 12]):
    plt.subplot(245 + i)
    rf = RandomForestClassifier(max_depth=max_depth, n_estimators=500, n_jobs=-1)
    rf.fit(x_train, y_train)
    plot_decision_boundary(rf, x, y, alpha=0.5, bolded=True)
    plt.title(f"Random forest, max_depth={max_depth}", fontsize=14)
    plt.xlabel("")
    plt.ylabel("")
    plt.xticks([], [])
    plt.yticks([], [])


calif_housing = sklearn.datasets.fetch_california_housing()
x = calif_housing.data
y = calif_housing.target
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)


from sklearn.metrics import mean_squared_error


def train_and_test_regressor(models, x_train, y_train, x_test, y_test):
    for name, model in models.items():
        print(f"Fitting {name}")
        model.fit(x_train, y_train)
    predictions = {}
    for name, model in models.items():
        y_pred = model.predict(x_test)
        predictions[name] = y_pred

    boot_scores = {}

    for name, y_pred in predictions.items():
        print(f"Calculating bootstrap score for {name}")
        boot_score = bootstrap_metric(
            y_test,
            y_pred,
            metric_fn=lambda x, y: mean_squared_error(y_true=x, y_pred=y),
        )
        boot_scores[name] = boot_score

    results = pd.DataFrame(boot_scores)
    # cast to long format
    results = results.melt(
        value_vars=results.columns, value_name="mse", var_name="model"
    )
    return results


from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

models = {}

# make pipeline for normalization
models["LinReg"] = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

models["RF"] = RandomForestRegressor(
    n_estimators=250,  # for better result set to 1000
    max_depth=None,
    min_samples_leaf=1,
    n_jobs=-1,
    random_state=42,
)

models["GradientBoosting"] = GradientBoostingRegressor(
    learning_rate=0.1,  # for better result set to 0.05
    n_estimators=250,  # for better result set to 1000
    random_state=42,
)

results_boost = train_and_test_regressor(models, x_train, y_train, x_test, y_test)

Fitting LinReg
Fitting RF
Fitting GradientBoosting
Calculating bootstrap score for LinReg
Calculating bootstrap score for RF
Calculating bootstrap score for GradientBoosting


plt.figure(figsize=(12, 4))
ax = sns.boxplot(data=results_boost, y="mse", x="model")
plt.xlabel("", size=20)
plt.ylabel("MSE", size=20)
plt.title("LR vs RF vs GB", size=25)
plt.xticks(size=20)
plt.show()


gbtree = GradientBoostingRegressor(
    n_estimators=300, learning_rate=1.0  # faster learning rate to force ovefitting
)
gbtree.fit(x_train, y_train)

GradientBoostingRegressor(learning_rate=1.0, n_estimators=300)

GradientBoostingRegressor(learning_rate=1.0, n_estimators=300)


error_train = []
error_test = []
for it, (y_train_pred, y_test_pred) in enumerate(
    zip(gbtree.staged_predict(x_train), gbtree.staged_predict(x_test))
):
    ertr = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
    error_train.append(ertr)
    erte = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
    error_test.append(erte)


plt.figure(figsize=(10, 5))
plt.plot(error_train, label="train", c="#2DA9E1", linewidth=4)
plt.plot(error_test, label="test", c="#4AAE4D", linewidth=4)
plt.xlabel("n_estimators", size=20)
plt.ylabel("mse", size=20)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.legend(fontsize=20)
plt.show()


# here anb below in the cell can be set to 1000 for better visualization

gbtrees_list = []

for lr in [1, 0.5, 0.1, 0.05, 0.01]:
    gbtree = GradientBoostingRegressor(n_estimators=500, learning_rate=lr)
    gbtree.fit(x_train, y_train)
    gbtrees_list.append(gbtree)


lr = []
step = []
mse = []
for gb_tree in gbtrees_list:
    for it, y_test_pred in enumerate(gb_tree.staged_predict(x_test)):
        erte = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
        mse.append(erte)
        lr.append(str(gb_tree.learning_rate))
        step.append(it)

df = pd.DataFrame({"learning_rate": lr, "n_estimators": step, "mse": mse})

plt.figure(figsize=(16, 6))
sns.lineplot(data=df, x="n_estimators", y="mse", hue="learning_rate", lw=3)
plt.xlabel("n_estimators", size=20)
plt.ylabel("mse", size=20)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.ylim((0.2, 0.6))
plt.legend(fontsize=20)
plt.show()


x_learn, x_valid, y_learn, y_valid = train_test_split(x_train, y_train, random_state=42)


gbtree = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.1)
gbtree.fit(x_learn, y_learn)

error_train = []
error_test = []
for it, (y_learn_pred, y_valid_pred) in enumerate(
    zip(gbtree.staged_predict(x_learn), gbtree.staged_predict(x_valid))
):
    ertr = mean_squared_error(y_true=y_learn, y_pred=y_learn_pred)
    error_train.append(ertr)
    erte = mean_squared_error(y_true=y_valid, y_pred=y_valid_pred)
    error_test.append(erte)

plt.figure(figsize=(10, 5))
plt.plot(error_train, label="train", c="#2DA9E1", linewidth=4)
plt.plot(error_test, label="test", c="#4AAE4D", linewidth=4)
plt.xlabel("n_estimators", size=20)
plt.ylabel("mse", size=20)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.legend(fontsize=20)
plt.show()


models = {}
for depth in (1, 2, 3, 5, 10):
    models[depth] = GradientBoostingRegressor(
        n_estimators=500, learning_rate=0.1, max_depth=depth, random_state=42
    )

depth_boost = train_and_test_regressor(models, x_learn, y_learn, x_valid, y_valid)

Fitting 1
Fitting 2
Fitting 3
Fitting 5
Fitting 10
Calculating bootstrap score for 1
Calculating bootstrap score for 2
Calculating bootstrap score for 3
Calculating bootstrap score for 5
Calculating bootstrap score for 10


plt.figure(figsize=(16, 6))
ax = sns.boxplot(data=depth_boost, y="mse", x="model")
plt.xlabel("GB depth", size=20)
plt.ylabel("MSE", size=20)
plt.title("GB depth vs MSE", size=20)
plt.tick_params(axis="both", which="major", labelsize=14)
plt.show()


models_add = {}
for depth in (4, 6, 7):
    models_add[depth] = GradientBoostingRegressor(
        n_estimators=500, learning_rate=0.1, max_depth=depth, random_state=42
    )

depth_boost_add = train_and_test_regressor(
    models_add, x_learn, y_learn, x_valid, y_valid
)

Fitting 4
Fitting 6
Fitting 7
Calculating bootstrap score for 4
Calculating bootstrap score for 6
Calculating bootstrap score for 7


depth_boost_joined = pd.concat([depth_boost, depth_boost_add])

plt.figure(figsize=(16, 6))
ax = sns.boxplot(
    data=depth_boost_joined, y="mse", x="model", order=[1, 2, 3, 4, 5, 6, 7, 10]
)
plt.xlabel("GB depth", size=20)
plt.ylabel("MSE", size=20)
plt.title("GB depth vs MSE", size=20)
plt.tick_params(axis="both", which="major", labelsize=14)
plt.show()


models = {}
for min_samples_leaf in (1, 3, 5, 7, 9, 11):
    models[min_samples_leaf] = GradientBoostingRegressor(
        n_estimators=500,
        learning_rate=0.1,
        max_depth=5,
        min_samples_leaf=min_samples_leaf,
        random_state=42,
    )

mns_boost = train_and_test_regressor(models, x_learn, y_learn, x_valid, y_valid)

Fitting 1
Fitting 3
Fitting 5
Fitting 7
Fitting 9
Fitting 11
Calculating bootstrap score for 1
Calculating bootstrap score for 3
Calculating bootstrap score for 5
Calculating bootstrap score for 7
Calculating bootstrap score for 9
Calculating bootstrap score for 11


plt.figure(figsize=(16, 6))
ax = sns.boxplot(data=mns_boost, y="mse", x="model")
plt.xlabel("Min samples in leaf", size=20)
plt.ylabel("MSE", size=20)
plt.title("GB min samples leaf vs MSE", size=20)
plt.tick_params(axis="both", which="major", labelsize=14)
plt.show()


gbtree = GradientBoostingRegressor(
    n_estimators=1000,
    max_depth=5,
    min_samples_leaf=9,
    learning_rate=0.1,
    random_state=42,
)
gbtree.fit(x_learn, y_learn)

error_train = []
error_test = []
for it, (y_learn_pred, y_valid_pred) in enumerate(
    zip(gbtree.staged_predict(x_learn), gbtree.staged_predict(x_valid))
):
    ertr = mean_squared_error(y_true=y_learn, y_pred=y_learn_pred)
    error_train.append(ertr)
    erte = mean_squared_error(y_true=y_valid, y_pred=y_valid_pred)
    error_test.append(erte)

plt.figure(figsize=(10, 5))
plt.plot(error_train, label="train", c="#2DA9E1", linewidth=4)
plt.plot(error_test, label="test", c="#4AAE4D", linewidth=4)
plt.xlabel("n_estimators", size=20)
plt.ylabel("mse", size=20)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.legend(fontsize=20)
plt.show()


models = {}

models["LinReg"] = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

models["RF"] = RandomForestRegressor(
    n_estimators=250, max_depth=None, min_samples_leaf=1, n_jobs=-1, random_state=42
)

models["GBR"] = GradientBoostingRegressor(
    learning_rate=0.1, n_estimators=250, random_state=42
)

models["GBR tuned"] = GradientBoostingRegressor(
    learning_rate=0.1,
    n_estimators=500,
    max_depth=5,
    min_samples_leaf=9,
    random_state=42,
)

tuned_boost = train_and_test_regressor(models, x_train, y_train, x_test, y_test)

Fitting LinReg
Fitting RF
Fitting GBR
Fitting GBR tuned
Calculating bootstrap score for LinReg
Calculating bootstrap score for RF
Calculating bootstrap score for GBR
Calculating bootstrap score for GBR tuned


plt.figure(figsize=(16, 6))
ax = sns.boxplot(data=tuned_boost, y="mse", x="model")
plt.setp(ax.get_xticklabels(), rotation=90)
plt.xlabel("", size=20)
plt.ylabel("MSE", size=20)
plt.title("Boosting and others", size=20)
plt.tick_params(axis="both", which="major", labelsize=14)
plt.xticks(fontsize=20)
plt.show()


from sklearn.ensemble import GradientBoostingClassifier

x, y = sklearn.datasets.make_moons(n_samples=500, noise=0.30, random_state=42)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

plt.figure(figsize=(16, 12))

# Decision trees
for i, max_depth in enumerate([1, 3, 5, 12]):
    plt.subplot(3, 4, 1 + i)
    dt = DecisionTreeClassifier(max_depth=max_depth)
    dt.fit(x_train, y_train)
    plot_decision_boundary(dt, x, y, alpha=0.4, bolded=True)
    plt.xticks([], [])
    plt.yticks([], [])
    plt.title(f"Decision tree, max_depth={max_depth}", fontsize=13)
    plt.xlabel("")
    plt.ylabel("")

# Random forests
for i, max_depth in enumerate([1, 3, 5, 12]):
    plt.subplot(3, 4, 5 + i)
    rf = RandomForestClassifier(max_depth=max_depth, n_estimators=500, n_jobs=-1)
    rf.fit(x_train, y_train)
    plot_decision_boundary(rf, x, y, alpha=0.4, bolded=True)
    plt.title(f"Random forest, max_depth={max_depth}", fontsize=13)
    plt.xlabel("")
    plt.ylabel("")
    plt.xticks([], [])
    plt.yticks([], [])

# Gradient boostings
for i, max_depth in enumerate([1, 3, 5, 12]):
    plt.subplot(3, 4, 9 + i)
    boost = GradientBoostingClassifier(max_depth=max_depth, n_estimators=250)
    boost.fit(x_train, y_train)
    plot_decision_boundary(boost, x, y, alpha=0.4, bolded=True)
    plt.title(f"Gradient boosting, max_depth={max_depth}", fontsize=13)
    plt.xlabel("")
    plt.ylabel("")
    plt.xticks([], [])
    plt.yticks([], [])


calif_housing = sklearn.datasets.fetch_california_housing()
x = calif_housing.data
y = calif_housing.target
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)


import xgboost

models_add = {}
models_add["xgb"] = xgboost.XGBRegressor(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
    min_child_weight=9,  # not exact analogue for min_samples_leaf
    n_jobs=-1,  # can be constructed in parrallel, much!!! faster)
    objective="reg:squarederror",
)

xgb_add = train_and_test_regressor(models_add, x_train, y_train, x_test, y_test)

Fitting xgb
Calculating bootstrap score for xgb


plt.figure(figsize=(16, 6))
ax = sns.boxplot(data=pd.concat([tuned_boost, xgb_add]), y="mse", x="model")
plt.setp(ax.get_xticklabels(), rotation=90)
plt.xlabel("", size=20)
plt.ylabel("MSE", size=20)
plt.title("Boosting and others", size=20)
plt.tick_params(axis="both", which="major", labelsize=14)
plt.xticks(size=20)
plt.show()


models = {}
for min_child_weight in (1, 2, 3, 5, 7, 9, 11, 13, 15):
    models[f"xGB_mnw{min_child_weight}"] = xgboost.XGBRegressor(
        n_estimators=2000,
        learning_rate=0.1,
        max_depth=5,
        random_state=42,
        min_child_weight=min_child_weight,
        n_jobs=-1,
        objective="reg:squarederror",
    )

xgb_mw = train_and_test_regressor(models, x_train, y_train, x_test, y_test)

Fitting xGB_mnw1
Fitting xGB_mnw2
Fitting xGB_mnw3
Fitting xGB_mnw5
Fitting xGB_mnw7
Fitting xGB_mnw9
Fitting xGB_mnw11
Fitting xGB_mnw13
Fitting xGB_mnw15
Calculating bootstrap score for xGB_mnw1
Calculating bootstrap score for xGB_mnw2
Calculating bootstrap score for xGB_mnw3
Calculating bootstrap score for xGB_mnw5
Calculating bootstrap score for xGB_mnw7
Calculating bootstrap score for xGB_mnw9
Calculating bootstrap score for xGB_mnw11
Calculating bootstrap score for xGB_mnw13
Calculating bootstrap score for xGB_mnw15


plt.figure(figsize=(16, 6))
ax = sns.boxplot(data=xgb_mw, y="mse", x="model")
plt.setp(ax.get_xticklabels(), rotation=90)
plt.xlabel("", size=20)
plt.ylabel("MSE", size=20)
plt.title("XGB min_child_weight", size=20)
plt.tick_params(axis="both", which="major", labelsize=14)
plt.xticks(size=20)
plt.show()


models_add2 = {}
models_add2["xgb_mcw"] = xgboost.XGBRegressor(
    n_estimators=2000,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
    min_child_weight=9,
    n_jobs=-1,
    objective="reg:squarederror",
)

xgb_add2 = train_and_test_regressor(models_add2, x_train, y_train, x_test, y_test)

Fitting xgb_mcw
Calculating bootstrap score for xgb_mcw


plt.figure(figsize=(16, 6))
ax = sns.boxplot(data=pd.concat([tuned_boost, xgb_add, xgb_add2]), y="mse", x="model")
plt.setp(ax.get_xticklabels(), rotation=90)
plt.xlabel("", size=20)
plt.ylabel("MSE", size=20)
plt.title("Boosting and others", size=20)
plt.tick_params(axis="both", which="major", labelsize=14)
plt.xticks(size=20)
plt.show()


import lightgbm

models_add3 = {}
models_add3["lightgbm"] = lightgbm.LGBMRegressor(
    n_estimators=2000,  # can use more estimators due to SPEEEEEED
    learning_rate=0.1,
    max_depth=-1,
    num_leaves=2**5,
    random_state=42,
    min_child_weight=9,
    n_jobs=-1,
)

lgb_add = train_and_test_regressor(models_add3, x_train, y_train, x_test, y_test)

Fitting lightgbm
Calculating bootstrap score for lightgbm


plt.figure(figsize=(16, 6))
ax = sns.boxplot(
    data=pd.concat([tuned_boost, xgb_add, xgb_add2, lgb_add]), y="mse", x="model"
)
plt.setp(ax.get_xticklabels(), rotation=90)
plt.xlabel("", size=20)
plt.ylabel("MSE", size=20)
plt.title("Boosting and others", size=20)
plt.tick_params(axis="both", which="major", labelsize=14)
plt.xticks(size=20)
plt.show()


models = {}
for num_leaves in (8, 16, 24, 32, 40):
    models[f"LGB_lvn{num_leaves}"] = lightgbm.LGBMRegressor(
        n_estimators=2000,
        learning_rate=0.1,
        max_depth=-1,
        num_leaves=num_leaves,
        random_state=42,
        min_child_weight=10,
        n_jobs=-1,
    )

lgb_nl = train_and_test_regressor(models, x_learn, y_learn, x_valid, y_valid)

Fitting LGB_lvn8
Fitting LGB_lvn16
Fitting LGB_lvn24
Fitting LGB_lvn32
Fitting LGB_lvn40
Calculating bootstrap score for LGB_lvn8
Calculating bootstrap score for LGB_lvn16
Calculating bootstrap score for LGB_lvn24
Calculating bootstrap score for LGB_lvn32
Calculating bootstrap score for LGB_lvn40


plt.figure(figsize=(16, 6))
ax = sns.boxplot(data=lgb_nl, y="mse", x="model")
plt.setp(ax.get_xticklabels(), rotation=90)
plt.xlabel("", size=20)
plt.ylabel("MSE", size=20)
plt.title("LGB num_leaves", size=20)
plt.tick_params(axis="both", which="major", labelsize=14)
plt.xticks(size=20)
plt.show()


models = {}
for num_leaves in (8, 12, 16, 20):
    models[f"LGB_nl{num_leaves}"] = lightgbm.LGBMRegressor(
        n_estimators=2000,
        learning_rate=0.1,
        max_depth=-1,
        num_leaves=num_leaves,
        random_state=42,
        min_child_weight=10,
        n_jobs=-1,
    )

lgb_nl = train_and_test_regressor(models, x_learn, y_learn, x_valid, y_valid)

Fitting LGB_nl8
Fitting LGB_nl12
Fitting LGB_nl16
Fitting LGB_nl20
Calculating bootstrap score for LGB_nl8
Calculating bootstrap score for LGB_nl12
Calculating bootstrap score for LGB_nl16
Calculating bootstrap score for LGB_nl20


plt.figure(figsize=(16, 6))
ax = sns.boxplot(data=lgb_nl, y="mse", x="model")
plt.setp(ax.get_xticklabels(), rotation=90)
plt.xlabel("", size=20)
plt.ylabel("MSE", size=20)
plt.title("LGB num_leaves", size=20)
plt.tick_params(axis="both", which="major", labelsize=14)
plt.xticks(size=20)
plt.show()


models_add4 = {}
models_add4["lightgbm lv12"] = lightgbm.LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.1,
    max_depth=-1,
    num_leaves=12,
    random_state=42,
    min_child_weight=9,
    n_jobs=-1,
)

lgb_add2 = train_and_test_regressor(models_add4, x_train, y_train, x_test, y_test)

Fitting lightgbm lv12
Calculating bootstrap score for lightgbm lv12


plt.figure(figsize=(16, 6))
ax = sns.boxplot(
    data=pd.concat([tuned_boost, xgb_add, xgb_add2, lgb_add, lgb_add2]),
    y="mse",
    x="model",
)
plt.setp(ax.get_xticklabels(), rotation=90)
plt.xlabel("", size=20)
plt.ylabel("MSE", size=20)
plt.title("Boosting and others", size=20)
plt.tick_params(axis="both", which="major", labelsize=14)
plt.xticks(size=20)
plt.show()


!pip install -q catboost

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 98.6/98.6 MB 10.1 MB/s eta 0:00:00


import catboost

models_add4 = {}
models_add4["catboost"] = catboost.CatBoostRegressor(
    iterations=2000,
    learning_rate=0.1,
    depth=5,
    random_state=42,
    min_data_in_leaf=9,
    verbose=0,
)
# task_type="GPU") # can use gpu, but no parallel-cpu option

cat_add = train_and_test_regressor(models_add4, x_train, y_train, x_test, y_test)

Fitting catboost
Calculating bootstrap score for catboost


plt.figure(figsize=(16, 6))
ax = sns.boxplot(
    data=pd.concat([tuned_boost, xgb_add, xgb_add2, lgb_add, cat_add]),
    y="mse",
    x="model",
)
plt.setp(ax.get_xticklabels(), rotation=90)
plt.xlabel("", size=20)
plt.ylabel("MSE", size=20)
plt.title("Boosting and others", size=20)
plt.tick_params(axis="both", which="major", labelsize=14)
plt.xticks(size=20)
plt.show()


models_rf = {}

models_rf["xgb_rf"] = xgboost.XGBRFRegressor(
    n_estimators=250,
    colsample_bytree=0.8,
    subsample=0.8,
    reg_lambda=0.001,  # to get deeper, less regularized trees
    max_depth=20,  # trees must be deep
    n_jobs=-1,
    objective="reg:squarederror",
)

models_rf["lgb_rf"] = lightgbm.LGBMRegressor(
    n_estimators=250,
    subsample_freq=1,     # for lgb random forest must be set to 1
    num_leaves=2**14,     # don't forget to change the number of leaves to smth big
    boosting_type="rf",   # set boosteer type
    colsample_bytree=0.8, # for lgb random forest must be less then 1
    subsample=0.8,        # for lgb random forest must be less then 1
    min_child_samples=1,  # to get deeper trees
    n_jobs=-1,
)

rf_add = train_and_test_regressor(models_rf, x_train, y_train, x_test, y_test)

Fitting xgb_rf
Fitting lgb_rf
Calculating bootstrap score for xgb_rf
Calculating bootstrap score for lgb_rf


plt.figure(figsize=(16, 6))
ax = sns.boxplot(
    data=pd.concat([tuned_boost.query('model == "RF"'), rf_add]), y="mse", x="model"
)
plt.setp(ax.get_xticklabels(), rotation=90)
plt.xlabel("", size=20)
plt.ylabel("MSE", size=20)
plt.title("Boosting and others", size=20)
plt.tick_params(axis="both", which="major", labelsize=14)
plt.xticks(size=20)
plt.show()


class BlendingRegressor:
    def __init__(self, first_layer_models, second_layer_model):
        self.first_layer_models = {
            name: sklearn.clone(model) for name, model in first_layer_models.items()
        }
        self.second_layer_model = sklearn.clone(second_layer_model)

    def fit_1st_layer(self, x, y):
        for name, model in self.first_layer_models.items():
            print(f"Fitting {name}")
            model.fit(x, y)

    def predict_1st_layer(self, x):
        features = np.zeros((x.shape[0], len(self.first_layer_models)))
        for ind, model in enumerate(self.first_layer_models.values()):
            features[:, ind] = model.predict(x)
        return features

    def fit_2st_layer(self, x, y):
        features = self.predict_1st_layer(x)
        self.second_layer_model.fit(features, y)

    def predict(self, x):
        features = self.predict_1st_layer(x)
        y_pred = self.second_layer_model.predict(features)
        return y_pred


first_layer_models = {}

first_layer_models["linreg"] = make_pipeline(
    StandardScaler(with_mean=False), LinearRegression()
)

first_layer_models["rf"] = RandomForestRegressor(
    n_estimators=250, max_depth=None, min_samples_leaf=1, n_jobs=-1, random_state=42
)

first_layer_models["xgb"] = xgboost.XGBRegressor(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
    min_child_weight=10,
    n_jobs=-1,
    objective="reg:squarederror",
)

first_layer_models["lgb_tuned"] = lightgbm.LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.1,
    max_depth=-1,
    num_leaves=12,
    random_state=42,
    min_child_weight=7,
    n_jobs=-1,
)

first_layer_models["catboost"] = catboost.CatBoostRegressor(
    iterations=2000,
    verbose=0,
    learning_rate=0.1,
    depth=5,
    random_state=42,
    min_data_in_leaf=5,
)


x_learn, x_valid, y_learn, y_valid = train_test_split(x_train, y_train, random_state=42)
blend_reg = BlendingRegressor(first_layer_models, LinearRegression())

blend_reg.fit_1st_layer(x_learn, y_learn)
blend_reg.fit_2st_layer(x_valid, y_valid)
y_pred = blend_reg.predict(x_test)
blend_boot = bootstrap_metric(
    y_test, y_pred, metric_fn=lambda x, y: mean_squared_error(y_true=x, y_pred=y)
)

Fitting linreg
Fitting rf
Fitting xgb
Fitting lgb_tuned
Fitting catboost


blend_data = pd.DataFrame({"mse": blend_boot})
blend_data["model"] = "SingleBlending"


plt.figure(figsize=(16, 6))
ax = sns.boxplot(
    data=pd.concat([tuned_boost, xgb_add, xgb_add2, lgb_add, cat_add, blend_data]),
    y="mse",
    x="model",
)
plt.setp(ax.get_xticklabels(), rotation=90)
plt.xlabel("", size=20)
plt.ylabel("MSE", size=20)
plt.title("Boosting and others", size=20)
plt.tick_params(axis="both", which="major", labelsize=14)
plt.xticks(size=20)
plt.show()


from IPython.display import clear_output

blending_ensemble = []

# takes some time to run
for i in range(1, 11):
    print(f"Training blender {i}")
    x_learn, x_valid, y_learn, y_valid = train_test_split(
        x_train, y_train, random_state=i * 7 % 13
    )
    blend_reg = BlendingRegressor(first_layer_models, LinearRegression())

    blend_reg.fit_1st_layer(x_learn, y_learn)
    blend_reg.fit_2st_layer(x_valid, y_valid)
    blending_ensemble.append(blend_reg)
    clear_output()


y_pred = 0
for blend_reg in blending_ensemble:
    y_pred += blend_reg.predict(x_test)
y_pred /= len(blending_ensemble)


eblend_boot = bootstrap_metric(
    y_test, y_pred, metric_fn=lambda x, y: mean_squared_error(y_true=x, y_pred=y)
)
eblend_data = pd.DataFrame({"mse": eblend_boot})
eblend_data["model"] = "EnsembleBlending"


np.quantile(eblend_boot, q=[0.025, 0.975])

array([0.17014789, 0.20072305])


np.quantile(cat_add["mse"], q=[0.025, 0.975])

array([0.17445492, 0.20549425])


plt.figure(figsize=(16, 6))
ax = sns.boxplot(
    data=pd.concat(
        [tuned_boost, xgb_add, xgb_add2, lgb_add, cat_add, blend_data, eblend_data]
    ),
    y="mse",
    x="model",
)
plt.setp(ax.get_xticklabels(), rotation=90)
plt.xlabel("", size=20)
plt.ylabel("MSE", size=20)
plt.title("Boosting and others", size=20)
plt.tick_params(axis="both", which="major", labelsize=14)
plt.xticks(size=20)
plt.show()


base_models = []

base_models.append(
    ["linreg", make_pipeline(StandardScaler(with_mean=False), LinearRegression())]
)

base_models.append(
    [
        "xgb",
        xgboost.XGBRegressor(
            n_estimators=500,
            learning_rate=0.1,
            max_depth=5,
            random_state=42,
            min_child_weight=10,
            n_jobs=-1,
            objective="reg:squarederror",
        ),
    ]
)

base_models.append(
    [
        "lgb",
        lightgbm.LGBMRegressor(
            n_estimators=2000,
            learning_rate=0.1,
            max_depth=-1,
            num_leaves=12,
            random_state=42,
            min_child_weight=7,
            n_jobs=-1,
        ),
    ]
)

base_models.append(
    [
        "cgb",
        catboost.CatBoostRegressor(
            iterations=2000,
            verbose=0,
            learning_rate=0.1,
            depth=5,
            random_state=42,
            min_data_in_leaf=5,
        ),
    ]
)


from sklearn.ensemble import StackingRegressor

stacking_reg = StackingRegressor(
    base_models, LinearRegression(), cv=3  # level-two model is Linear Regression
)

stacking_reg.fit(x_train, y_train)
clear_output()


y_pred = stacking_reg.predict(x_test)
print(mean_squared_error(y_true=y_test, y_pred=y_pred))

0.18633737889209778


stack_boot = bootstrap_metric(
    y_test, y_pred, metric_fn=lambda x, y: mean_squared_error(y_true=x, y_pred=y)
)

stack_data = pd.DataFrame({"mse": stack_boot})
stack_data["model"] = "Stacking"


plt.figure(figsize=(18, 6))
ax = sns.boxplot(
    data=pd.concat(
        [
            tuned_boost,
            xgb_add,
            xgb_add2,
            lgb_add,
            cat_add,
            blend_data,
            eblend_data,
            stack_data,
        ]
    ),
    y="mse",
    x="model",
)
plt.setp(ax.get_xticklabels(), rotation=90)
plt.xlabel("", size=20)
plt.ylabel("MSE", size=20)
plt.title("Boosting and others", size=20)
plt.tick_params(axis="both", which="major", labelsize=14)
plt.xticks(size=20)
plt.show()


np.quantile(stack_data["mse"], q=[0.025, 0.975])

array([0.17123641, 0.20178981])


np.quantile(cat_add["mse"], q=[0.025, 0.975])

array([0.17445492, 0.20549425])


from IPython.display import clear_output

!pip install -q pytorch-tabnet
clear_output()


import sklearn
import sklearn.datasets
from sklearn.model_selection import train_test_split

calif_housing = sklearn.datasets.fetch_california_housing()
x = calif_housing.data
y = calif_housing.target
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
x_learn, x_valid, y_learn, y_valid = train_test_split(x_train, y_train, random_state=42)


from pytorch_tabnet.tab_model import TabNetRegressor
from warnings import simplefilter

simplefilter("ignore", category=UserWarning)

tabnet = TabNetRegressor()
tabnet.fit(
    x_learn, y_learn.reshape(-1, 1), eval_set=[(x_valid, y_valid.reshape(-1, 1))]
)

epoch 0  | loss: 3.51207 | val_0_mse: 1076.38071|  0:00:03s
epoch 1  | loss: 0.92624 | val_0_mse: 5.24164 |  0:00:04s
epoch 2  | loss: 0.71727 | val_0_mse: 329.54894|  0:00:04s
epoch 3  | loss: 0.58757 | val_0_mse: 325.48701|  0:00:05s
epoch 4  | loss: 0.51347 | val_0_mse: 80.16469|  0:00:06s
epoch 5  | loss: 0.48811 | val_0_mse: 651.67741|  0:00:06s
epoch 6  | loss: 0.45026 | val_0_mse: 193.50969|  0:00:07s
epoch 7  | loss: 0.44329 | val_0_mse: 112.20831|  0:00:08s
epoch 8  | loss: 0.41462 | val_0_mse: 168.46524|  0:00:09s
epoch 9  | loss: 0.40728 | val_0_mse: 318.11486|  0:00:10s
epoch 10 | loss: 0.39702 | val_0_mse: 575.0163|  0:00:10s
epoch 11 | loss: 0.37777 | val_0_mse: 459.37161|  0:00:11s

Early stopping occurred at epoch 11 with best_epoch = 1 and best_val_0_mse = 5.24164


tabnet = TabNetRegressor()
tabnet.fit(x_train, y_train.reshape(-1, 1), max_epochs=53)

epoch 0  | loss: 2.72954 |  0:00:00s
epoch 1  | loss: 0.77319 |  0:00:01s
epoch 2  | loss: 0.6015  |  0:00:02s
epoch 3  | loss: 0.53045 |  0:00:02s
epoch 4  | loss: 0.49272 |  0:00:03s
epoch 5  | loss: 0.4568  |  0:00:04s
epoch 6  | loss: 0.44634 |  0:00:04s
epoch 7  | loss: 0.42965 |  0:00:05s
epoch 8  | loss: 0.42114 |  0:00:06s
epoch 9  | loss: 0.42466 |  0:00:07s
epoch 10 | loss: 0.39701 |  0:00:07s
epoch 11 | loss: 0.40175 |  0:00:08s
epoch 12 | loss: 0.38597 |  0:00:09s
epoch 13 | loss: 0.39648 |  0:00:10s
epoch 14 | loss: 0.37438 |  0:00:11s
epoch 15 | loss: 0.35803 |  0:00:12s
epoch 16 | loss: 0.35664 |  0:00:12s
epoch 17 | loss: 0.3555  |  0:00:13s
epoch 18 | loss: 0.353   |  0:00:14s
epoch 19 | loss: 0.35966 |  0:00:15s
epoch 20 | loss: 0.34629 |  0:00:15s
epoch 21 | loss: 0.34293 |  0:00:16s
epoch 22 | loss: 0.33906 |  0:00:17s
epoch 23 | loss: 0.32974 |  0:00:17s
epoch 24 | loss: 0.32725 |  0:00:18s
epoch 25 | loss: 0.33235 |  0:00:19s
epoch 26 | loss: 0.33473 |  0:00:20s
epoch 27 | loss: 0.33017 |  0:00:20s
epoch 28 | loss: 0.32486 |  0:00:21s
epoch 29 | loss: 0.33013 |  0:00:22s
epoch 30 | loss: 0.33028 |  0:00:23s
epoch 31 | loss: 0.33704 |  0:00:24s
epoch 32 | loss: 0.33452 |  0:00:25s
epoch 33 | loss: 0.32133 |  0:00:25s
epoch 34 | loss: 0.32139 |  0:00:26s
epoch 35 | loss: 0.32358 |  0:00:27s
epoch 36 | loss: 0.31803 |  0:00:28s
epoch 37 | loss: 0.31429 |  0:00:28s
epoch 38 | loss: 0.3166  |  0:00:29s
epoch 39 | loss: 0.31205 |  0:00:30s
epoch 40 | loss: 0.3133  |  0:00:30s
epoch 41 | loss: 0.30814 |  0:00:31s
epoch 42 | loss: 0.30986 |  0:00:32s
epoch 43 | loss: 0.30849 |  0:00:33s
epoch 44 | loss: 0.31149 |  0:00:33s
epoch 45 | loss: 0.30858 |  0:00:34s
epoch 46 | loss: 0.30869 |  0:00:35s
epoch 47 | loss: 0.32698 |  0:00:36s
epoch 48 | loss: 0.31701 |  0:00:37s
epoch 49 | loss: 0.30686 |  0:00:38s
epoch 50 | loss: 0.31026 |  0:00:38s
epoch 51 | loss: 0.30128 |  0:00:39s
epoch 52 | loss: 0.30753 |  0:00:40s


from sklearn.metrics import mean_squared_error

y_pred = tabnet.predict(x_test)
print(mean_squared_error(y_true=y_test, y_pred=y_pred))

0.4447459669316955

param type	CatBoost	XGBoost	LightGBM
overfitting control	`learning_rate` `depth` `l2_leaf_reg`	`eta` `max_depth` `min_child_weight`	`learning_rate` `max_depth` `num_leaves` `min_data_in_leaf`
speed of the training	`rsm` `iterations`	`colsample_bytree` `subsample` `n_estimators`	`feature_fraction` `bagging_fraction`

Системы предсказаний¶

Экспертные системы (Rule-based systems)¶

Классическое машинное обучение¶

Глубокое машинное обучение¶

Необходимость методов классического машинного обучения¶

Деревья решений¶

Принцип работы дерева решений¶

Деревья решений (классификация)¶

Как построить дерево решений?¶

Для бинарных признаков¶

Для вещественных признаков¶

Для категориальных признаков¶

Деревья решений (Регрессия)¶

Деревья решений и работа с пропущенными значениями¶

Преимущества и недостатки деревьев решений¶

Почему деревья — очень мощный метод?¶

Неустойчивость деревьев решений

Переобучение деревьев¶

Bias, Variance, Irreducible error¶

Bias¶

Variance¶

Irreducible error¶

Bias vs variance¶

Применительно к деревьям¶

Бутстрэп¶

Корреляция и построение доверительного интервала для нее¶

Пример 1¶

Пример 2¶

Построение доверительного интервала для качества метрики¶

Пример 1 (искусственный)¶

Пример 2 (классифицируем людей с больным сердцем и нет)¶

Ансамбли¶

Корректирующий код¶

Усреднение предсказания классификаторов¶

Зависимость качества ансамбля от качества индивидуального предсказателя и от числа предсказателей¶

Коррелированность моделей¶

Bagging = Bootstrap aggregation¶

Пишем свой bagging¶

Классифицируем людей с больным сердцем и нет, используя bagging¶

Сравним разделяющие плоскости дерева решений и бэггинга на деревьях решений¶

Метод случайных подпространств (RSM, random subspace method)¶

Пишем свой RSM¶

Классифицируем людей с больным сердцем и нет, используя RSM¶

Комбинация RSM и Bagging¶

Почему для одних классов моделей работает, а для других — нет¶

Случайный лес¶

Зависимость качества случайного леса от числа деревьев¶

Зависимость качества случайного леса от глубины дерева¶

Минимальное число объектов в листе¶

Переобучается ли случайный лес?¶

Валидиация случайного леса на Out-Of-Bag (OOB) объектах¶

Случайный лес и bias-variance tradeoff¶

Boosting¶

Gradient boosting (градиентный бустинг)¶

Переобучение¶

Shrinkage (learning rate)¶

Число деревьев в ансамбле¶

Глубина деревьев в градиентном бустинге¶

Минимальное число объектов в листе¶

Параметры subsampling и max_features¶

Понижение learning rate¶

Градиентный бустинг и bias-variance tradeoff¶

Модификации градиентного бустинга¶

XGBoost¶

Параметр min_child_weight¶

LightGBM¶

Максимальное число листьев в дереве¶

CatBoost¶

Про другие реализации случайного леса¶

Про подбор параметров модифицированных бустингов¶

Блендинг и Стэкинг¶

Blending (Блендинг)¶

Стэкинг¶

Стэкинг vs блендинг¶

Некоторые практические рекомендации¶

Какой ML алгоритм в общем случае выбрать: RF или GBDT?¶

Какие деревья решений использовать в качестве элементов случайного леса, а на каких строить градиентный бустинг?¶

У RF значительно меньше гиперпараметров, чем у GBDT. Но как их выбрать?¶

Какие гиперпараметры GBDT реализаций следует подбирать в первую очередь?¶

Применение нейронных сетей к табличным данным¶