import matplotlib.pyplot as plt
import numpy as np


# Reproducible synthetic sample: y = 2 + 3x plus uniform noise in [-0.5, 0.5).
np.random.seed(0)
x = np.random.rand(100, 1)
y = 2 + 3 * x + (np.random.rand(100, 1) - 0.5)

# Figure 1: the raw sample.
plt.figure(figsize=(5, 3))
plt.scatter(x, y, s=10)
plt.xlabel("x")
plt.ylabel("y")
plt.show()


# Figure 2: the sample overlaid with a grid of candidate lines y = b + w * x
# (slopes -5..6 in steps of 1, five intercepts for each slope).
plt.figure(figsize=(5, 3))
plt.scatter(x, y, s=10)
for w in np.arange(-5.0, 7.0, 1):
    for b in [-1, 0, 1, 2, 3]:
        y_predicted = b + w * x
        plt.plot(x, y_predicted, color="r", alpha=0.3)
plt.xlabel("x")
plt.ylabel("y")
plt.show()


# Figure 3: same candidate lines, plus the noise-free generating line
# y = 2 + 3x drawn in green.
plt.figure(figsize=(5, 3))
plt.scatter(x, y, s=10)
for w in np.arange(-5.0, 7.0, 1):
    for b in [-1, 0, 1, 2, 3]:
        y_predicted = b + w * x
        plt.plot(x, y_predicted, color="r", alpha=0.3)
plt.plot(x, 2 + 3 * x, color="g")
plt.xlabel("x")
plt.ylabel("y")
plt.show()


def plot_delta_line(ax, x, y, w, b, color="r"):
    """Draw the line f(x) = w * x + b on ``ax`` together with its residuals.

    Each sample gets a dashed vertical segment between its observed value and
    the line, and the axes title is set to the mean squared error of the line.

    Args:
        ax: Axes-like object providing ``plot``, ``vlines``, ``set_title``
            and ``legend``.
        x: Sample inputs.
        y: Observed targets, aligned with ``x``.
        w: Slope of the line.
        b: Intercept of the line.
        color: Colour of the line.
    """
    fitted = w * x + b

    # The candidate line itself.
    ax.plot(x, fitted, color=color, alpha=0.5, label=f"f(x)={w}x+{b}")

    # One vertical segment per sample, spanning prediction and observation.
    for x_point, observed, predicted in zip(x, y, fitted):
        lower = min(predicted, observed)
        upper = max(predicted, observed)
        ax.vlines(x=x_point, ymin=lower, ymax=upper, ls="--", alpha=0.3)

    # Mean squared error shown as the title.
    mse = np.sum((y - fitted) ** 2) / len(x)
    ax.set_title(f"MSE = {mse:.3f}")
    ax.legend()


# Side-by-side comparison of two candidate lines and their residuals.
fig, axs = plt.subplots(1, 2, figsize=(11, 4))

# plot x_i y_i (dots)
for ax in axs:
    ax.scatter(x, y, s=10)
    ax.set_xlim([0, 1])
    ax.set_ylim([2, 6])
    ax.set_xlabel("x")
    ax.set_ylabel("y")

# Left: slope 4 (red). Right: slope 3, the slope used to generate the data (green).
plot_delta_line(axs[0], x, y, w=4, b=2, color="r")
plot_delta_line(axs[1], x, y, w=3, b=2, color="g")

plt.show()


# Grid of candidate parameters: slopes w in [-10, 30), intercepts b in [-10, 10).
w = np.arange(-10, 30, 1)
b = np.arange(-10, 10, 1)

w, b = np.meshgrid(w, b)

# MSE of the line y = w * x + b for every (w, b) pair on the grid.
# The grids are integer arrays, so the result buffer needs an explicit float
# dtype; with the inherited integer dtype every loss value would be truncated.
loss = np.zeros_like(w, dtype=float)
for i in range(w.shape[0]):
    for j in range(w.shape[1]):
        loss[i, j] = np.sum((y - (w[i, j] * x + b[i, j])) ** 2) / (len(x))

# 3-D loss surface with filled contours projected below it.
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
surf = ax.plot_surface(w, b, loss, cmap=plt.cm.RdYlGn_r, alpha=0.5)

ax.contourf(w, b, loss, zdir="z", offset=-1, cmap="RdYlGn_r", alpha=0.5)
ax.set_zlim(0, 20)

ax.set_xlabel("w")
ax.set_ylabel("b")
ax.set_title("MSE")

fig.colorbar(surf, location="left")
plt.show()


def estimate_coef(x, y):
    """Fit ``y = w * x + b`` by closed-form ordinary least squares.

    Sums are taken along the first axis with NumPy reductions rather than the
    Python builtin ``sum``, which iterates element by element.

    Args:
        x: Inputs, 1-D of length n or a column of shape (n, 1).
        y: Targets with the same shape as ``x``.

    Returns:
        Tuple ``(w, b)``: slope and intercept. For column inputs each is an
        array of shape (1,); for 1-D inputs each is a scalar.
    """
    n = len(x)
    sum_x = np.sum(x, axis=0)
    sum_y = np.sum(y, axis=0)
    sum_xy = np.sum(np.multiply(x, y), axis=0)
    sum_xx = np.sum(np.multiply(x, x), axis=0)

    w = (n * sum_xy - sum_x * sum_y) / (n * sum_xx - sum_x**2)
    b = (sum_y - w * sum_x) / n
    return w, b


# Closed-form least-squares fit on the synthetic sample.
w, b = estimate_coef(x, y)

y_predicted = w * x + b

# w and b are length-1 arrays here (x and y are column vectors), hence the [0].
print(f"Estimated coefficients:\nb = {b[0]:.3f} \nw = {w[0]:.3f}")
print(f"Final equation: \ny = {w[0]:.3f}x +{b[0]:.3f}")

# Sample with the fitted line.
plt.figure(figsize=(5, 3))
plt.scatter(x, y, s=10)
plt.plot(x, y_predicted, color="g")
plt.xlabel("x")
plt.ylabel("y")
plt.show()

Estimated coefficients:
b = 2.058 
w = 2.937
Final equation: 
y = 2.937x +2.058


from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


def print_metrics(y_true, y_predicted):
    """Print MSE, RMSE, MAE and R2 for a set of regression predictions.

    Args:
        y_true: Ground-truth target values.
        y_predicted: Predicted values, aligned with ``y_true``.
    """
    mse = mean_squared_error(y_true, y_predicted)
    # RMSE is derived as sqrt(MSE): the ``squared=False`` flag of
    # ``mean_squared_error`` is deprecated and absent from recent scikit-learn.
    rmse = mse**0.5
    print(f"Mean squared error: {mse:.3f}")
    print(
        "Root mean squared error: ",
        f"{rmse:.3f}",
    )
    print(f"Mean absolute error: {mean_absolute_error(y_true, y_predicted):.3f}")
    print(f"R2 score: {r2_score(y_true, y_predicted):.3f}")


print_metrics(y, y_predicted)

Mean squared error: 0.076
Root mean squared error:  0.276
Mean absolute error: 0.237
R2 score: 0.904


import pandas as pd

# Download the student-scores table (columns "Hours" and "Scores" are used below).
dataset = pd.read_csv(
    "https://edunet.kea.su/repo/EduNet-web_dependencies/datasets/student_scores.csv"
)
print(dataset.shape)
# Bare expression: only displayed when run as a notebook cell.
dataset.head()

(25, 2)


import seaborn as sns

# Scatter of Hours vs Scores with marginal distributions.
sns.jointplot(data=dataset, x="Hours", y="Scores", height=5)
plt.show()


from sklearn.model_selection import train_test_split

x = dataset.iloc[:, :-1].values  # every column but the last (Hours), kept 2-D
y = dataset.iloc[:, 1].values  # second column (Scores), 1-D

# 80/20 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)


from sklearn.linear_model import LinearRegression

regressor = LinearRegression()


# Fit on the training part only.
regressor.fit(x_train, y_train)

LinearRegression()

LinearRegression()


# Shape of the training feature matrix (displayed as notebook cell output).
x_train.shape

(20, 1)


# 100 evenly spaced inputs from the smallest to the largest training value;
# the endpoints are length-1 rows, so x_points stays 2-D as predict() expects.
x_points = np.linspace(min(x_train), max(x_train), 100)  # 100 dots at min to max
y_pred = regressor.predict(x_points)

# Training samples with the fitted regression line.
plt.figure(figsize=(6, 4))
plt.plot(x_train, y_train, "o", label="Scores")
plt.plot(
    x_points,
    y_pred,
    label="y = %.2fx+%.2f" % (regressor.coef_[0], regressor.intercept_),
)
plt.title("Hours vs Percentage", size=12)
plt.xlabel("Hours Studied", size=12)
plt.ylabel("Percentage Score", size=12)
plt.legend()
plt.show()


# Evaluate the fitted line on a dense grid spanning the test inputs.
# (The earlier prediction on x_test was overwritten before use and is dropped.)
x_points = np.linspace(min(x_test), max(x_test), 100)
y_pred = regressor.predict(x_points)

# Test samples with the regression line fitted on the training part.
plt.figure(figsize=(6, 4))
plt.plot(x_test, y_test, "o", label="Scores")
plt.plot(
    x_points,
    y_pred,
    label=f"y = {regressor.coef_[0]:.2f}x+{regressor.intercept_:.2f}",
)
plt.title("Hours vs Percentage", size=12)
plt.xlabel("Hours Studied", size=12)
plt.ylabel("Percentage Score", size=12)
plt.legend()
plt.show()


# Predict on the held-out samples and report the regression metrics.
y_pred = regressor.predict(x_test)
print_metrics(y_test, y_pred)

Mean squared error: 18.943
Root mean squared error:  4.352
Mean absolute error: 3.921
R2 score: 0.968


import numpy as np
import matplotlib.pyplot as plt

# Example function of two variables: f(x, y) = sin(x * y).
f = lambda x, y: np.sin(x * y)

# Evaluate f on a dense 1000 x 1000 grid over [0, 4] x [0, 4].
x = np.linspace(0, 4, 1000)
y = np.linspace(0, 4, 1000)
xx, yy = np.meshgrid(x, y)
zz = f(xx, yy)

fig = plt.figure(figsize=(20, 7))


def show_3d(xx, yy, zz, fig):
    """Draw a 3-D surface with projected contours on the left half of ``fig``.

    Args:
        xx: 2-D grid of x coordinates.
        yy: 2-D grid of y coordinates.
        zz: 2-D grid of surface heights at ``(xx, yy)``.
        fig: Figure that receives the new 3-D subplot (position 121).
    """
    ax = fig.add_subplot(121, projection="3d")
    surf = ax.plot_surface(xx, yy, zz, cmap=plt.cm.RdYlGn_r)

    # Filled contours projected along z onto the plane z = -2.
    # zdir must name an axis; the previous value "zz" was a typo for "z".
    ax.contourf(xx, yy, zz, zdir="z", offset=-2, cmap="RdYlGn_r")
    ax.set_zlim(-2, 2)

    ax.set_xlabel("x")
    ax.set_ylabel("y")
    ax.set_title("sin(xy)")
    fig.colorbar(surf, location="left")


show_3d(xx, yy, zz, fig)
plt.show()


# Analytic gradient of sin(x * y): (df/dx, df/dy) = (y * cos(xy), x * cos(xy)).
gradf = lambda x, y: (np.cos(x * y) * y, np.cos(x * y) * x)

# Coarse 15 x 15 grid so the gradient arrows stay readable.
xsmall = np.linspace(0, 4, 15)
ysmall = np.linspace(0, 4, 15)
xxsmall, yysmall = np.meshgrid(xsmall, ysmall)
gradx, grady = gradf(xxsmall, yysmall)


# Left: 3-D surface. Right: heat map of the same function with gradient arrows.
fig = plt.figure(figsize=(15, 5))
show_3d(xx, yy, zz, fig)

ax = fig.add_subplot(122)
ax.imshow(
    zz,
    extent=(np.min(x), np.max(x), np.min(y), np.max(y)),
    cmap="RdYlGn_r",
    origin="lower",
)
ax.set_xlabel("x")
ax.set_ylabel("y")

ax.quiver(xxsmall, yysmall, gradx, grady)
plt.show()


import pandas as pd
from sklearn.model_selection import train_test_split


# Reload the student-scores table and redo the split: x and y were reused
# for the sin(xy) grid in the meantime.
dataset = pd.read_csv(
    "https://edunet.kea.su/repo/EduNet-web_dependencies/datasets/student_scores.csv"
)

x = dataset.iloc[:, :-1].values  # every column but the last (Hours), kept 2-D
y = dataset.iloc[:, 1].values  # second column (Scores), 1-D

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)


# @title *Code for interactive visual
# source: https://github.com/TomasBeuzen/deep-learning-with-pytorch

!wget -qN https://edunet.kea.su/repo/EduNet-web_dependencies/dev-2.0/L02/interactive_visualization.py


from interactive_visualization import plot_grid_search
from sklearn.metrics import mean_squared_error


# Grid search over the slope only; the intercept is held fixed at 2.83.
# For each slope: predictions, MSE, and the derivative of MSE w.r.t. the slope.
slopes = np.arange(5, 15, 0.5)
prediction = {f"{w}": w * x_train[:, 0] + 2.83 for w in slopes}
mse = np.array([mean_squared_error(y_train, w * x_train[:, 0] + 2.83) for w in slopes])
dmse_dw = np.array(
    [(2 * x_train[:, 0] * (w * x_train[:, 0] + 2.83 - y_train)).mean() for w in slopes]
)
plot_grid_search(x_train[:, 0], y_train, slopes, prediction, mse, dmse_dw)


from sklearn.metrics import mean_absolute_error


# Same grid search with MAE; its derivative w.r.t. the slope is
# mean(x * sign(prediction - y)).
slopes = np.arange(5, 15, 0.5)
prediction = {f"{w}": w * x_train[:, 0] + 2.83 for w in slopes}
mae = np.array([mean_absolute_error(y_train, w * x_train[:, 0] + 2.83) for w in slopes])
dmae_dw = np.array(
    [
        (x_train[:, 0] * np.sign(w * x_train[:, 0] + 2.83 - y_train)).mean()
        for w in slopes
    ]
)
plot_grid_search(x_train[:, 0], y_train, slopes, prediction, mae, dmae_dw)


def gradient(x, y, w, b):
    """Return the derivative of the MSE of ``w * x + b`` with respect to ``w``.

    Args:
        x: Input values.
        y: Target values, aligned with ``x``.
        w: Current slope.
        b: Intercept.
    """
    residual = (w * x + b) - y
    weighted = x * residual
    return 2 * weighted.mean()


def gradient_descent(x_train, y_train, x_test, y_test, w, alpha, b=2.83, iteration=10):
    """Optimize the slope of a simple linear regression by gradient descent.

    Only the slope ``w`` is updated; the intercept ``b`` stays fixed. Progress
    is printed after every step.

    Args:
        x_train: Training inputs.
        y_train: Training targets.
        x_test: Test inputs, used only to track the test loss.
        y_test: Test targets.
        w: Initial slope.
        alpha: Learning rate.
        b: Fixed intercept.
        iteration: Number of update steps.

    Returns:
        Tuple ``(ws, prediction, mse_train, dmse_train, mse_test)``: slope
        history (``iteration + 1`` entries), a dict mapping each visited slope
        to its training predictions, train-MSE history, gradient used at each
        step (``iteration`` entries) and test-MSE history.
    """
    # history
    ws = [w]
    mse_train = [mean_squared_error(y_train, w * x_train + b)]
    dmse_train = []
    mse_test = [mean_squared_error(y_test, w * x_test + b)]
    prediction = {w: w * x_train + b}
    print(
        f"Iteration 0: w = {w:.2f}, Loss_train = {mse_train[0]:.2f}, "
        f"Loss_test = {mse_test[0]:.2f}."
    )
    for i in range(iteration):
        # adjust w based on gradient * learning rate
        grad = gradient(x_train, y_train, w, b)
        w -= alpha * grad
        # history
        ws.append(w)
        mse_train.append(mean_squared_error(y_train, w * x_train + b))
        dmse_train.append(grad)
        mse_test.append(mean_squared_error(y_test, w * x_test + b))
        prediction[w] = w * x_train + b
        # Report the losses of the updated slope (last history entry);
        # indexing with ``i`` would print the losses from before this step.
        print(
            f"Iteration {i+1}: w = {w:.2f}, Loss_train = {mse_train[-1]:.2f}, "
            f"Loss_test = {mse_test[-1]:3.2f}."
        )
    return ws, prediction, mse_train, dmse_train, mse_test


# Slope-only gradient descent from w = 5 with learning rate 0.01, 7 steps.
slopes, prediction, mse_train, dmse_train, mse_test = gradient_descent(
    x_train[:, 0], y_train, x_test[:, 0], y_test, w=5, alpha=0.01, iteration=7
)

Iteration 0: w = 5.00, Loss_train = 707.84, Loss_test = 793.54.
Iteration 1: w = 7.89, Loss_train = 707.84, Loss_test = 793.54.
Iteration 2: w = 9.00, Loss_train = 130.56, Loss_test = 145.80.
Iteration 3: w = 9.42, Loss_train = 45.98, Loss_test = 42.65.
Iteration 4: w = 9.58, Loss_train = 33.58, Loss_test = 24.38.
Iteration 5: w = 9.64, Loss_train = 31.77, Loss_test = 20.49.
Iteration 6: w = 9.67, Loss_train = 31.50, Loss_test = 19.46.
Iteration 7: w = 9.68, Loss_train = 31.46, Loss_test = 19.13.


from interactive_visualization import plot_gradient_descent

# Interactive view of the slope trajectory recorded by the run above.
plot_gradient_descent(x_train[:, 0], y_train, slopes, prediction, mse_train, dmse_train)


def plot_mse(mse_train, mse_test):
    """Plot the train and test MSE histories as a learning curve.

    Args:
        mse_train: Sequence of training losses, one per iteration.
        mse_test: Sequence of test losses, one per iteration.
    """
    plt.figure(figsize=(10, 4))
    plt.title("Learning curve")
    for history, name in ((mse_train, "train"), (mse_test, "test")):
        plt.plot(history, label=name)
    plt.legend()

    plt.xlabel("iterations", fontsize=12)
    plt.ylabel("MSE Loss", fontsize=12)

    plt.grid(True)
    plt.show()


# Learning curve of the alpha = 0.01 run.
plot_mse(mse_train, mse_test)


# Much smaller learning rate (0.0005): same start, 30 steps.
slopes, prediction, mse_train, dmse_train, mse_test = gradient_descent(
    x_train[:, 0], y_train, x_test[:, 0], y_test, w=5, alpha=0.0005, iteration=30
)

Iteration 0: w = 5.00, Loss_train = 707.84, Loss_test = 793.54.
Iteration 1: w = 5.14, Loss_train = 707.84, Loss_test = 793.54.
Iteration 2: w = 5.28, Loss_train = 666.74, Loss_test = 748.16.
Iteration 3: w = 5.42, Loss_train = 628.13, Loss_test = 705.49.
Iteration 4: w = 5.55, Loss_train = 591.87, Loss_test = 665.36.
Iteration 5: w = 5.68, Loss_train = 557.82, Loss_test = 627.61.
Iteration 6: w = 5.80, Loss_train = 525.83, Loss_test = 592.12.
Iteration 7: w = 5.92, Loss_train = 495.79, Loss_test = 558.73.
Iteration 8: w = 6.04, Loss_train = 467.57, Loss_test = 527.33.
Iteration 9: w = 6.15, Loss_train = 441.07, Loss_test = 497.79.
Iteration 10: w = 6.26, Loss_train = 416.17, Loss_test = 470.01.
Iteration 11: w = 6.37, Loss_train = 392.79, Loss_test = 443.88.
Iteration 12: w = 6.47, Loss_train = 370.84, Loss_test = 419.29.
Iteration 13: w = 6.57, Loss_train = 350.21, Loss_test = 396.16.
Iteration 14: w = 6.66, Loss_train = 330.84, Loss_test = 374.40.
Iteration 15: w = 6.76, Loss_train = 312.65, Loss_test = 353.92.
Iteration 16: w = 6.85, Loss_train = 295.56, Loss_test = 334.65.
Iteration 17: w = 6.93, Loss_train = 279.51, Loss_test = 316.53.
Iteration 18: w = 7.02, Loss_train = 264.44, Loss_test = 299.47.
Iteration 19: w = 7.10, Loss_train = 250.28, Loss_test = 283.41.
Iteration 20: w = 7.18, Loss_train = 236.98, Loss_test = 268.30.
Iteration 21: w = 7.26, Loss_train = 224.49, Loss_test = 254.08.
Iteration 22: w = 7.33, Loss_train = 212.76, Loss_test = 240.70.
Iteration 23: w = 7.41, Loss_train = 201.74, Loss_test = 228.10.
Iteration 24: w = 7.48, Loss_train = 191.39, Loss_test = 216.23.
Iteration 25: w = 7.54, Loss_train = 181.67, Loss_test = 205.07.
Iteration 26: w = 7.61, Loss_train = 172.55, Loss_test = 194.55.
Iteration 27: w = 7.67, Loss_train = 163.97, Loss_test = 184.65.
Iteration 28: w = 7.74, Loss_train = 155.92, Loss_test = 175.33.
Iteration 29: w = 7.80, Loss_train = 148.35, Loss_test = 166.56.
Iteration 30: w = 7.85, Loss_train = 141.25, Loss_test = 158.29.


# Trajectory and learning curve of the alpha = 0.0005 run.
plot_gradient_descent(x_train[:, 0], y_train, slopes, prediction, mse_train, dmse_train)


plot_mse(mse_train, mse_test)


# Larger learning rate (0.027): same start, 15 steps.
slopes, prediction, mse_train, dmse_train, mse_test = gradient_descent(
    x_train[:, 0], y_train, x_test[:, 0], y_test, w=5, alpha=0.027, iteration=15
)

Iteration 0: w = 5.00, Loss_train = 707.84, Loss_test = 793.54.
Iteration 1: w = 12.80, Loss_train = 707.84, Loss_test = 793.54.
Iteration 2: w = 7.60, Loss_train = 331.92, Loss_test = 300.22.
Iteration 3: w = 11.07, Loss_train = 164.92, Loss_test = 185.76.
Iteration 4: w = 8.76, Loss_train = 90.74, Loss_test = 65.14.
Iteration 5: w = 10.30, Loss_train = 57.79, Loss_test = 58.06.
Iteration 6: w = 9.27, Loss_train = 43.15, Loss_test = 23.93.
Iteration 7: w = 9.96, Loss_train = 36.65, Loss_test = 29.42.
Iteration 8: w = 9.50, Loss_train = 33.76, Loss_test = 18.09.
Iteration 9: w = 9.80, Loss_train = 32.48, Loss_test = 22.24.
Iteration 10: w = 9.60, Loss_train = 31.91, Loss_test = 17.96.
Iteration 11: w = 9.74, Loss_train = 31.66, Loss_test = 20.14.
Iteration 12: w = 9.65, Loss_train = 31.54, Loss_test = 18.39.
Iteration 13: w = 9.71, Loss_train = 31.49, Loss_test = 19.42.
Iteration 14: w = 9.67, Loss_train = 31.47, Loss_test = 18.67.
Iteration 15: w = 9.69, Loss_train = 31.46, Loss_test = 19.15.


# Trajectory and learning curve of the alpha = 0.027 run.
plot_gradient_descent(x_train[:, 0], y_train, slopes, prediction, mse_train, dmse_train)


plot_mse(mse_train, mse_test)


# Even larger learning rate (0.034): same start, 5 steps.
slopes, prediction, mse_train, dmse_train, mse_test = gradient_descent(
    x_train[:, 0],
    y_train,
    x_test[:, 0],
    y_test,
    w=5,
    alpha=0.034,
    iteration=5,
)

Iteration 0: w = 5.00, Loss_train = 707.84, Loss_test = 793.54.
Iteration 1: w = 14.82, Loss_train = 707.84, Loss_test = 793.54.
Iteration 2: w = 4.03, Loss_train = 847.72, Loss_test = 823.37.
Iteration 3: w = 15.89, Loss_train = 1016.53, Loss_test = 1132.95.
Iteration 4: w = 2.86, Loss_train = 1220.26, Loss_test = 1206.01.
Iteration 5: w = 17.17, Loss_train = 1466.11, Loss_test = 1624.33.


# Trajectory and learning curve of the alpha = 0.034 run.
plot_gradient_descent(x_train[:, 0], y_train, slopes, prediction, mse_train, dmse_train)


plot_mse(mse_train, mse_test)


# Column vector of parameters: first row is the intercept, second the slope.
w = np.array([[0.5], [5]])
w

array([[0.5],
       [5. ]])


# Prepend a column of ones so the intercept becomes part of the product x @ w.
x_train = np.hstack((np.ones((x_train.shape[0], 1)), x_train))
x_test = np.hstack((np.ones((x_test.shape[0], 1)), x_test))
x_test

array([[1. , 8.3],
       [1. , 2.5],
       [1. , 2.5],
       [1. , 6.9],
       [1. , 5.9]])


# Shapes of the design matrix and the parameter vector (notebook display).
x_train.shape, w.shape

((20, 2), (2, 1))


# Predictions for all test rows in one matrix product: intercept + slope * x.
y_pred = x_test @ w
y_pred

array([[42.],
       [13.],
       [13.],
       [35.],
       [30.]])


# Turn the targets into column vectors so they line up with x @ w.
y_train = np.expand_dims(y_train, axis=1)
y_test = np.expand_dims(y_test, axis=1)


def gradient(x, y, w):
    """Return the gradient of the mean squared error of ``x @ w`` w.r.t. ``w``.

    Args:
        x: Design matrix, one row per sample.
        y: Targets as a column aligned with the rows of ``x``.
        w: Parameter column vector.
    """
    x_t = x.T
    predicted_term = x_t @ (x @ w)
    target_term = x_t @ y
    return 2 * (predicted_term - target_term) / len(x)


def gradient_descent(x_train, y_train, x_test, y_test, w, alpha, iteration=10):
    """Fit intercept and slope of a linear regression by gradient descent.

    Progress is printed after every step.

    Args:
        x_train: Training design matrix with a leading column of ones.
        y_train: Training targets as a column vector.
        x_test: Test design matrix, used only to track the test loss.
        y_test: Test targets as a column vector.
        w: Initial parameters as a (2, 1) array, ``[[b], [w]]``. The array is
            copied and left unchanged.
        alpha: Learning rate.
        iteration: Number of update steps.

    Returns:
        Tuple ``(ws, prediction, mse_train, dmse_train, mse_test)``: parameter
        history of shape ``(iteration + 1, 2)``, a dict mapping each visited
        ``(b, w)`` pair to its training predictions, train-MSE history,
        gradient used at each step and test-MSE history.
    """
    # Work on a float copy: the in-place update below would otherwise modify
    # the caller's array and fail outright on an integer array.
    w = np.array(w, dtype=float)

    # history
    ws = np.zeros((iteration + 1, 2))
    ws[0] = w[:, 0]
    mse_train = [mean_squared_error(y_train, x_train @ w)]
    dmse_train = []
    mse_test = [mean_squared_error(y_test, x_test @ w)]
    prediction = {(w[0][0], w[1][0]): x_train @ w}

    print(
        f"Iteration 0: b = {w[0][0]:.2f}, w = {w[1][0]:.2f}, "
        f"Loss_train = {mse_train[0]:.2f}, "
        f"Loss_test = {mse_test[0]:.2f}."
    )

    for i in range(iteration):
        # adjust w based on gradient * learning rate
        grad = gradient(x_train, y_train, w)
        w -= alpha * grad
        # history
        ws[i + 1] = w[:, 0]
        mse_train.append(mean_squared_error(y_train, x_train @ w))
        dmse_train.append(grad)
        mse_test.append(mean_squared_error(y_test, x_test @ w))
        prediction[(w[0][0], w[1][0])] = x_train @ w

        # Report the losses of the updated parameters (last history entry);
        # indexing with ``i`` would print the losses from before this step.
        print(
            f"Iteration {i+1}: b = {w[0][0]:.2f}, w = {w[1][0]:.2f}, "
            f"Loss_train = {mse_train[-1]:.2f}, "
            f"Loss_test = {mse_test[-1]:3.2f}."
        )
    return ws, prediction, mse_train, dmse_train, mse_test


# Two-parameter gradient descent on the raw feature: start at b = 0.5, w = 5,
# learning rate 0.01, default number of steps.
w = np.array([[0.5], [5]])
ws, prediction, mse_train, dmse_train, mse_test = gradient_descent(
    x_train,
    y_train,
    x_test,
    y_test,
    w,
    0.01,
)

Iteration 0: b = 0.50, w = 5.00, Loss_train = 821.48, Loss_test = 915.80.
Iteration 1: b = 1.01, w = 8.12, Loss_train = 821.48, Loss_test = 915.80.
Iteration 2: b = 1.20, w = 9.26, Loss_train = 138.11, Loss_test = 152.87.
Iteration 3: b = 1.28, w = 9.68, Loss_train = 46.20, Loss_test = 41.48.
Iteration 4: b = 1.31, w = 9.84, Loss_train = 33.84, Loss_test = 23.27.
Iteration 5: b = 1.32, w = 9.89, Loss_train = 32.17, Loss_test = 19.64.
Iteration 6: b = 1.33, w = 9.91, Loss_train = 31.95, Loss_test = 18.72.
Iteration 7: b = 1.34, w = 9.92, Loss_train = 31.91, Loss_test = 18.44.
Iteration 8: b = 1.34, w = 9.92, Loss_train = 31.90, Loss_test = 18.35.
Iteration 9: b = 1.35, w = 9.92, Loss_train = 31.90, Loss_test = 18.31.
Iteration 10: b = 1.36, w = 9.92, Loss_train = 31.90, Loss_test = 18.30.


plot_mse(mse_train, mse_test)


from interactive_visualization import plot_grid_search_2d

# Loss landscape over a grid of intercepts and slopes for the raw feature
# (column 1 of x_train; column 0 is the constant ones column).
intercepts = np.arange(-7.5, 12.5, 0.1)  # b
slopes = np.arange(5, 15, 0.1)  # w
plot_grid_search_2d(x_train[:, 1], y_train, slopes, intercepts)


from interactive_visualization import plot_gradient_descent_2d

# Path taken by gradient descent (ws) drawn over the same grid.
plot_gradient_descent_2d(
    x_train[:, 1],
    y_train[:, 0],
    ws,
    slopes,
    intercepts,
)


from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Standardize the feature: statistics are fitted on the training part only
# and then applied to the test part.
x_train_scaled = scaler.fit_transform(np.expand_dims(x_train[:, 1], axis=1)).flatten()
x_test_scaled = scaler.transform(np.expand_dims(x_test[:, 1], axis=1)).flatten()


# Parameter grid for the standardized feature.
intercepts = np.arange(40, 60, 0.1)  # b
slopes = np.arange(15, 35, 0.1)  # w

plot_grid_search_2d(x_train_scaled, y_train, slopes, intercepts)


# Rebuild the design matrices: ones column followed by the scaled feature.
x_train_scaled = np.hstack(
    (np.ones((len(x_train_scaled), 1)), np.expand_dims(x_train_scaled, axis=1)),
)

x_test_scaled = np.hstack(
    (np.ones((len(x_test_scaled), 1)), np.expand_dims(x_test_scaled, axis=1)),
)


# Gradient descent on the scaled feature: start at b = 57, w = 33,
# learning rate 0.35, 10 steps.
w = np.array([[57.0], [33.0]])
ws, prediction, mse_train, dmse_train, mse_test = gradient_descent(
    x_train_scaled, y_train, x_test_scaled, y_test, w, 0.35, iteration=10
)

Iteration 0: b = 57.00, w = 33.00, Loss_train = 146.31, Loss_test = 108.53.
Iteration 1: b = 52.70, w = 26.86, Loss_train = 146.31, Loss_test = 108.53.
Iteration 2: b = 51.40, w = 25.01, Loss_train = 41.79, Loss_test = 21.10.
Iteration 3: b = 51.02, w = 24.46, Loss_train = 32.39, Loss_test = 17.36.
Iteration 4: b = 50.90, w = 24.29, Loss_train = 31.54, Loss_test = 18.27.
Iteration 5: b = 50.86, w = 24.24, Loss_train = 31.46, Loss_test = 18.72.
Iteration 6: b = 50.85, w = 24.23, Loss_train = 31.46, Loss_test = 18.88.
Iteration 7: b = 50.85, w = 24.23, Loss_train = 31.45, Loss_test = 18.92.
Iteration 8: b = 50.85, w = 24.22, Loss_train = 31.45, Loss_test = 18.94.
Iteration 9: b = 50.85, w = 24.22, Loss_train = 31.45, Loss_test = 18.94.
Iteration 10: b = 50.85, w = 24.22, Loss_train = 31.45, Loss_test = 18.94.


plot_mse(mse_train, mse_test)


# Map the parameters fitted on the standardized feature back to the original
# scale. With x_scaled = (x - mean) / std and y = w_s * x_scaled + b_s:
#   w = w_s / std,  b = b_s - w_s * mean / std.
b = ws[-1][0] - ws[-1][1] * scaler.mean_ / (scaler.var_) ** 0.5
w = ws[-1][1] / (scaler.var_) ** 0.5

print(f"y = {w[0]:.2f}x + {b[0]:.2f}")

y = 9.68x + 2.83


# Gradient-descent path on the standardized feature over the scaled grid.
plot_gradient_descent_2d(
    x_train_scaled[:, 1],
    y_train[:, 0],
    ws,
    slopes,
    intercepts,
)


def stochastic_gradient_descent(
    x_train,
    y_train,
    x_test,
    y_test,
    w,
    alpha,
    iteration=10,
    batch_size=None,
):
    """Fit intercept and slope by batch, mini-batch or stochastic gradient descent.

    Each step computes the gradient on ``batch_size`` rows drawn at random
    (with replacement, via ``np.random.choice``) from the training data, or on
    the full training set when ``batch_size`` is falsy. Losses are always
    measured on the full train and test sets. Progress is printed every 10
    steps.

    Args:
        x_train: Training design matrix with a leading column of ones.
        y_train: Training targets as a column vector.
        x_test: Test design matrix, used only to track the test loss.
        y_test: Test targets as a column vector.
        w: Initial parameters as a (2, 1) array, ``[[b], [w]]``. The array is
            copied and left unchanged.
        alpha: Learning rate.
        iteration: Number of update steps.
        batch_size: Rows sampled per step; ``None`` uses all rows.

    Returns:
        Tuple ``(ws, prediction, mse_train, dmse_train, mse_test)``: parameter
        history of shape ``(iteration + 1, 2)``, a dict mapping each visited
        ``(b, w)`` pair to its training predictions, train-MSE history,
        gradient used at each step and test-MSE history.
    """
    # Work on a float copy: the in-place update below would otherwise modify
    # the caller's array and fail outright on an integer array.
    w = np.array(w, dtype=float)

    # history
    ws = np.zeros((iteration + 1, 2))
    ws[0] = w[:, 0]
    mse_train = [mean_squared_error(y_train, x_train @ w)]
    dmse_train = []
    mse_test = [mean_squared_error(y_test, x_test @ w)]
    prediction = {(w[0][0], w[1][0]): x_train @ w}

    print(
        f"Iteration 0: b = {w[0][0]:.2f}, w = {w[1][0]:.2f}, "
        f"Loss_train = {mse_train[0]:.2f}, "
        f"Loss_test = {mse_test[0]:.2f}."
    )

    for i in range(iteration):
        if not batch_size:
            x_sample = x_train
            y_sample = y_train
        else:
            indxs = np.random.choice(x_train.shape[0], batch_size)
            x_sample = x_train[indxs, :]
            y_sample = y_train[indxs, :]

        # adjust w based on gradient * learning rate
        grad = gradient(x_sample, y_sample, w)
        w -= alpha * grad
        # history
        ws[i + 1] = w[:, 0]
        mse_train.append(mean_squared_error(y_train, x_train @ w))
        dmse_train.append(grad)
        mse_test.append(mean_squared_error(y_test, x_test @ w))
        prediction[(w[0][0], w[1][0])] = x_train @ w
        if (i + 1) % 10 == 0:
            # Report the losses of the updated parameters (last history
            # entry); indexing with ``i`` would print the previous step's.
            print(
                f"Iteration {i+1}: b = {w[0][0]:.2f}, w = {w[1][0]:.2f}, "
                f"Loss_train = {mse_train[-1]:.2f}, "
                f"Loss_test = {mse_test[-1]:3.2f}."
            )
    return ws, prediction, mse_train, dmse_train, mse_test


# Full-batch run (batch_size=None): every step uses the whole training set.
w = np.array([[57.0], [33.0]])
ws, prediction, mse_train, dmse_train, mse_test = stochastic_gradient_descent(
    x_train_scaled,
    y_train,
    x_test_scaled,
    y_test,
    w,
    0.02,
    iteration=100,
    batch_size=None,
)

# Keep the figure for the side-by-side panel built later.
f1 = plot_gradient_descent_2d(
    x_train_scaled[:, 1],
    y_train[:, 0],
    ws,
    slopes,
    intercepts,
    mode="lines",
    title="Batch gradient descent",
)

Iteration 0: b = 57.00, w = 33.00, Loss_train = 146.31, Loss_test = 108.53.
Iteration 10: b = 54.94, w = 30.06, Loss_train = 86.54, Loss_test = 55.92.
Iteration 20: b = 53.57, w = 28.10, Loss_train = 55.80, Loss_test = 30.94.
Iteration 30: b = 52.66, w = 26.80, Loss_train = 42.22, Loss_test = 21.36.
Iteration 40: b = 52.05, w = 25.94, Loss_train = 36.21, Loss_test = 18.09.
Iteration 50: b = 51.65, w = 25.36, Loss_train = 33.56, Loss_test = 17.29.
Iteration 60: b = 51.38, w = 24.98, Loss_train = 32.38, Loss_test = 17.36.
Iteration 70: b = 51.20, w = 24.73, Loss_train = 31.87, Loss_test = 17.68.
Iteration 80: b = 51.08, w = 24.56, Loss_train = 31.64, Loss_test = 18.01.
Iteration 90: b = 51.01, w = 24.45, Loss_train = 31.54, Loss_test = 18.28.
Iteration 100: b = 50.95, w = 24.37, Loss_train = 31.49, Loss_test = 18.49.


# Stochastic run (batch_size=1): one randomly drawn sample per step.
# The seed fixes which samples are drawn.
np.random.seed(42)
w = np.array([[57.0], [33.0]])
ws_stohastic, prediction, mse_train, dmse_train, mse_test = stochastic_gradient_descent(
    x_train_scaled,
    y_train,
    x_test_scaled,
    y_test,
    w,
    0.02,
    iteration=100,
    batch_size=1,
)
f2 = plot_gradient_descent_2d(
    x_train_scaled[:, 1],
    y_train[:, 0],
    ws_stohastic,
    slopes,
    intercepts,
    mode="lines",
    title="Stochastic gradient descent",
)

Iteration 0: b = 57.00, w = 33.00, Loss_train = 146.31, Loss_test = 108.53.
Iteration 10: b = 54.17, w = 29.83, Loss_train = 74.29, Loss_test = 45.31.
Iteration 20: b = 54.43, w = 28.49, Loss_train = 62.91, Loss_test = 37.09.
Iteration 30: b = 52.67, w = 26.34, Loss_train = 42.71, Loss_test = 22.04.
Iteration 40: b = 51.48, w = 23.96, Loss_train = 32.13, Loss_test = 18.45.
Iteration 50: b = 52.13, w = 25.01, Loss_train = 33.79, Loss_test = 17.58.
Iteration 60: b = 51.83, w = 25.15, Loss_train = 33.67, Loss_test = 17.93.
Iteration 70: b = 52.79, w = 25.67, Loss_train = 37.54, Loss_test = 19.11.
Iteration 80: b = 51.72, w = 24.74, Loss_train = 32.47, Loss_test = 17.52.
Iteration 90: b = 51.25, w = 25.29, Loss_train = 32.72, Loss_test = 17.30.
Iteration 100: b = 50.82, w = 24.18, Loss_train = 31.49, Loss_test = 18.89.


# Mini-batch run (batch_size=5): five randomly drawn samples per step.
np.random.seed(42)
w = np.array([[57.0], [33.0]])
(
    ws_mini_batch,
    prediction,
    mse_train,
    dmse_train,
    mse_test,
) = stochastic_gradient_descent(
    x_train_scaled,
    y_train,
    x_test_scaled,
    y_test,
    w,
    0.02,
    iteration=100,
    batch_size=5,
)
f3 = plot_gradient_descent_2d(
    x_train_scaled[:, 1],
    y_train[:, 0],
    ws_mini_batch,
    slopes,
    intercepts,
    mode="lines",
    title="Mini-batch gradient descent",
)

Iteration 0: b = 57.00, w = 33.00, Loss_train = 146.31, Loss_test = 108.53.
Iteration 10: b = 54.89, w = 29.67, Loss_train = 79.49, Loss_test = 50.29.
Iteration 20: b = 53.54, w = 27.93, Loss_train = 52.77, Loss_test = 28.57.
Iteration 30: b = 52.28, w = 26.91, Loss_train = 42.28, Loss_test = 21.22.
Iteration 40: b = 51.72, w = 25.83, Loss_train = 35.69, Loss_test = 17.80.
Iteration 50: b = 51.10, w = 24.92, Loss_train = 32.10, Loss_test = 17.53.
Iteration 60: b = 50.84, w = 24.91, Loss_train = 32.06, Loss_test = 17.59.
Iteration 70: b = 51.04, w = 24.98, Loss_train = 32.12, Loss_test = 17.46.
Iteration 80: b = 51.28, w = 24.72, Loss_train = 31.93, Loss_test = 17.63.
Iteration 90: b = 51.03, w = 24.41, Loss_train = 31.56, Loss_test = 18.33.
Iteration 100: b = 50.85, w = 23.98, Loss_train = 31.48, Loss_test = 19.36.


from interactive_visualization import plot_panel

# Show the batch, stochastic and mini-batch trajectories together.
plot_panel(f1, f2, f3)


import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# Pale three-colour map used to shade predicted regions in the classifier plots.
custom_cmap = ListedColormap(["#B8E1EC", "#bea6ff", "#FEE7D0"])


def generate_data(total_len=40):
    """Generate two 1-D clusters with labels 0 and 1.

    Half of the points are drawn uniformly from [14, 21) with label 0, the
    other half uniformly from [24, 33) with label 1. An odd ``total_len`` is
    rounded down to the nearest even count.

    Args:
        total_len: Requested total number of points.

    Returns:
        Tuple ``(x, y)`` of 1-D arrays: values and their labels.
    """
    half = total_len // 2
    low_group = np.random.uniform(14, 21, half)
    high_group = np.random.uniform(24, 33, half)
    x = np.concatenate([low_group, high_group])
    y = np.concatenate([np.zeros(half), np.ones(half)])
    return x, y


def plot_data_1d(
    x,
    y,
    total_len=40,
    s=50,
    threshold=None,
    margin=None,
    legend=("Normal", "Obese"),
    marker="o",
):
    """Scatter 1-D labelled points on a line, optionally with a threshold.

    Args:
        x: 1-D values to plot along the horizontal axis.
        y: Labels used to colour the points.
        total_len: Unused; kept for backward compatibility.
        s: Marker size.
        threshold: Decision threshold. When given (including 0), the regions
            on either side are shaded and a vertical line is drawn at it.
        margin: Iterable of x positions for dashed vertical margin lines.
        legend: Legend labels, one per class.
        marker: Marker style.

    Returns:
        The axes the data was drawn on.
    """
    ax = sns.scatterplot(x=x, y=np.zeros(len(x)), hue=y, s=s, marker=marker)
    # Compare against None so that a threshold of exactly 0 is still drawn.
    if threshold is not None:
        x_lim, y_lim = ax.get_xlim(), ax.get_ylim()
        XX, YY = np.meshgrid(
            np.linspace(x_lim[0], x_lim[1], 100), np.linspace(y_lim[0], y_lim[1], 100)
        )
        # Sign of the distance to the threshold picks the shaded side.
        pred = np.sign(XX - threshold)
        ax.contourf(XX, YY, pred, alpha=0.3, cmap=custom_cmap)
        ax.axvline(threshold, color="grey")
    if margin is not None:
        for line in margin:
            ax.axvline(line, color="grey", ls="dashed")
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, list(legend))
    ax.set(xlabel="Mass, g")
    return ax


# Seeded 1-D sample shown with a hand-picked threshold of 21.5.
total_len = 40
np.random.seed(42)
x, y = generate_data(total_len=total_len)
plt.figure(figsize=(5, 3))
ax = plot_data_1d(x, y, threshold=21.5, total_len=total_len)


# Five new points drawn uniformly from [14, 30) to be classified.
x_test = np.random.uniform(14, 30, 5)


def classify(x, threshold=21.5):
    """Label each value 1 if it is strictly above ``threshold``, else 0.

    Args:
        x: NumPy array of values.
        threshold: Decision threshold.

    Returns:
        Array of the same shape and dtype as ``x`` holding 0 or 1.
    """
    above = x > threshold
    return np.where(above, np.ones_like(x), np.zeros_like(x))


total_len = 40
threshold = 21.5

# Training points plus the new points (large stars) labelled by the fixed threshold.
plt.figure(figsize=(5, 3))
ax = plot_data_1d(x, y, threshold=threshold, total_len=total_len)
ax = plot_data_1d(
    x_test, classify(x_test, threshold), total_len=total_len, s=500, marker="*"
)


normal_limit = x[y == 0].max()  # extreme point for 'normal'
obese_limit = x[y == 1].min()  # extreme point for 'obese'

# Threshold placed midway between the two extreme points.
threshold = np.mean([normal_limit, obese_limit])  # separated with mean value

plt.figure(figsize=(5, 3))
ax = plot_data_1d(
    x, y, total_len=total_len, threshold=threshold, margin=[normal_limit, obese_limit]
)
ax = plot_data_1d(
    x_test,
    classify(x_test, threshold=threshold),
    total_len=total_len,
    s=500,
    marker="*",
)


# Distance of each new point from the threshold.
margins = np.abs(x_test - threshold)
print(margins)

[6.65109958 0.68088078 8.053495   5.94541509 4.46323163]


# Distance from the threshold to the nearest point of each class.
margin_0 = np.abs(normal_limit - threshold)
margin_1 = np.abs(obese_limit - threshold)
print(margin_0, margin_1)

1.8143423746730107 1.8143423746730072


from sklearn.datasets import make_blobs


def generate_2d_data(total_len=40):
    """Generate two 2-D blobs with a fixed seed, shifted by (+10, +20).

    Args:
        total_len: Number of samples to generate.

    Returns:
        Tuple ``(x, y)``: shifted sample coordinates and their blob labels.
    """
    points, labels = make_blobs(n_samples=total_len, centers=2, random_state=42)
    # Shift both feature columns at once.
    points += (10, 20)
    return points, labels


# Two-feature sample: scatter of both features coloured by class.
total_len = 40
x, y = generate_2d_data(total_len=total_len)

fig = plt.figure()
ax = sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=y, s=50)
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, ["Normal", "Obese"])
ax.set(xlabel="Mass, g", ylabel="Length, cm")
plt.show()


from sklearn import svm

# Code for illustration, later we will understand how it works
# fit the model, don't regularize for illustration purposes
clf = svm.SVC(kernel="linear", C=1000)
clf.fit(x, y)

fig, axs = plt.subplots(1, 2, figsize=(10, 3))

# first fig: the samples in feature space
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=y, s=50, ax=axs[0])
handles, labels = axs[0].get_legend_handles_labels()
axs[0].legend(handles, ["Normal", "Obese"])
axs[0].set(xlabel="Mass, g", ylabel="Length, cm")

# plot the decision function
delta = 0.5
# create grid to evaluate model (data range padded by delta on each side)
YY, XX = np.meshgrid(
    np.linspace(x[:, 1].min() - delta, x[:, 1].max() + delta, 30),
    np.linspace(x[:, 0].min() - delta, x[:, 0].max() + delta, 30),
)
xy = np.vstack([XX.ravel(), YY.ravel()]).T
Z = clf.decision_function(xy).reshape(XX.shape)
# shade each side of the boundary by the sign of the decision function
pred = np.sign(Z)
axs[0].contourf(XX, YY, pred, alpha=0.3, cmap=custom_cmap)

# plot decision boundary and margins (levels 0 and -1 / +1)
axs[0].contour(
    XX, YY, Z, colors="k", levels=[-1, 0, 1], alpha=0.5, linestyles=["--", "-", "--"]
)
# plot support vectors
axs[0].scatter(
    clf.support_vectors_[:, 0],
    clf.support_vectors_[:, 1],
    s=100,
    linewidth=1,
    facecolors="none",
    edgecolors="k",
)

# second fig: the same samples placed on a line by their decision value
dec_val = clf.decision_function(x)
sns.scatterplot(x=dec_val, y=np.zeros(len(x)), hue=y, ax=axs[1])

x_lim, y_lim = axs[1].get_xlim(), axs[1].get_ylim()
XX, YY = np.meshgrid(
    np.linspace(x_lim[0], x_lim[1], 100), np.linspace(y_lim[0], y_lim[1], 100)
)
pred = np.sign(XX)
axs[1].contourf(XX, YY, pred, alpha=0.3, cmap=custom_cmap)

# boundary at 0, margins at -1 and +1
axs[1].axvline(0, color="grey")
axs[1].axvline(-1, color="grey", ls="dashed")
axs[1].axvline(1, color="grey", ls="dashed")
handles, labels = axs[1].get_legend_handles_labels()
axs[1].legend(handles, ["Normal", "Obese"])
axs[1].set(xlabel="wx+b")

# circle the support vectors on the 1-D view as well
sv = clf.decision_function(clf.support_vectors_)
axs[1].scatter(
    sv, np.zeros_like(sv), s=100, linewidth=1, facecolors="none", edgecolors="k"
)
plt.show()


def generate_3d_data(total_len=40):
    """Generate two 3-D blobs with a fixed seed, shifted by (+10, +20, +10).

    Args:
        total_len: Number of samples to generate.

    Returns:
        Tuple ``(x, y)``: shifted sample coordinates and their blob labels.
    """
    points, labels = make_blobs(
        n_samples=total_len, centers=2, random_state=42, n_features=3
    )
    # Shift all three feature columns at once.
    points += (10, 20, 10)
    return points, labels


def plot_data(x, y, total_len=40, s=50, threshold=21.5):
    """Scatter 3-feature samples in 3-D with an illustrative surface.

    The surface is the hard-coded function ``z = 0.2 * x * y`` evaluated over
    the current x/y axis limits; it is not derived from any fitted model.

    Args:
        x: Sample matrix whose first three columns are plotted.
        y: Labels used to colour the points.
        total_len: Unused; kept for backward compatibility.
        s: Marker size.
        threshold: Unused; kept for backward compatibility.

    Returns:
        The 3-D axes the data was drawn on.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111, projection="3d")
    ax.scatter(
        xs=x[:, 0], ys=x[:, 1], zs=x[:, 2], c=y, s=s, cmap="tab10", vmin=0, vmax=9
    )

    # Grid covering the axis limits set by the scatter.
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    xx = np.linspace(xlim[0], xlim[1], 30)
    yy = np.linspace(ylim[0], ylim[1], 30)
    YY, XX = np.meshgrid(yy, xx)

    ax.plot_surface(XX, YY, XX * YY * 0.2, alpha=0.2)
    ax.set(xlabel="Mass, g", ylabel="Length, cm", zlabel="Age, days")
    return ax


# Three-feature sample in 3-D.
total_len = 40
x, y = generate_3d_data(total_len=total_len)
ax = plot_data(x, y, total_len=total_len)


# Four-class sample: one blob around each corner of a square.
centers = [[1, 1], [1, -1], [-1, -1], [-1, 1]]

x, y = make_blobs(n_samples=300, centers=centers, cluster_std=0.50, random_state=42)


# Per-class colours: dark for lines, bright for points, dull for region shading.
dark_colors = ["#1B1464", "#0961A5", "#754C24", "#006837"]
bright_colors = ["#5D5DA6", "#2DA9E1", "#F9B041", "#4AAE4D"]
dull_cmap = ListedColormap(["#D1D5ED", "#B8E1EC", "#FEE7D0", "#C9E3C8"])

fig, ax = plt.subplots(1, 1, figsize=(5, 5))

# first fig
sns.scatterplot(
    x=x[:, 0], y=x[:, 1], hue=y, s=50, ax=ax, palette=sns.color_palette(bright_colors)
)

handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, ["0", "1", "2", "3"])
ax.set(xlabel="feature 1", ylabel="feature 2")

plt.xlim([-2.5, 2.5])
plt.ylim([-2.5, 2.5])
plt.show()


# Linear SVM with default settings fitted on the four-class sample.
clf = svm.LinearSVC()
clf.fit(x, y)

LinearSVC()

LinearSVC()


from sklearn.inspection import DecisionBoundaryDisplay

fig, ax = plt.subplots(1, 1, figsize=(5, 5))

# Shade the plane by the class predicted at each location.
disp = DecisionBoundaryDisplay.from_estimator(
    clf,
    x,
    response_method="predict",
    cmap=dull_cmap,
    alpha=0.8,
    xlabel="feature 1",
    ylabel="feature 2",
    ax=ax,
)

# Plot the training points
sns.scatterplot(
    x=x[:, 0], y=x[:, 1], hue=y, s=50, ax=ax, palette=sns.color_palette(bright_colors)
)

# x-coordinates along which every separating line is drawn
xx = np.linspace(-2.5, 2.5)
# hand-picked x-position of the normal arrow on each line (for visualization)
arrow_xs = [0.5, 0.5, -0.5, -0.5]
for i, (coef, intercept) in enumerate(zip(clf.coef_, clf.intercept_)):
    # Rewrite coef[0]*x1 + coef[1]*x2 + intercept = 0 as x2 = w*x1 + b.
    # Row i of coef_ pairs with intercept_[i]; reusing intercept_[0] for
    # every row would give all lines but the first the wrong offset.
    w = -coef[0] / coef[1]
    b = -intercept / coef[1]
    yy = w * xx + b
    # normal
    plt.arrow(
        arrow_xs[i],
        w * arrow_xs[i] + b,
        coef[0] / 4,
        coef[1] / 4,
        edgecolor=dark_colors[i],
        facecolor=bright_colors[i],
        width=0.04,
    )
    # dividing line
    plt.plot(xx, yy, dark_colors[i])

plt.xlim([-2.5, 2.5])
plt.ylim([-2.5, 2.5])

plt.show()


# Refit on the same data with SVC using a linear kernel.
clf = svm.SVC(kernel="linear")
clf.fit(x, y)

SVC(kernel='linear')

SVC(kernel='linear')


fig, ax = plt.subplots(1, 1, figsize=(5, 5))

# Shade the plane by the class predicted at each location.
disp = DecisionBoundaryDisplay.from_estimator(
    clf,
    x,
    response_method="predict",
    cmap=dull_cmap,
    alpha=0.8,
    xlabel="feature 1",
    ylabel="feature 2",
    ax=ax,
)

# Plot the training points
sns.scatterplot(
    x=x[:, 0], y=x[:, 1], hue=y, s=50, ax=ax, palette=sns.color_palette(bright_colors)
)

# for visualization: one entry per row of clf.coef_
arrow_xs = [1, -0.1, 0, -0.17, -0.17, -1]  # x-position of each normal arrow
# class pair per row; the first class colours the arrow, the second the line
colors_list = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]
# x-range over which each line segment is drawn
range_list = [(0, 2.5), (-0.3, 0.1), (-0.1, 0.5), (-1, -0.12), (-0.4, 0), (-2.5, 0)]

for i, (coef, intercept) in enumerate(zip(clf.coef_, clf.intercept_)):
    xx = np.linspace(*range_list[i])
    # Rewrite coef[0]*x1 + coef[1]*x2 + intercept = 0 as x2 = w*x1 + b.
    # Row i of coef_ pairs with intercept_[i]; reusing intercept_[0] for
    # every row would give all lines but the first the wrong offset.
    w = -coef[0] / coef[1]
    b = -intercept / coef[1]
    yy = w * xx + b
    # normal
    plt.arrow(
        arrow_xs[i],
        w * arrow_xs[i] + b,
        coef[0] / 4,
        coef[1] / 4,
        edgecolor=dark_colors[colors_list[i][0]],
        facecolor=bright_colors[colors_list[i][0]],
        width=0.04,
    )
    # dividing line
    plt.plot(xx, yy, dark_colors[colors_list[i][1]])

plt.xlim([-2.5, 2.5])
plt.ylim([-2.5, 2.5])

plt.show()


from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np


def generate_patients_data(total_len=40):
    """Sample values uniformly from [0, 50) and flag those inside (15, 35).

    Args:
        total_len: Number of samples to draw.

    Returns:
        Tuple ``(x, y)``: the sampled values and an array of the same dtype
        holding 1 where ``15 < x < 35`` and 0 elsewhere.
    """
    doses = np.random.uniform(0, 50, total_len)
    inside_window = np.logical_and(doses > 15, doses < 35)
    outcome = inside_window.astype(doses.dtype)
    return doses, outcome


def plot_data(x, y, total_len=40, s=50):
    """Draw one-dimensional samples on a horizontal line, coloured by class.

    Args:
        x: Sample positions along the horizontal axis.
        y: Per-sample class value, used as the hue.
        total_len: Not used by the function body.
        s: Marker size.

    Returns:
        The axes returned by ``sns.scatterplot``.
    """
    plt.figure(figsize=(5, 3))
    baseline = np.zeros(len(x))  # every point is drawn at height 0
    axes = sns.scatterplot(x=x, y=baseline, hue=y, s=s)
    legend_handles, _ = axes.get_legend_handles_labels()
    axes.legend(legend_handles, ["Sick", "Recover"])
    axes.set(xlabel="dose, mg")
    return axes


# Generate the dose data and draw it on a single axis.
total_len = 40
x, y = generate_patients_data(total_len=total_len)
ax = plot_data(x, y, total_len=total_len)
plt.show()


def plot_data(x, y, total_len=40, s=50):
    """Scatter two-row data: row 0 on the x axis, row 1 on the y axis.

    Args:
        x: Array read as ``x[0, :]`` (horizontal) and ``x[1, :]`` (vertical).
        y: Per-sample class value, used as the hue.
        total_len: Not used by the function body.
        s: Marker size.

    Returns:
        The axes returned by ``sns.scatterplot``.
    """
    plt.figure(figsize=(5, 3))
    first_row, second_row = x[0, :], x[1, :]
    axes = sns.scatterplot(x=first_row, y=second_row, hue=y, s=s)
    legend_handles, _ = axes.get_legend_handles_labels()
    axes.legend(legend_handles, ["Sick", "Recover"])
    axes.set(xlabel="Dose, mg", ylabel="Dose$^2$")
    return axes


# Add the squared dose as a second feature and stack both into a
# two-row array (row 0: dose, row 1: dose squared).
total_len = 40
x_1, y = generate_patients_data(total_len=total_len)
x_2 = x_1**2
x = np.vstack([x_1, x_2])

plot_data(x, y, total_len=40, s=50)
plt.show()


plot_data(x, y, total_len=40, s=50)

x_arr = np.linspace(0, 50, 50)
# Smallest and largest value of each row among the samples labelled 1;
# these give the two points the dashed line is drawn through.
xs = [x[0, :][y == 1].min(), x[0, :][y == 1].max()]
ys = [x[1, :][y == 1].min(), x[1, :][y == 1].max()]

# Degree-1 fit through the two points: slope and intercept of the line.
coefficients = np.polyfit(xs, ys, 1)

# Evaluate the line on the dense grid x_arr...
polynomial = np.poly1d(coefficients)
y_axis = polynomial(x_arr)

# ...and draw it as a red dashed line over the scatter plot
plt.plot(x_arr, y_axis, "r--")
plt.show()


from sklearn.datasets import make_circles

# Two-class dataset generated by make_circles (500 samples, factor 0.3,
# noise 0.05, fixed random_state).
x, y = make_circles(n_samples=500, factor=0.3, noise=0.05, random_state=42)


fig, ax = plt.subplots(1, 1, figsize=(5, 5))

# Scatter of the samples, coloured by class.
sns.scatterplot(
    x=x[:, 0],
    y=x[:, 1],
    hue=y,
    s=50,
    ax=ax,
    palette=sns.color_palette(["#2DA9E1", "#F9B041"]),
)
plt.show()


from matplotlib.colors import ListedColormap
from sklearn.inspection import DecisionBoundaryDisplay


def plot_svm(x, y, clf):
    """Show a fitted classifier's predicted regions together with the data.

    Args:
        x: Feature array; columns 0 and 1 are used as plot coordinates.
        y: Per-sample class value, used as the hue of the points.
        clf: Fitted estimator passed to ``DecisionBoundaryDisplay``.
    """
    region_cmap = ListedColormap(["#B8E1EC", "#FEE7D0"])
    point_palette = sns.color_palette(["#2DA9E1", "#F9B041"])
    _, ax = plt.subplots(1, 1, figsize=(5, 5))

    # Background: class predicted by `clf` at each location.
    DecisionBoundaryDisplay.from_estimator(
        clf,
        x,
        response_method="predict",
        cmap=region_cmap,
        alpha=0.8,
        xlabel="feature 1",
        ylabel="feature 2",
        ax=ax,
    )

    # Foreground: the samples themselves.
    sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=y, s=50, ax=ax, palette=point_palette)
    plt.show()


from sklearn import svm

# Fit an SVC with each kernel configuration in turn and plot its decision
# regions; `clf` ends up bound to the last (RBF) classifier.
kernel_settings = [
    {"kernel": "linear"},
    {"kernel": "poly"},
    {"kernel": "poly", "degree": 4},
    {"kernel": "rbf"},
]
for settings in kernel_settings:
    clf = svm.SVC(**settings)
    clf.fit(x, y)
    plot_svm(x, y, clf)


import sklearn.datasets

cancer = sklearn.datasets.load_breast_cancer()  # load data

x = cancer.data  # features
y = cancer.target  # labels(classes)
# Print the array shapes and the first sample with its label.
print(f"x shape: {x.shape}, y shape: {y.shape}")
print(f"x[0]: \n {x[0]}")
print(f"y[0]: \n {y[0]}")

x shape: (569, 30), y shape: (569,)
x[0]: 
 [1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01
 1.471e-01 2.419e-01 7.871e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02
 6.399e-03 4.904e-02 5.373e-02 1.587e-02 3.003e-02 6.193e-03 2.538e+01
 1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01
 4.601e-01 1.189e-01]
y[0]: 
 0


import pandas as pd

# Wrap the feature matrix in a DataFrame with named columns.
cancer_df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
# Bare expression: it only produces visible output in an interactive/notebook session.
cancer_df


# Summary statistics per column (also a bare expression).
cancer_df.describe()


import seaborn as sns
from matplotlib import pyplot as plt


# Horizontal boxen plot of every raw feature; the x axis is log-scaled and
# limited to [1e-4, 1e4].
plt.figure(figsize=(6, 5))
ax = sns.boxenplot(
    data=cancer_df,
    orient="h",
    palette="Set2",
    linewidth=0.4,
    flier_kws={"marker": "o", "s": 3},
    line_kws={"linewidth": 1},
)
ax.set(xscale="log", xlim=(1e-4, 1e4), xlabel="Values", ylabel="Features")
plt.show()


import random

# Pick 8 feature names at random (with a fixed seed) and keep only those columns.
random.seed(0)
random_names = random.sample(list(cancer.feature_names), 8)
cut_df = cancer_df[random_names]


from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler


def plot_norm(df, ax, title):
    """Draw a horizontal boxen plot of ``df`` on ``ax`` and title it.

    Args:
        df: Data passed to ``sns.boxenplot``.
        ax: Axes to draw on.
        title: Title set on ``ax``.
    """
    boxen_options = dict(
        orient="h",
        palette="Set2",
        ax=ax,
        linewidth=0.2,
        flier_kws={"marker": "o", "s": 5},
        line_kws={"linewidth": 1},
    )
    sns.boxenplot(df, **boxen_options)
    ax.set_xlabel("Values")
    ax.set_title(title)


# 2x2 grid: the original values plus the same columns after three scalers.
fig, axs = plt.subplots(2, 2, figsize=(10, 7))

# Top-left: unscaled data on a log-scaled x axis.
plot_norm(cut_df, axs[0][0], "Original")
axs[0][0].set(xscale="log", xlim=(1e-4, 1e4))

# Top-right: MinMaxScaler output.
min_max_x = MinMaxScaler().fit_transform(cut_df)
plot_norm(pd.DataFrame(min_max_x, columns=random_names), axs[0][1], "MinMax")

# Bottom-left: StandardScaler output.
std_x = StandardScaler().fit_transform(cut_df)
plot_norm(pd.DataFrame(std_x, columns=random_names), axs[1][0], "Standard")

# Bottom-right: RobustScaler output.
rob_x = RobustScaler().fit_transform(cut_df)
plot_norm(pd.DataFrame(rob_x, columns=random_names), axs[1][1], "Robust")

# Extra spacing between the subplots.
plt.subplots_adjust(wspace=0.55, hspace=0.35)

plt.show()


# Standardize every feature of the full dataset.
x_norm = StandardScaler().fit_transform(cancer_df)  # scaled data


# Summary statistics of the scaled data (bare expression; only visible in an
# interactive/notebook session).
pd.DataFrame(x_norm, columns=cancer.feature_names).describe()


# Boxen plot of the scaled features on a linear x axis.
plt.figure(figsize=(6, 5))
ax = sns.boxenplot(
    data=pd.DataFrame(x_norm, columns=cancer.feature_names),
    orient="h",
    palette="Set2",
    linewidth=0.4,
    flier_kws={"marker": "o", "s": 3},
    line_kws={"linewidth": 1},
)
ax.set(xlabel="Values")
plt.show()


import numpy as np

# Rebuild the unscaled feature DataFrame and plot its correlation matrix.
cancer_df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)

# Compute the correlation matrix
corr = cancer_df.corr()

# Generate a mask for the upper triangle (diagonal included), so only the
# lower triangle is drawn
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(8, 6))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask, square cells and a colour scale fixed to
# [-1, 1] centred on 0
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=1,
    vmin=-1,
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
)
plt.show()


# 10 samples of sin(x) on [0, 2*pi] with Gaussian noise (scale 0.25) added,
# plus a dense noise-free curve used as ground truth.
x = np.linspace(0, 2 * np.pi, 10)
y = np.sin(x) + np.random.normal(scale=0.25, size=len(x))
x_true = np.linspace(0, 2 * np.pi, 200)
y_true = np.sin(x_true)

plt.figure(figsize=(5, 3))
plt.scatter(x, y, s=50, facecolors="none", edgecolors="b", label="noisy data")
plt.plot(x_true, y_true, c="lime", label="ground truth")
plt.legend()
plt.show()


from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# Column-vector form of x, as expected by the scikit-learn pipeline.
x_train = x.reshape(-1, 1)

fig = plt.figure(figsize=(10, 5))

# Fit polynomial regressions of degree 0, 1, 3 and 9, one per subplot of a
# 2x2 grid.
for i, degree in enumerate([0, 1, 3, 9]):
    model = make_pipeline(PolynomialFeatures(degree), LinearRegression())

    model.fit(x_train, y)
    y_plot = model.predict(x_true.reshape(-1, 1))

    # Red: fitted curve; blue circles: noisy samples; lime: ground truth.
    fig.add_subplot(2, 2, i + 1)
    plt.plot(x_true, y_plot, c="red", label=f"M={degree}")
    plt.scatter(x, y, s=50, facecolors="none", edgecolors="b")
    plt.plot(x_true, y_true, c="lime")
    plt.legend()
plt.show()


from sklearn.linear_model import Ridge

# Polynomial degree shared by both models. Defined explicitly so the plot
# labels do not depend on a loop variable left over from earlier code.
degree = 9

model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
model_ridge = make_pipeline(PolynomialFeatures(degree), Ridge(alpha=0.1))

model.fit(x_train, y)
y_plot = model.predict(x_true.reshape(-1, 1))

model_ridge.fit(x_train, y)
y_plot_ridge = model_ridge.predict(x_true.reshape(-1, 1))

# Red: unregularized fit; black: ridge fit; lime: ground truth.
plt.figure(figsize=(5, 3))
plt.plot(x_true, y_plot, c="red", label=f"M={degree}")
plt.plot(x_true, y_plot_ridge, c="black", label=f"M={degree}, alpha=0.1")
plt.scatter(x, y, s=50, facecolors="none", edgecolors="b")
plt.plot(x_true, y_true, c="lime", label="ground truth")
plt.legend()
plt.show()


def _poly_equation(regressor):
    """Format the polynomial fitted by a pipeline's final regressor as text."""
    coef = regressor.coef_
    # PolynomialFeatures adds a constant column, but the regressor also fits
    # its own intercept; the polynomial's constant term is the sum of both.
    constant = regressor.intercept_ + coef[0]
    terms = [f"y = {round(constant, 2)}", f"+{round(coef[1], 2)}*x"]
    terms.extend(f"+{round(coef[i], 2)}*x^{i}" for i in range(2, len(coef)))
    return "".join(terms)


poly_coef = model[1].coef_
eq = _poly_equation(model[1])

print("Without regularization: ", eq)

poly_coef = model_ridge[1].coef_
eq = _poly_equation(model_ridge[1])

print("With regularization: ", eq)

Without regularization:  y = 0.0+-12.11*x+39.41*x^2+-46.31*x^3+28.6*x^4+-10.48*x^5+2.35*x^6+-0.32*x^7+0.02*x^8+-0.0*x^9
With regularization:  y = 0.0+0.37*x+0.52*x^2+0.23*x^3+-0.19*x^4+-0.21*x^5+0.17*x^6+-0.04*x^7+0.01*x^8+-0.0*x^9


import sklearn
from sklearn.datasets import load_wine

# https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_wine.html#sklearn.datasets.load_wine

# Download dataset
features, class_labels = load_wine(
    return_X_y=True, as_frame=True
)  # also we can get data in Bunch (dictionary) or pandas DataFrame

# Work on a copy: adding the "target" column to `features` itself would put
# the label among the model inputs wherever `features` is used afterwards.
wine_dataset = features.copy()
wine_dataset["target"] = class_labels

wine_dataset.head()


wine_dataset.target.unique()

array([0, 1, 2])


import seaborn as sns
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Left: density estimate of "alcohol" over the whole dataset.
sns.kdeplot(wine_dataset, x="alcohol", fill=True, ax=axes[0])
axes[0].set_title("p(alcohol=x)")

# Right: one density estimate of "alcohol" per value of "target".
sns.kdeplot(
    wine_dataset,
    x="alcohol",
    hue="target",
    palette=sns.color_palette(["#5D5DA6", "#2DA9E1", "#F9B041"]),
    fill=True,
    ax=axes[1],
)
axes[1].set_title("p(alcohol=x|target=i)")
plt.show()


from sklearn.model_selection import train_test_split

# Split the data into train and test data (25% held out, fixed random_state)
# NOTE(review): confirm `features` does not contain the "target" column at
# this point; otherwise the label is part of the model inputs.
x_train, x_test, y_train, y_test = train_test_split(
    features.values, class_labels.values, test_size=0.25, random_state=42
)


from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB

# Train the model
model = GaussianNB()
model.fit(x_train, y_train)

# Calculate the macro-averaged F1 score on the test split (bare expression;
# the value is only visible in an interactive/notebook session)
pred = model.predict(x_test)
f1_score(y_test, pred, average="macro")

1.0


# Example scores for a single item: one row with one logit per class.
logits = [[
    5.1,  # cat
    3.2,  # car
    -1.7,  # frog
]]


import numpy as np

# The predicted class is the index of the largest logit; the "(Cat)" suffix
# is hard-coded text, not derived from the prediction.
print("Predicted class = %i (Cat)" % (np.argmax(logits, axis=1).squeeze()))

Predicted class = 0 (Cat)


def softmax(logits):
    """Convert a batch of logits to probabilities, row by row.

    Args:
        logits: 2D array-like of shape (n_items, n_classes).

    Returns:
        Array of the same shape in which every row sums to 1.
    """
    logits = np.asarray(logits, dtype=float)
    # Subtracting the row maximum does not change the result mathematically,
    # but keeps np.exp from overflowing to inf (and the ratio from becoming
    # nan) on large logits.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=1, keepdims=True)


# Show the probabilities for the example logits and their total.
print(softmax(logits))
print("Sum = %.2f" % np.sum(softmax(logits)))

[[0.86904954 0.12998254 0.00096793]]
Sum = 1.00


from warnings import simplefilter

# Silence RuntimeWarning for the deliberately overflowing computation below.
simplefilter("ignore", category=RuntimeWarning)

# Naive softmax on large logits: np.exp overflows for the largest value, so
# the resulting "probabilities" come out as 0 and nan.
f = np.array([123, 456, 789])
p = np.exp(f) / np.sum(np.exp(f))
print(f"logits = {f},\nprobabilities = {p}")

logits = [123 456 789],
probabilities = [ 0.  0. nan]


# Same logits, shifted in place so that the largest one becomes 0; the
# exponentials then stay within floating-point range.
f = np.array([123, 456, 789])
f -= f.max()
p = np.exp(f) / np.sum(np.exp(f))
print(f"new logits = {f},\nprobabilities = {p}")

new logits = [-666 -333    0],
probabilities = [5.75274406e-290 2.39848787e-145 1.00000000e+000]


def KL_dist(q1, q2, p1=0.5, p2=0.5):
    """Return ``p1*ln(p1/q1) + p2*ln(p2/q2)`` for two-outcome distributions.

    Args:
        q1, q2: Probabilities of the two outcomes under the second distribution.
        p1, p2: Probabilities of the two outcomes under the first distribution.

    Returns:
        The sum of the two terms, using the natural logarithm.
    """
    first_term = p1 * np.log(p1 / q1)
    second_term = p2 * np.log(p2 / q2)
    return first_term + second_term


# KL_dist with its default p1 = p2 = 0.5 against increasingly similar (q1, q2);
# the last line prints the value without the 3-decimal format.
print(f"q1 = 0.1, q2 = 0.9, Dkl = {KL_dist(0.1, 0.9):0.3f}")
print(f"q1 = 0.2, q2 = 0.8, Dkl = {KL_dist(0.2, 0.8):0.3f}")
print(f"q1 = 0.3, q2 = 0.7, Dkl = {KL_dist(0.3, 0.7):0.3f}")
print(f"q1 = 0.4, q2 = 0.6, Dkl = {KL_dist(0.4, 0.6):0.3f}")
print(f"q1 = 0.5, q2 = 0.5, Dkl = {KL_dist(0.5, 0.5)}")

q1 = 0.1, q2 = 0.9, Dkl = 0.511
q1 = 0.2, q2 = 0.8, Dkl = 0.223
q1 = 0.3, q2 = 0.7, Dkl = 0.087
q1 = 0.4, q2 = 0.6, Dkl = 0.020
q1 = 0.5, q2 = 0.5, Dkl = 0.0


# normal coin
p1 = 0.5
p2 = 0.5

# fake coin
q1 = 0.2
q2 = 0.8


# Kullback–Leibler divergence of P from Q, in bits (base-2 logarithm)
div_kl = p1 * np.log2(p1 / q1) + p2 * np.log2(p2 / q2)
print(f"Dkl(P||Q) = {div_kl:.3f}")

# Entropy of the normal coin
h_p = -p1 * np.log2(p1) - p2 * np.log2(p2)
print(f"H(P) = {h_p:.3f}")

# Entropy of the fake coin
h_q = -q1 * np.log2(q1) - q2 * np.log2(q2)
print(f"H(Q) = {h_q:.3f}")

# Cross-entropy of Q relative to P (printed under the label "H(P||Q)"),
# followed by the same quantity computed as Dkl(P||Q) + H(P)
h_p_q = -p1 * np.log2(q1) - p2 * np.log2(q2)
print(f"H(P||Q) = {h_p_q:.3f}")
print(f"H(P||Q) = Dkl(P||Q) + H(P) = {h_p+div_kl:.3f}")

Dkl(P||Q) = 0.322
H(P) = 1.000
H(Q) = 0.722
H(P||Q) = 1.322
H(P||Q) = Dkl(P||Q) + H(P) = 1.322


def cross_entropy_loss(pred_prob, true_prob):
    """Return the cross-entropy between targets and predictions, averaged over items.

    Args:
        pred_prob: Predicted probabilities, shape (n_items, n_classes).
        true_prob: Target probabilities, broadcastable to ``pred_prob``.

    Returns:
        Sum of ``-true_prob * ln(pred_prob)`` divided by ``n_items``. The
        result is ``inf`` when a class with non-zero target weight has a
        predicted probability of 0.
    """
    pred_prob = np.asarray(pred_prob, dtype=float)
    true_prob = np.asarray(true_prob, dtype=float)
    # Classes with zero target weight contribute nothing (0 * ln 0 is taken
    # as 0); without the mask a predicted probability of exactly 0 there
    # would turn the whole loss into nan.
    with np.errstate(divide="ignore", invalid="ignore"):
        terms = np.where(true_prob != 0, -true_prob * np.log(pred_prob), 0.0)
    return np.sum(terms) / pred_prob.shape[0]

# 3 classes, 1 item (a second item is left commented out)
logits = np.array([
    [5.1, 3.2, -1.7], # one item
   #[2.1, 6.3, 1.5],  # second item
])

print(f"Logits = \n{logits}\n")

pred_prob = softmax(logits)
print(f"Predicted Probabilities = \n{pred_prob}\n")

# Target distribution with all weight on class 0: 3 classes, 1 item
# (a second item is left commented out)
true_prob = np.array([
    [1.0, 0.0, 0.0],  # one item
   #[0.0, 1.0, 0.0],  # second item
])
print(f"True Probabilities = \n{true_prob}\n")

print(f"Cross-entropy loss = {cross_entropy_loss(pred_prob, true_prob):.3f}")

Logits = 
[[ 5.1  3.2 -1.7]]

Predicted Probabilities = 
[[0.86904954 0.12998254 0.00096793]]

True Probabilities = 
[[1. 0. 0.]]

Cross-entropy loss = 0.140


# Input batch of 2 vectors with 4 elements each
x = np.array([
    [1, 2, 3, 4],
    [1, -2, 0, 0]
])
# Weights: one row of 4 random weights per class
W = np.random.randn(3, 4)  # 3 classes

# model output: one row of 3 class scores per input vector
logits = x.dot(W.T)
print("Scores(Logits) \n", logits, "\n")

# Probabilities
probs = softmax(logits)  # defined before
print("Probs \n", probs, "\n")

# Ground-truth classes
y = [0, 1]

# Derivative of the loss with respect to the scores: probs minus one-hot target
dl_ds = probs.copy()
dl_ds[np.arange(len(y)), y] += -1  # subtract one from true class prob
# Summed over the batch (not divided by the batch size).
dW = x.T.dot(dl_ds)  # dot product with input; result has shape (4, 3)

print("Grads dL/dW \n", dW)  # shape (4, 3): the transpose of W's (3, 4) layout

Scores(Logits) 
 [[-1.83503357  1.34297936  8.0008737 ]
 [ 0.03926955 -1.67315842  0.40267293]] 

Probs 
 [[5.34243609e-05 1.28213223e-03 9.98664443e-01]
 [3.81877168e-01 6.89010238e-02 5.49221809e-01]] 

Grads dL/dW 
 [[-6.18069408e-01 -9.29816844e-01  1.54788625e+00]
 [-2.76364749e+00  1.86476222e+00  8.98885270e-01]
 [-2.99983973e+00  3.84639669e-03  2.99599333e+00]
 [-3.99978630e+00  5.12852892e-03  3.99465777e+00]]

	mean radius	mean texture	mean perimeter	mean area	mean smoothness	mean compactness	mean concavity	mean concave points	mean symmetry	mean fractal dimension	...	worst radius	worst texture	worst perimeter	worst area	worst smoothness	worst compactness	worst concavity	worst concave points	worst symmetry	worst fractal dimension
0	17.99	10.38	122.80	1001.0	0.11840	0.27760	0.30010	0.14710	0.2419	0.07871	...	25.380	17.33	184.60	2019.0	0.16220	0.66560	0.7119	0.2654	0.4601	0.11890
1	20.57	17.77	132.90	1326.0	0.08474	0.07864	0.08690	0.07017	0.1812	0.05667	...	24.990	23.41	158.80	1956.0	0.12380	0.18660	0.2416	0.1860	0.2750	0.08902
2	19.69	21.25	130.00	1203.0	0.10960	0.15990	0.19740	0.12790	0.2069	0.05999	...	23.570	25.53	152.50	1709.0	0.14440	0.42450	0.4504	0.2430	0.3613	0.08758
3	11.42	20.38	77.58	386.1	0.14250	0.28390	0.24140	0.10520	0.2597	0.09744	...	14.910	26.50	98.87	567.7	0.20980	0.86630	0.6869	0.2575	0.6638	0.17300
4	20.29	14.34	135.10	1297.0	0.10030	0.13280	0.19800	0.10430	0.1809	0.05883	...	22.540	16.67	152.20	1575.0	0.13740	0.20500	0.4000	0.1625	0.2364	0.07678
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
564	21.56	22.39	142.00	1479.0	0.11100	0.11590	0.24390	0.13890	0.1726	0.05623	...	25.450	26.40	166.10	2027.0	0.14100	0.21130	0.4107	0.2216	0.2060	0.07115
565	20.13	28.25	131.20	1261.0	0.09780	0.10340	0.14400	0.09791	0.1752	0.05533	...	23.690	38.25	155.00	1731.0	0.11660	0.19220	0.3215	0.1628	0.2572	0.06637
566	16.60	28.08	108.30	858.1	0.08455	0.10230	0.09251	0.05302	0.1590	0.05648	...	18.980	34.12	126.70	1124.0	0.11390	0.30940	0.3403	0.1418	0.2218	0.07820
567	20.60	29.33	140.10	1265.0	0.11780	0.27700	0.35140	0.15200	0.2397	0.07016	...	25.740	39.42	184.60	1821.0	0.16500	0.86810	0.9387	0.2650	0.4087	0.12400
568	7.76	24.54	47.92	181.0	0.05263	0.04362	0.00000	0.00000	0.1587	0.05884	...	9.456	30.37	59.16	268.6	0.08996	0.06444	0.0000	0.0000	0.2871	0.07039

	mean radius	mean texture	mean perimeter	mean area	mean smoothness	mean compactness	mean concavity	mean concave points	mean symmetry	mean fractal dimension	...	worst radius	worst texture	worst perimeter	worst area	worst smoothness	worst compactness	worst concavity	worst concave points	worst symmetry	worst fractal dimension
count	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	...	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000
mean	14.127292	19.289649	91.969033	654.889104	0.096360	0.104341	0.088799	0.048919	0.181162	0.062798	...	16.269190	25.677223	107.261213	880.583128	0.132369	0.254265	0.272188	0.114606	0.290076	0.083946
std	3.524049	4.301036	24.298981	351.914129	0.014064	0.052813	0.079720	0.038803	0.027414	0.007060	...	4.833242	6.146258	33.602542	569.356993	0.022832	0.157336	0.208624	0.065732	0.061867	0.018061
min	6.981000	9.710000	43.790000	143.500000	0.052630	0.019380	0.000000	0.000000	0.106000	0.049960	...	7.930000	12.020000	50.410000	185.200000	0.071170	0.027290	0.000000	0.000000	0.156500	0.055040
25%	11.700000	16.170000	75.170000	420.300000	0.086370	0.064920	0.029560	0.020310	0.161900	0.057700	...	13.010000	21.080000	84.110000	515.300000	0.116600	0.147200	0.114500	0.064930	0.250400	0.071460
50%	13.370000	18.840000	86.240000	551.100000	0.095870	0.092630	0.061540	0.033500	0.179200	0.061540	...	14.970000	25.410000	97.660000	686.500000	0.131300	0.211900	0.226700	0.099930	0.282200	0.080040
75%	15.780000	21.800000	104.100000	782.700000	0.105300	0.130400	0.130700	0.074000	0.195700	0.066120	...	18.790000	29.720000	125.400000	1084.000000	0.146000	0.339100	0.382900	0.161400	0.317900	0.092080
max	28.110000	39.280000	188.500000	2501.000000	0.163400	0.345400	0.426800	0.201200	0.304000	0.097440	...	36.040000	49.540000	251.200000	4254.000000	0.222600	1.058000	1.252000	0.291000	0.663800	0.207500

	mean radius	mean texture	mean perimeter	mean area	mean smoothness	mean compactness	mean concavity	mean concave points	mean symmetry	mean fractal dimension	...	worst radius	worst texture	worst perimeter	worst area	worst smoothness	worst compactness	worst concavity	worst concave points	worst symmetry	worst fractal dimension
count	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	...	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02
mean	-3.153111e-15	-6.568462e-15	-6.993039e-16	-8.553985e-16	6.081447e-15	-1.136369e-15	-2.997017e-16	1.023981e-15	-1.860648e-15	-1.504752e-15	...	-2.297713e-15	1.742016e-15	-1.198807e-15	6.118909e-16	-5.094929e-15	-2.122887e-15	6.118909e-16	-1.998011e-16	-2.422589e-15	2.497514e-15
std	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	...	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00
min	-2.029648e+00	-2.229249e+00	-1.984504e+00	-1.454443e+00	-3.112085e+00	-1.610136e+00	-1.114873e+00	-1.261820e+00	-2.744117e+00	-1.819865e+00	...	-1.726901e+00	-2.223994e+00	-1.693361e+00	-1.222423e+00	-2.682695e+00	-1.443878e+00	-1.305831e+00	-1.745063e+00	-2.160960e+00	-1.601839e+00
25%	-6.893853e-01	-7.259631e-01	-6.919555e-01	-6.671955e-01	-7.109628e-01	-7.470860e-01	-7.437479e-01	-7.379438e-01	-7.032397e-01	-7.226392e-01	...	-6.749213e-01	-7.486293e-01	-6.895783e-01	-6.421359e-01	-6.912304e-01	-6.810833e-01	-7.565142e-01	-7.563999e-01	-6.418637e-01	-6.919118e-01
50%	-2.150816e-01	-1.046362e-01	-2.359800e-01	-2.951869e-01	-3.489108e-02	-2.219405e-01	-3.422399e-01	-3.977212e-01	-7.162650e-02	-1.782793e-01	...	-2.690395e-01	-4.351564e-02	-2.859802e-01	-3.411812e-01	-4.684277e-02	-2.695009e-01	-2.182321e-01	-2.234689e-01	-1.274095e-01	-2.164441e-01
75%	4.693926e-01	5.841756e-01	4.996769e-01	3.635073e-01	6.361990e-01	4.938569e-01	5.260619e-01	6.469351e-01	5.307792e-01	4.709834e-01	...	5.220158e-01	6.583411e-01	5.402790e-01	3.575891e-01	5.975448e-01	5.396688e-01	5.311411e-01	7.125100e-01	4.501382e-01	4.507624e-01
max	3.971288e+00	4.651889e+00	3.976130e+00	5.250529e+00	4.770911e+00	4.568425e+00	4.243589e+00	3.927930e+00	4.484751e+00	4.910919e+00	...	4.094189e+00	3.885905e+00	4.287337e+00	5.930172e+00	3.955374e+00	5.112877e+00	4.700669e+00	2.685877e+00	6.046041e+00	6.846856e+00

	alcohol	malic_acid	ash	alcalinity_of_ash	magnesium	total_phenols	flavanoids	nonflavanoid_phenols	proanthocyanins	color_intensity	hue	od280/od315_of_diluted_wines	proline
0	14.23	1.71	2.43	15.6	127.0	2.80	3.06	0.28	2.29	5.64	1.04	3.92	1065.0
1	13.20	1.78	2.14	11.2	100.0	2.65	2.76	0.26	1.28	4.38	1.05	3.40	1050.0
2	13.16	2.36	2.67	18.6	101.0	2.80	3.24	0.30	2.81	5.68	1.03	3.17	1185.0
3	14.37	1.95	2.50	16.8	113.0	3.85	3.49	0.24	2.18	7.80	0.86	3.45	1480.0
4	13.24	2.59	2.87	21.0	118.0	2.80	2.69	0.39	1.82	4.32	1.04	2.93	735.0

	Hours	Scores
0	2.5	21
1	5.1	47
2	3.2	27
3	8.5	75
4	3.5	30

Линейная регрессия¶

Модель и ее параметры¶

Функция потерь¶

Поиск локального минимума¶

Метод наименьших квадратов¶

Метрики регрессии¶

Модель линейной регрессии из библиотеки scikit-learn¶

Метод градиентного спуска¶

Градиент¶

Идея градиентного спуска¶

Выбор скорости обучения¶

Единый подход к учету смещения¶

Необходимость нормализации¶

Стохастический градиентный спуск¶

Классификация¶

Hinge loss¶

1D классификация¶

Maximum Margin Classifier¶

2D классификация¶

SVM: Hard and Soft Margin Classifier¶

3D классификация¶

Многоклассовая классификация¶

One vs Rest¶

One vs One¶

Обобщенные линейные модели¶

Полиномиальная модель¶

Kernel SVM¶

Обоснование Kernel SVM¶

Примеры ядер¶

Практические особенности работы с линейными моделями¶

Нормализация данных¶

Выбор Scaler¶

Проблема корреляции признаков в случае линейных моделей¶

Регуляризация¶

Вероятностный подход в задаче классификации¶

Наивный Байесовский классификатор¶

Табличные данные: датасет Wine¶

NLP: задача определения спама¶

Кросс-энтропия как общая функция потерь для задач классификации¶

Переход к вероятностям¶

Sigmoid¶

SoftMax¶

Практическое вычисление SoftMax¶

Cross-entropy¶

Расстояние (дивергенция) Кульбака — Лейблера¶

Переход к оценке модели¶

Энтропия и кросс-энтропия¶

Расчет функции потерь¶

Градиент функции потерь. Кросс-энтропия¶