import pandas as pd

# Download the data and save it in a variable called data
dataset = pd.read_csv(
    "https://edunet.kea.su/repo/EduNet-web_dependencies/datasets/titanic.csv"
)  # Load the data using pandas
dataset[:5]  # Show the first 5 lines


from sklearn.ensemble import RandomForestClassifier

x = dataset.drop("Survived", axis=1)  # drop target
y = dataset["Survived"]  # target

rf = RandomForestClassifier(random_state=42)

try:
    rf.fit(x, y)
except ValueError as e:
    print(e)

could not convert string to float: 'Braund, Mr. Owen Harris'


dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


print(dataset.SibSp[:5])

0    1
1    1
2    0
3    1
4    0
Name: SibSp, dtype: int64


print(dataset["Parch"].unique())

[0 1 2 5 3 4 6]


dataset[["Age", "Fare"]].head()


from sklearn.model_selection import train_test_split

x = dataset.drop(columns=["Survived", "PassengerId"])  # drop target and id
y = dataset["Survived"]  # target

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
# drop categorical
x_train_working = x_train.drop(
    columns=["Pclass", "Name", "Sex", "Ticket", "Cabin", "Embarked"]
)
x_test_working = x_test.drop(
    columns=["Pclass", "Name", "Sex", "Ticket", "Cabin", "Embarked"]
)

rf = RandomForestClassifier(random_state=42)

try:
    rf.fit(x_train_working, y_train)
except ValueError as e:
    print(e)

x_train_working.head()

Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values


from sklearn.metrics import accuracy_score

x_train_working = x_train_working.drop(columns=["Age"])
x_test_working = x_test_working.drop(columns=["Age"])

rf = RandomForestClassifier(random_state=42)

rf.fit(x_train_working, y_train)
y_pred = rf.predict(x_test_working)

print(accuracy_score(y_test, y_pred))

0.6759776536312849


print(dataset["Pclass"].unique())

[3 1 2]


print(dataset["Embarked"].unique())
# C = Cherbourg; Q = Queenstown; S = Southampton

['S' 'C' 'Q' nan]


print(dataset["Sex"].unique())

['male' 'female']


x_train_working["Pclass"] = x_train["Pclass"]
x_test_working["Pclass"] = x_test["Pclass"]

x_train_working[:5]


sex = {"male": 1, "female": 0}

x_train_working["Sex"] = x_train["Sex"].map(sex)
x_test_working["Sex"] = x_test["Sex"].map(sex)

x_train_working[:5]


import seaborn as sns
from matplotlib import pyplot as plt

plt.figure(figsize=(8, 4))

train_df = x_train.copy()
train_df["Survived"] = y_train
sns.barplot(x="Embarked", y="Survived", data=train_df)
plt.show()


import numpy as np

emb = {np.nan: 0, "S": 0, "Q": 1, "C": 2}

x_train_working["Embarked"] = x_train["Embarked"].map(emb)
x_test_working["Embarked"] = x_test["Embarked"].map(emb)

x_train_working[5:10]


x_train.Name[:5]

331                   Partner, Mr. Austen
733            Berriman, Mr. William John
382                    Tikkanen, Mr. Juho
704               Hansen, Mr. Henrik Juul
813    Andersson, Miss. Ebba Iris Alfrida
Name: Name, dtype: object


titles = dataset.Name.str.extract(" ([A-Za-z]+)\.", expand=False).unique()
print(titles)

['Mr' 'Mrs' 'Miss' 'Master' 'Don' 'Rev' 'Dr' 'Mme' 'Ms' 'Major' 'Lady'
 'Sir' 'Mlle' 'Col' 'Capt' 'Countess' 'Jonkheer']


titles = {
    None: 0,
    "Sir": 0,
    "Countess": 0,
    "Don": 0,
    "Jonkheer": 0,
    "Lady": 0,
    "Capt": 0,
    "Ms": 0,
    "Mme": 0,
    "Mlle": 0,
    "Col": 0,
    "Major": 0,
    "Rev": 0,
    "Dr": 0,
    "Master": 1,
    "Mrs": 2,
    "Miss": 3,
    "Mr": 4,
}

x_train_working["Title"] = x_train.Name.str.extract(" ([A-Za-z]+)\.", expand=False).map(
    titles
)
x_test_working["Title"] = x_test.Name.str.extract(" ([A-Za-z]+)\.", expand=False).map(
    titles
)

x_train_working[:5]


weekdays = np.arange(1, 8)  # create an array of weekdays
print(weekdays)
sina = np.sin(weekdays * np.pi * 2 / np.max(weekdays))  # feature 1
cosa = np.cos(weekdays * np.pi * 2 / np.max(weekdays))  # feature 2

[1 2 3 4 5 6 7]


plt.figure(figsize=(6, 6))  # Decide figure size
plt.scatter(sina, cosa)  # Plot scatter of feature 1 vs feature 2
for i, z in enumerate(
    ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
):  # for each day in a week
    plt.text(sina[i], cosa[i], s=z)  # add text labels to plot


dist_mon_tue = (sina[1] - sina[0]) ** 2 + (
    cosa[1] - cosa[0]
) ** 2  # distance between Monday and Tuesday
dist_sun_mon = (sina[6] - sina[0]) ** 2 + (
    cosa[6] - cosa[0]
) ** 2  # distance between Sunday and Monday
print("Distance between Mon-Tue = %.2f" % dist_mon_tue)
print("Distance between Sun-Mon = %.2f" % dist_sun_mon)

Distance between Mon-Tue = 0.75
Distance between Sun-Mon = 0.75


dist_mon_wed = (sina[2] - sina[0]) ** 2 + (
    cosa[2] - cosa[0]
) ** 2  # distance between Monday and Wednesday
dist_fri_sun = (sina[4] - sina[6]) ** 2 + (
    cosa[4] - cosa[6]
) ** 2  # distance between Friday and Sunday
print("Distance between Mon-Wed = %.2f" % dist_mon_wed)
print("Distance between Fri-Sun = %.2f" % dist_fri_sun)

Distance between Mon-Wed = 2.45
Distance between Fri-Sun = 2.45


from sklearn.preprocessing import Binarizer

# fmt: off
x = np.array([[1, 12],
              [2, 7.6],
              [3, 8.4],
              [4, 13.5],
              [5, 6.3]])
# fmt: on

transformer = Binarizer(threshold=11.1)
binarized = transformer.transform(np.expand_dims(x[:, 1], axis=1))

x_binarized = np.concatenate((x, binarized), axis=1)

print(x_binarized)

[[ 1.  12.   1. ]
 [ 2.   7.6  0. ]
 [ 3.   8.4  0. ]
 [ 4.  13.5  1. ]
 [ 5.   6.3  0. ]]


# fmt: off
x = np.array([[1, 12.121143145],
              [2, 7.69458475974059],
              [3, 8.434243214],
              [4, 13.5958347545],
              [5, 6.3323294098]])
# fmt: on

round_func = np.around(np.expand_dims(x[:, 1], axis=1), decimals=1)
x_round = np.concatenate((np.expand_dims(x[:, 0], axis=1), round_func), axis=1)

print(x_round)

[[ 1.  12.1]
 [ 2.   7.7]
 [ 3.   8.4]
 [ 4.  13.6]
 [ 5.   6.3]]


fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

train_df = x_train.copy()
train_df["Survived"] = y_train

women = train_df[train_df["Sex"] == "female"]
men = train_df[train_df["Sex"] == "male"]

ax = sns.histplot(
    women[women["Survived"] == 1].Age.dropna(),
    bins=18,
    label="survived",
    ax=axes[0],
    kde=False,
    color="#27a9e1",
    linewidth=0,
)
ax = sns.histplot(
    women[women["Survived"] == 0].Age.dropna(),
    bins=40,
    label="not survived",
    ax=axes[0],
    kde=False,
    color="#ffab40",
    linewidth=0,
)
ax.legend()
ax.set_title("Female")

ax = sns.histplot(
    men[men["Survived"] == 1].Age.dropna(),
    bins=18,
    label="survived",
    ax=axes[1],
    kde=False,
    color="#27a9e1",
    linewidth=0,
)
ax = sns.histplot(
    men[men["Survived"] == 0].Age.dropna(),
    bins=40,
    label="not survived",
    ax=axes[1],
    kde=False,
    color="#ffab40",
    linewidth=0,
)
ax.legend()
ax.set_title("Male")
plt.show()


np.random.seed(42)

train_df = x_train.copy()
train_df["Title"] = x_train_working["Title"]

mean = {}
std = {}
for title in range(5):
    data = train_df.loc[train_df["Title"] == title]
    mean[title] = data["Age"].mean()
    std[title] = data["Age"].std()


def add_age_val(data, mean, std):
    for i, row in data.iterrows():
        if np.isnan(row["Age"]):
            title = int(row["Title"])
            data.loc[i, "Age"] = round(
                np.random.uniform(
                    low=int(mean[title] - std[title]),
                    high=int(mean[title] + std[title]),
                ),
                1,
            )
    return data


x_train_working["Age"] = x_train["Age"]
x_test_working["Age"] = x_test["Age"]

x_train_working = add_age_val(x_train_working, mean, std)
x_test_working = add_age_val(x_test_working, mean, std)

x_train_working[:5]


rf = RandomForestClassifier(random_state=42)

rf.fit(x_train_working, y_train)
y_pred = rf.predict(x_test_working)

print(accuracy_score(y_test, y_pred))

0.8212290502793296


from sklearn.datasets import make_circles

np.random.seed(42)

x, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=42)

plt.figure(figsize=(5, 5))
violet = y == 0
yellow = y == 1

plt.scatter(x[violet, 0], x[violet, 1], c="blueviolet", s=20, edgecolor="k")
plt.scatter(x[yellow, 0], x[yellow, 1], c="yellow", s=20, edgecolor="k")
plt.xlabel("$x_1$", fontsize=20)
plt.ylabel("$x_2$", fontsize=20)
plt.show()


from sklearn.linear_model import LogisticRegression

# We make a 80/20% train/test split of the data
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

# Make predictions
print("Accuracy of the model = %.2f" % model.score(x_test, y_test))

Accuracy of the model = 0.65


df = pd.DataFrame(x, columns=["x_1", "x_2"])
df["z"] = x[:, 0] ** 2 + x[:, 1] ** 2
df


# We make a 80/20% train/test split of the data
x_train, x_test, y_train, y_test = train_test_split(
    df.values, y, test_size=0.2, random_state=42
)

model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

# Make predictions
print("Accuracy of the model = %.2f" % model.score(x_test, y_test))

Accuracy of the model = 1.00


def plot_data(x, y, total_len=400, s=50, threshold=21.5):
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111, projection="3d")
    ax.scatter(xs=x[:, 0], ys=x[:, 1], zs=x[:, 2], c=y, s=s)
    # plot the decision function
    ax = plt.gca()
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # create grid to evaluate model
    xx = np.linspace(xlim[0], xlim[1], 30)
    yy = np.linspace(ylim[0], ylim[1], 30)
    YY, XX = np.meshgrid(yy, xx)
    ax.plot_surface(XX, YY, XX * YY * 0.2, alpha=0.2)
    ax.set(xlabel="$x_1$", ylabel="$x_2$", zlabel="$z$")
    return ax


total_len = 400

ax = plot_data(df.values, y, total_len=total_len)


dataset = pd.read_csv(
    "https://edunet.kea.su/repo/EduNet-web_dependencies/datasets/titanic.csv"
)  # Load the data using pandas
dataset[:5]  # Show the first 5 lines


# The categorical-to-numerical function
# Changed to automatically add column names
def cat_to_num(data):  # one-hot encoding
    categories = set(data)
    features = {}
    for cat in categories:
        binary = data == cat
        if len(set(binary)) == 1:
            # Ignore features where all values equal
            continue
        new_key = f"{data.name}={cat}"

        features[new_key] = binary.astype("int")
    return pd.DataFrame(features)


def cabin_features(data):
    features = []
    for cabin in data:
        cabins = str(cabin).split(" ")
        n_cabins = len(cabins)
        # First char is the cabin_char
        try:
            cabin_char = cabins[0][0]
        except IndexError:
            cabin_char = "X"
            n_cabins = 0
        # The rest is the cabin number
        try:
            cabin_num = int(cabins[0][1:])
        except:
            cabin_num = -1
        # Add 3 features for each passanger
        features.append([cabin_char, cabin_num, n_cabins])
    features = np.array(features)
    dic_of_features = {
        "Cabin_num": features[:, 1].astype("int"),
        "N_cabins": features[:, 2].astype("int"),
    }
    out = pd.DataFrame(dic_of_features)
    char_column = pd.DataFrame({"Cabin_char": features[:, 0]})
    cabin_ch = cat_to_num(char_column["Cabin_char"])
    return out.join(cabin_ch)


def prepare_data(data):
    """Takes a dataframe of raw data and returns ML model features"""

    # Initially, we build a model only on the available numerical values
    features = data.drop(
        [
            "PassengerId",
            "Survived",
            "Fare",
            "Name",
            "Sex",
            "Ticket",
            "Cabin",
            "Embarked",
        ],
        axis=1,
    )

    # Setting missing age values to -1
    features["Age"] = data["Age"].fillna(-1)

    # Adding the sqrt of the fare feature
    features["sqrt_Fare"] = np.sqrt(data["Fare"])

    # Adding gender categorical value
    features = features.join(cat_to_num(data["Sex"]))

    # Adding Embarked categorical value
    features = features.join(cat_to_num(data["Embarked"]))

    # Split cabin
    features = features.join(cabin_features(data["Cabin"]))

    return features


features = prepare_data(dataset)  # Create variable features
features[:5]  # Display first 5 rows


# We make a 80/20% train/test split of the data
features = prepare_data(dataset)
x_train, x_test, y_train, y_test = train_test_split(
    features, dataset["Survived"], test_size=0.2, random_state=42
)

model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

# Make predictions
print("Accuracy of the model = %.2f" % model.score(x_test, y_test))

Accuracy of the model = 0.78


from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OneHotEncoder

# Make one shot encoded representation
one_hot_encoder = make_column_transformer(
    (
        OneHotEncoder(
            sparse_output=False,  # if False return array, if True return sparse matrix
            handle_unknown="ignore",
        ),  #  ignore if an unknown categorical feature is present during transform
        make_column_selector(dtype_include="category"),
    ),  # selection of dtypes to include
    remainder="passthrough",
)  # all columns that were not specified in transformers will be  passed through


# preprocessing features using sklearn.preprocessing
features = dataset.drop(
    ["PassengerId", "Survived", "Fare", "Name", "Sex", "Ticket", "Cabin", "Embarked"],
    axis=1,
)
# make Cabin features, examples: None -> 'X', C85 -> 'C', B42 -> 'B'
features["Cabin"] = (
    dataset["Cabin"].fillna("X").apply(lambda x: x[0]).astype("category")
)


def get_cab_num(cab):
    try:
        return int(cab.split()[0][1:])  # get cabin num (C85 -> 85)
    except:
        return -1  # if dont know num, return -1 (X -> -1)


features["Cabin_num"] = (
    dataset["Cabin"].fillna("X").apply(lambda x: get_cab_num(x))
)  # get cabin num

features["N_cabins"] = (
    dataset["Cabin"].fillna("X").str.split(" ").apply(lambda x: len(x))
)  # num of cabins (C23 C25 C27 -> 3)

features["Sex"] = dataset["Sex"].astype("category")  # male/female

features["Embarked"] = (
    dataset["Embarked"].fillna("X").astype("category")
)  # Categories: ['C', 'Q', 'S', 'X']
features["sqrt_Fare"] = np.sqrt(dataset["Fare"])  # normalize by sqrt
features["Age"] = dataset["Age"].fillna(-1)  # NaN -> -1


# 80/20% train/test split of the data
x_train, x_test, y_train, y_test = train_test_split(
    features, dataset["Survived"], test_size=0.2, random_state=42
)


one_hot_encoder.fit(x_train)  # fit one-hot encoder to x_train
x_train_ohe = one_hot_encoder.transform(
    x_train
)  # transform x_train with the one-hot encoder
x_test_ohe = one_hot_encoder.transform(
    x_test
)  # transform x_test with the one-hot encoder


model = LogisticRegression(max_iter=1000)  # specify maximum iterations
model.fit(x_train_ohe, y_train)  # fit model to the training data

# Make predictions
print(
    "Accuracy of the model = %.2f" % model.score(x_test_ohe, y_test)
)  # calculate the accuracy of the model

Accuracy of the model = 0.78


### https://scikit-learn.org/stable/auto_examples/ensemble/plot_feature_transformation.html#sphx-glr-auto-examples-ensemble-plot-feature-transformation-py
from sklearn.datasets import make_classification

np.random.seed(42)

# define dummy dataset
x, y = make_classification(n_samples=80000, random_state=42)

# split dataset into subsets for training ensemble and linear model and final testing of the linear model
x_full_train, x_test, y_full_train, y_test = train_test_split(
    x, y, test_size=0.5, random_state=42
)

# split training subset into parts for ensemble training and for linear model training
x_train_ensemble, x_train_linear, y_train_ensemble, y_train_linear = train_test_split(
    x_full_train, y_full_train, test_size=0.5, random_state=42
)


from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_curve

n_estimator = 10

# Supervised transformation based on random forests
rf = RandomForestClassifier(max_depth=3, n_estimators=n_estimator, random_state=42)
rf_enc = OneHotEncoder()
rf_lm = LogisticRegression(max_iter=1000)

rf.fit(x_train_ensemble, y_train_ensemble)
rf_enc.fit(rf.apply(x_train_ensemble))  # apply method return leaf indices
rf_lm.fit(rf_enc.transform(rf.apply(x_train_linear)), y_train_linear)

y_pred_rf_lm = rf_lm.predict_proba(rf_enc.transform(rf.apply(x_test)))[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)


# Supervised transformation based on gradient boosted trees
grd = GradientBoostingClassifier(n_estimators=n_estimator, random_state=42)
grd_enc = OneHotEncoder()
grd_lm = LogisticRegression(max_iter=1000)

grd.fit(x_train_ensemble, y_train_ensemble)
grd_enc.fit(grd.apply(x_train_ensemble)[:, :, 0])  # apply method return leaf indices
grd_lm.fit(grd_enc.transform(grd.apply(x_train_linear)[:, :, 0]), y_train_linear)

y_pred_grd_lm = grd_lm.predict_proba(grd_enc.transform(grd.apply(x_test)[:, :, 0]))[
    :, 1
]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)


# The random forest model by itself
y_pred_rf = rf.predict_proba(x_test)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)

# The gradient boosted model by itself
y_pred_grd = grd.predict_proba(x_test)[:, 1]
fpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)


# Plot figure 1 and figure 2 with subplots
fig, [ax1, ax2] = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

ax1.plot([0, 1], [0, 1], "k--")
ax1.plot(fpr_rf, tpr_rf, label="RF")
ax1.plot(fpr_rf_lm, tpr_rf_lm, label="RF + LR")
ax1.plot(fpr_grd, tpr_grd, label="GBT")
ax1.plot(fpr_grd_lm, tpr_grd_lm, label="GBT + LR")
ax1.set_xlabel("False positive rate")
ax1.set_ylabel("True positive rate")
ax1.set_title("ROC curve")
ax1.legend(loc="best")

ax2.set_xlim(0, 0.2)
ax2.set_ylim(0.8, 1)
ax2.plot([0, 1], [0, 1], "k--")
ax2.plot(fpr_rf, tpr_rf, label="RF")
ax2.plot(fpr_rf_lm, tpr_rf_lm, label="RF + LR")
ax2.plot(fpr_grd, tpr_grd, label="GBT")
ax2.plot(fpr_grd_lm, tpr_grd_lm, label="GBT + LR")
ax2.set_xlabel("False positive rate")
ax2.set_ylabel("True positive rate")
ax2.set_title("ROC curve (zoomed in at top left)")
ax2.legend(loc="best")

plt.show()


import pandas as pd

dataset = pd.read_csv(
    "https://edunet.kea.su/repo/EduNet-web_dependencies/datasets/titanic.csv"
)  # Load the data using pandas
dataset.head(5)  # Show the first 5 lines


import numpy as np


# The categorical-to-numerical function
# Changed to automatically add column names
def cat_to_num(data):  # one-hot encoding
    categories = set(data)
    features = {}
    for cat in categories:
        binary = data == cat
        if len(set(binary)) == 1:
            # Ignore features where all values equal
            continue
        new_key = f"{data.name}={cat}"

        features[new_key] = binary.astype("int")
    return pd.DataFrame(features)


def cabin_features(data):
    features = []
    for cabin in data:
        cabins = str(cabin).split(" ")
        n_cabins = len(cabins)
        # First char is the cabin_char
        try:
            cabin_char = cabins[0][0]
        except IndexError:
            cabin_char = "X"
            n_cabins = 0
        # The rest is the cabin number
        try:
            cabin_num = int(cabins[0][1:])
        except:
            cabin_num = -1
        # Add 3 features for each passanger
        features.append([cabin_char, cabin_num, n_cabins])
    features = np.array(features)
    dic_of_features = {
        "Cabin_num": features[:, 1].astype("int"),
        "N_cabins": features[:, 2].astype("int"),
    }
    out = pd.DataFrame(dic_of_features)
    char_column = pd.DataFrame({"Cabin_char": features[:, 0]})
    cabin_ch = cat_to_num(char_column["Cabin_char"])
    return out.join(cabin_ch)


def prepare_data(data):
    """Takes a dataframe of raw data and returns ML model features"""

    # Initially, we build a model only on the available numerical values
    features = data.drop(
        [
            "PassengerId",
            "Survived",
            "Fare",
            "Name",
            "Sex",
            "Ticket",
            "Cabin",
            "Embarked",
        ],
        axis=1,
    )

    # Setting missing age values to -1
    features["Age"] = data["Age"].fillna(-1)

    # Adding the sqrt of the fare feature
    features["sqrt_Fare"] = np.sqrt(data["Fare"])

    # Adding gender categorical value
    features = features.join(cat_to_num(data["Sex"]))

    # Adding Embarked categorical value
    features = features.join(cat_to_num(data["Embarked"]))

    # Split cabin
    features = features.join(cabin_features(data["Cabin"]))

    return features


features = prepare_data(dataset)  # Create variable features
features.head(5)  # Display first 5 rows


from scipy import stats
from sklearn.model_selection import train_test_split

features = prepare_data(dataset)  # produce feature
x_train, x_test, y_train, y_test = train_test_split(
    features, dataset["Survived"], test_size=0.2, random_state=42
)

correlations = []  # create a storage for correlations
for column in features:
    r, p_value = stats.pearsonr(x_train[column], y_train)  # compute Pearson and R
    correlations.append((column, abs(r)))  # add to storage

df = pd.DataFrame(correlations, columns=["Column", "Correlation"]).sort_values(
    "Correlation", ascending=False
)
df.head(df.shape[0])


from sklearn.metrics import roc_auc_score

features = prepare_data(dataset)
x_train, x_test, y_train, y_test = train_test_split(
    features, dataset["Survived"], test_size=0.2, random_state=42
)


rocs = []  # create a storage for ROCs
for column in features:
    # use feature as score directly
    r1 = roc_auc_score(y_score=x_train[column], y_true=y_train)
    # use feature as score in reversed manner
    r2 = roc_auc_score(y_score=-x_train[column], y_true=y_train)
    r = max(r1, r2)
    rocs.append((column, r))

df = pd.DataFrame(rocs, columns=["Column", "Rocs"]).sort_values(
    "Rocs", ascending=False
)  # sort from highest to lowest
df.head(df.shape[0])


import joblib
import sys

sys.modules["sklearn.externals.joblib"] = joblib


from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

sfs = SequentialFeatureSelector(
    LogisticRegression(max_iter=1000),
    k_features=8,  # number of features to select
    forward=True,
    floating=False,
    scoring="accuracy",
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
)
sfs.fit(x_train, y_train)

df = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
df.head(df.shape[0])


import matplotlib.pyplot as plt
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

plot_sfs(sfs.get_metric_dict(), kind="std_dev")

plt.title("Sequential Forward Selection (StdDev)")
plt.grid()
plt.show()


sffs = SequentialFeatureSelector(
    LogisticRegression(max_iter=1000),  # represents the classifier
    k_features=8,  # the number of features you want to select
    forward=True,  # add features
    floating=True,  # remove features
    scoring="accuracy",  # means that the selection will be decided by the accuracy of the classifier.
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
)

sffs.fit(x_train.values, y_train)  # performs the actual SFFS algorithm
df = pd.DataFrame.from_dict(sffs.get_metric_dict()).T
df.head(df.shape[0])


plot_sfs(sffs.get_metric_dict(), kind="std_dev")

plt.title("Sequential Forward Selection (StdDev)")
plt.grid()
plt.show()


lr = LogisticRegression(max_iter=1000)
lr.fit(x_train, y_train)

df = pd.DataFrame(lr.coef_[0], x_train.columns, columns=["Coef"]).sort_values(
    "Coef", key=abs, ascending=False
)
df.head(df.shape[0])


from sklearn.feature_selection import SelectFromModel

# 1. A SelectFromModel instance selects the features
# whose coefficients are non-zero when the feature is included in the model.
# 2. The LogisticRegression instance runs the logistic regression
# algorithm on the training data.

# selecting features based on importance weights
lr_selector = SelectFromModel(LogisticRegression(max_iter=1000))
lr_selector.fit(x_train, y_train)

SelectFromModel(estimator=LogisticRegression(max_iter=1000))

SelectFromModel(estimator=LogisticRegression(max_iter=1000))

LogisticRegression(max_iter=1000)

LogisticRegression(max_iter=1000)


x_train.columns[lr_selector.get_support()]  # Get a mask of the features selected

Index(['Pclass', 'Sex=male', 'Sex=female', 'N_cabins', 'Cabin_char=F',
       'Cabin_char=G', 'Cabin_char=C', 'Cabin_char=E'],
      dtype='object')


lr_selector.transform(x_train)  # select only relevant features

array([[1., 1., 0., ..., 0., 1., 0.],
       [2., 1., 0., ..., 0., 0., 0.],
       [3., 1., 0., ..., 0., 0., 0.],
       ...,
       [3., 1., 0., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.]])


from sklearn.ensemble import RandomForestClassifier

# select features with RFC
rf = RandomForestClassifier(n_estimators=500, random_state=42)

rf_selector = SelectFromModel(rf)
rf_selector.fit(x_train, y_train)  # Fit it on the training data

x_train.columns[rf_selector.get_support()]

Index(['Pclass', 'Age', 'sqrt_Fare', 'Sex=male', 'Sex=female', 'Cabin_num'], dtype='object')


from sklearn.inspection import permutation_importance

model = RandomForestClassifier(n_estimators=500, random_state=42)
model.fit(x_train, y_train)

r = permutation_importance(model, x_test, y_test, n_repeats=100, random_state=42)

df = pd.DataFrame({"name": x_train.columns, "imp": r.importances_mean}).sort_values(
    "imp", ascending=False
)
df


import seaborn as sns

plt.figure(figsize=(8, 8))
sns.barplot(data=df, y="name", x="imp", color="blue", orient="h")
plt.show()


from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(x_train, y_train)

r = permutation_importance(model, x_test, y_test, n_repeats=100, random_state=42)

df = pd.DataFrame({"name": x_train.columns, "imp": r.importances_mean}).sort_values(
    "imp", ascending=False
)
df


plt.figure(figsize=(8, 8))
sns.barplot(data=df, y="name", x="imp", color="blue", orient="h")
plt.show()


model = RandomForestClassifier(n_estimators=500, random_state=42)
model.fit(x_train, y_train)

RandomForestClassifier(n_estimators=500, random_state=42)

RandomForestClassifier(n_estimators=500, random_state=42)


pred = model.predict_proba(x_test)[:, 1]
r1 = roc_auc_score(y_score=pred, y_true=y_test)

print(f"ROC-AUC: {r1:.4f}")

ROC-AUC: 0.8894


!pip install -q boruta

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.6/56.6 kB 5.6 MB/s eta 0:00:00


print(type(x_train))
print(type(x_train.values))

<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>


from boruta import BorutaPy

# define Boruta feature selection method
model = RandomForestClassifier(n_estimators=500, random_state=42)


feat_selector = BorutaPy(model, n_estimators=100, verbose=2, random_state=42)

# find all relevant features
feat_selector.fit(x_train.values, y_train.values)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	21
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	21
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	21
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	21
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	21
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	21
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	21
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	3
Tentative: 	1
Rejected: 	17
Iteration: 	9 / 100
Confirmed: 	3
Tentative: 	1
Rejected: 	17
Iteration: 	10 / 100
Confirmed: 	3
Tentative: 	1
Rejected: 	17
Iteration: 	11 / 100
Confirmed: 	3
Tentative: 	1
Rejected: 	17
Iteration: 	12 / 100
Confirmed: 	3
Tentative: 	0
Rejected: 	18


BorutaPy finished running.

Iteration: 	13 / 100
Confirmed: 	3
Tentative: 	0
Rejected: 	18

BorutaPy(estimator=RandomForestClassifier(random_state=RandomState(MT19937) at 0x7ABE867D0F40),
         n_estimators=100, random_state=RandomState(MT19937) at 0x7ABE867D0F40,
         verbose=2)

BorutaPy(estimator=RandomForestClassifier(random_state=RandomState(MT19937) at 0x7ABE867D0F40),
         n_estimators=100, random_state=RandomState(MT19937) at 0x7ABE867D0F40,
         verbose=2)

RandomForestClassifier(random_state=RandomState(MT19937) at 0x7ABE867D0F40)

RandomForestClassifier(random_state=RandomState(MT19937) at 0x7ABE867D0F40)


# zip my names, ranks, and decisions in a single iterable
feature_ranks = list(
    zip(x_train.columns, feat_selector.ranking_, feat_selector.support_)
)

# iterate through print out the results and remove features with low rank
for feat in feature_ranks:
    print("Feature: {:<25} Rank: {},  Keep: {}".format(feat[0], feat[1], feat[2]))
    if feat[2] == False:
        del x_train[feat[0]]

Feature: Pclass                    Rank: 3,  Keep: False
Feature: Age                       Rank: 2,  Keep: False
Feature: SibSp                     Rank: 5,  Keep: False
Feature: Parch                     Rank: 6,  Keep: False
Feature: sqrt_Fare                 Rank: 1,  Keep: True
Feature: Sex=male                  Rank: 1,  Keep: True
Feature: Sex=female                Rank: 1,  Keep: True
Feature: Embarked=C                Rank: 9,  Keep: False
Feature: Embarked=S                Rank: 8,  Keep: False
Feature: Embarked=Q                Rank: 11,  Keep: False
Feature: Cabin_num                 Rank: 4,  Keep: False
Feature: N_cabins                  Rank: 15,  Keep: False
Feature: Cabin_char=T              Rank: 19,  Keep: False
Feature: Cabin_char=F              Rank: 16,  Keep: False
Feature: Cabin_char=G              Rank: 18,  Keep: False
Feature: Cabin_char=C              Rank: 13,  Keep: False
Feature: Cabin_char=E              Rank: 11,  Keep: False
Feature: Cabin_char=B              Rank: 11,  Keep: False
Feature: Cabin_char=A              Rank: 17,  Keep: False
Feature: Cabin_char=n              Rank: 7,  Keep: False
Feature: Cabin_char=D              Rank: 14,  Keep: False


x_train


# Build the model with the random forest algorithm

model = RandomForestClassifier(n_estimators=500, random_state=42)
model.fit(x_train, y_train)

pred = model.predict_proba(x_test[x_train.columns])[:, 1]
r1 = roc_auc_score(y_score=pred, y_true=y_test)

print(f"ROC-AUC: {r1:.4f}")

ROC-AUC: 0.8243


np.random.seed(42)


pat_cnt = 500  # patients
snv_count = 100000  # all features(binary)

genes = [f"SNP{ind}" for ind in range(snv_count)]  # features names

# Generate 2 data sets, healthy and diseased patients.
# Each data set is a binary vector of length `snv_count`,
# in other words a SNV count vector of length 100000.

genes = [f"SNP{ind}" for ind in range(snv_count)]
healthy = pd.DataFrame(
    np.random.choice([0, 1], size=(pat_cnt, snv_count)), columns=genes
)
# We add a `State` column, indicating whether it's healthy or diseased.
healthy["State"] = "H"
diseased = pd.DataFrame(
    np.random.choice([0, 1], size=(pat_cnt, snv_count)), columns=genes
)
diseased["State"] = "D"

patients = pd.concat([healthy, diseased], axis=0)

# We drop the State column to get a `x` and a `y` matrix.
x = patients.drop("State", axis=1)
y = patients["State"]


x.head()


from sklearn.metrics import average_precision_score, accuracy_score

# 1. Split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(
    x, y == "D", test_size=0.3, random_state=42
)

# 2. Train a logistic regression model on the train set
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

# 3. Predict the probabilities for train and test sets
# 4. Calculate ROCAUC and PRAUC scores for the prediction of train and test sets
# 5. Compare the performance of the model on train and test sets using the scores

y_train_pred = model.predict_proba(x_train)[:, 1]
train_rocauc = roc_auc_score(y_score=y_train_pred, y_true=y_train)
train_prauc = average_precision_score(y_score=y_train_pred, y_true=y_train)
train_accuracy = accuracy_score(y_pred=y_train_pred > 0.5, y_true=y_train)
print("Train quality:")
print(f"ROCAUC : {train_rocauc:.02f}")
print(f"PRAUC : {train_prauc:.02f}")
print(f"Accuracy:  {train_accuracy:.02f}")
# Test
y_test_pred = model.predict_proba(x_test)[:, 1]
test_rocauc = roc_auc_score(y_score=y_test_pred, y_true=y_test)
test_prauc = average_precision_score(y_score=y_test_pred, y_true=y_test)
test_accuracy = accuracy_score(y_pred=y_test_pred > 0.5, y_true=y_test)
print("\nTest quality:")
print(f"ROCAUC : {test_rocauc:.02f}")
print(f"PRAUC : {test_prauc:.02f}")
print(f"Accuracy:  {test_accuracy:.02f}")

Train quality:
ROCAUC : 1.00
PRAUC : 1.00
Accuracy:  1.00

Test quality:
ROCAUC : 0.49
PRAUC : 0.52
Accuracy:  0.48


# 1. Take the mean of all the reads for each gene in healthy
#  and each gene in diesised.
# 2. Subtract the mean number of reads for each gene in diesised
# from the mean number of reads for each gene in healthy.

diffs = x[y == "H"].mean(axis=0) - x[y == "D"].mean(axis=0)
# 3. Look at the top k most different genes
# by sorting the values in the resulting array from largest to smallest.
top = np.abs(diffs).sort_values(ascending=False)[0:10]
genes = top.index

# Print the gene names of the top k genes.
print("Genes", genes)

# Select x
x_selected = x[genes]

Genes Index(['SNP3660', 'SNP54022', 'SNP96099', 'SNP77184', 'SNP71144', 'SNP70126',
       'SNP14768', 'SNP63912', 'SNP17706', 'SNP32249'],
      dtype='object')


# 1. Split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(
    x_selected, y == "D", test_size=0.3, random_state=42
)
# 2. Train a logistic regression model on the train set
model = LogisticRegression()
model.fit(x_train, y_train)

# 3. Predict the probabilities for train and test sets
# 4. Calculate ROCAUC and PRAUC scores for the prediction of train and test sets
# 5. Compare the performance of the model on train and test sets using the scores
y_train_pred = model.predict_proba(x_train)[:, 1]
train_rocauc = roc_auc_score(y_score=y_train_pred, y_true=y_train)
train_prauc = average_precision_score(y_score=y_train_pred, y_true=y_train)
train_accuracy = accuracy_score(y_pred=y_train_pred > 0.5, y_true=y_train)
print("Train quality:")
print(f"ROCAUC : {train_rocauc:.02f}")
print(f"PRAUC : {train_prauc:.02f}")
print(f"Accuracy: accuracy {train_accuracy:.02f}")
# Test
y_test_pred = model.predict_proba(x_test)[:, 1]
train_rocauc = roc_auc_score(y_score=y_test_pred, y_true=y_test)
train_prauc = average_precision_score(y_score=y_test_pred, y_true=y_test)
train_accuracy = accuracy_score(y_pred=y_test_pred > 0.5, y_true=y_test)
print("\nTest quality:")
print(f"ROCAUC : {train_rocauc:.02f}")
print(f"PRAUC : {train_prauc:.02f}")
print(f"Accuracy: accuracy {train_accuracy:.02f}")

Train quality:
ROCAUC : 0.72
PRAUC : 0.71
Accuracy: accuracy 0.67

Test quality:
ROCAUC : 0.72
PRAUC : 0.70
Accuracy: accuracy 0.65


# Split the data into train and test sets (with two sizes)
x_fs_train, x_test, y_fs_train, y_test = train_test_split(
    x, y == "D", test_size=0.3, random_state=42
)
# split again
x_fs, x_train, y_fs, y_train = train_test_split(
    x_fs_train, y_fs_train, test_size=0.8, random_state=42
)


# 1. Find the difference between the mean expression
#    of the genes
# 2. Sort the resulting list according to the difference in means
#    (from greatest difference to least)
# 3. Take the top K genes and return them

diffs = x_fs[np.logical_not(y_fs)].mean(axis=0) - x_fs[y_fs].mean(axis=0)
top = np.abs(diffs).sort_values(ascending=False)[0:10]
genes = top.index


model = LogisticRegression()
model.fit(x_train[genes], y_train)
y_train_pred = model.predict_proba(x_train[genes])[:, 1]

y_train_pred = model.predict_proba(x_train[genes])[:, 1]
train_rocauc = roc_auc_score(y_score=y_train_pred, y_true=y_train)
train_prauc = average_precision_score(y_score=y_train_pred, y_true=y_train)
train_accuracy = accuracy_score(y_pred=y_train_pred > 0.5, y_true=y_train)
print("Train quality:")
print(f"ROCAUC : {train_rocauc:.02f}")
print(f"PRAUC : {train_prauc:.02f}")
print(f"Accuracy: accuracy {train_accuracy:.02f}")

Train quality:
ROCAUC : 0.57
PRAUC : 0.56
Accuracy: accuracy 0.56


y_test_pred = model.predict_proba(x_test[genes])[:, 1]
train_rocauc = roc_auc_score(y_score=y_test_pred, y_true=y_test)
train_prauc = average_precision_score(y_score=y_test_pred, y_true=y_test)
train_accuracy = accuracy_score(y_pred=y_test_pred > 0.5, y_true=y_test)
print("Test quality:")
print(f"ROCAUC : {train_rocauc:.02f}")
print(f"PRAUC : {train_prauc:.02f}")
print(f"Accuracy: {train_accuracy:.02f}")

Test quality:
ROCAUC : 0.52
PRAUC : 0.50
Accuracy: 0.52


import pandas as pd
import numpy as np


# The categorical-to-numerical function
# Changed to automatically add column names
def cat_to_num(data):  # one-hot encoding
    categories = set(data)
    features = {}
    for cat in categories:
        binary = data == cat
        if len(set(binary)) == 1:
            # Ignore features where all values equal
            continue
        new_key = f"{data.name}={cat}"

        features[new_key] = binary.astype("int")
    return pd.DataFrame(features)


def cabin_features(data):
    features = []
    for cabin in data:
        cabins = str(cabin).split(" ")
        n_cabins = len(cabins)
        # First char is the cabin_char
        try:
            cabin_char = cabins[0][0]
        except IndexError:
            cabin_char = "X"
            n_cabins = 0
        # The rest is the cabin number
        try:
            cabin_num = int(cabins[0][1:])
        except:
            cabin_num = -1
        # Add 3 features for each passanger
        features.append([cabin_char, cabin_num, n_cabins])
    features = np.array(features)
    dic_of_features = {
        "Cabin_num": features[:, 1].astype("int"),
        "N_cabins": features[:, 2].astype("int"),
    }
    out = pd.DataFrame(dic_of_features)
    char_column = pd.DataFrame({"Cabin_char": features[:, 0]})
    cabin_ch = cat_to_num(char_column["Cabin_char"])
    return out.join(cabin_ch)


def prepare_data(data):
    """Takes a dataframe of raw data and returns ML model features"""

    # Initially, we build a model only on the available numerical values
    features = data.drop(
        [
            "PassengerId",
            "Survived",
            "Fare",
            "Name",
            "Sex",
            "Ticket",
            "Cabin",
            "Embarked",
        ],
        axis=1,
    )

    # Setting missing age values to -1
    features["Age"] = data["Age"].fillna(-1)

    # Adding the sqrt of the fare feature
    features["sqrt_Fare"] = np.sqrt(data["Fare"])

    # Adding gender categorical value
    features = features.join(cat_to_num(data["Sex"]))

    # Adding Embarked categorical value
    features = features.join(cat_to_num(data["Embarked"]))

    # Split cabin
    features = features.join(cabin_features(data["Cabin"]))

    return features


from sklearn.model_selection import train_test_split

# 1. Importing the data from the .csv
# 2. Pre-processing the data and creating a feature set
# 3. Splitting the data into training and test data and labels

dataset = pd.read_csv(
    "https://edunet.kea.su/repo/EduNet-web_dependencies/datasets/titanic.csv"
)
features = prepare_data(dataset)
x_train, x_test, y_train, y_test = train_test_split(
    features, dataset["Survived"], test_size=0.2, random_state=42
)


import sklearn
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# performing PCA with default number of principal components.
titanic_pca = sklearn.decomposition.PCA()
titanic_pca.fit(x_train)  # fitting our PCA model with the training data.
# calculating the explained variance of each of the components.
evr = titanic_pca.explained_variance_ratio_

# We are plotting the explained variance ratios.
plt.bar(range(evr.shape[0]), evr)
plt.title("Variance by components")
plt.xlabel("Components")
plt.ylabel("Variance")
plt.show()


titanic_pca.explained_variance_.shape

(21,)


from sklearn.linear_model import LogisticRegression

# 1.The first thing we do is to fit a PCA model to the training set
#   of the Titanic dataset with n components from 1 to 10.
# 2.We then fit a logistic regression model to the transformed training data
#   and make predictions on the transformed test set.

for i in range(1, 11):
    titanic_pca = sklearn.decomposition.PCA(n_components=i)
    titanic_pca.fit(x_train)

    x_train_reduced = titanic_pca.transform(x_train)
    model = LogisticRegression(max_iter=1000)
    model.fit(x_train_reduced, y_train)

    x_test_reduced = titanic_pca.transform(x_test)

    # prints the number of PCA components and the score
    # for the corresponding model.
    print("%i first components %.2f" % (i, model.score(x_test_reduced, y_test)))

1 first components 0.61
2 first components 0.62
3 first components 0.68
4 first components 0.68
5 first components 0.75
6 first components 0.78
7 first components 0.78
8 first components 0.79
9 first components 0.79
10 first components 0.79


# First, we import the StandardScaler module.
from sklearn.preprocessing import StandardScaler

# Next, we create a StandardScaler object called scaler by calling the
# StandardScaler() function.
scaler = StandardScaler()

#  We then fit the scaling model to our training data.
x_train = scaler.fit_transform(x_train)

# We transform your test set by applying the same scaling model.
x_test = scaler.transform(x_test)

# performing PCA with default number of principal components.
titanic_pca = sklearn.decomposition.PCA()
titanic_pca.fit(x_train)  # fitting our PCA model with the training data.

PCA()

PCA()


titanic_pca.components_.shape

(21, 21)


# calculating the explained variance of each of the components.
evr = titanic_pca.explained_variance_ratio_

# We are plotting the explained variance ratios.
plt.bar(range(evr.shape[0]), evr)
plt.title("Variance by components")
plt.xlabel("Components")
plt.ylabel("Variance")
plt.show()


# 1. Reading in the scRNAseq data.
scRNAseq = pd.read_csv(
    "https://edunet.kea.su/repo/EduNet-web_dependencies/datasets/scRNAseq_CITEseq.txt",
    sep="\t",
)

x_scRNAseq = scRNAseq.iloc[:, 0:-1]  # features
y_scRNAseq = scRNAseq.iloc[:, -1]  # labels

# 2. taking the log of the data.
x_scRNAseq = np.log(x_scRNAseq + 1)
print(f"dataset shape: {scRNAseq.shape}")

dataset shape: (8617, 977)


# 1. We're calculating the explained variance ratio for
#    each component of the PCA.
# 2. We're plotting these ratios in a chart.

pca = PCA(n_components=x_train.shape[1])
pca.fit(x_train)

ths = 0.95
total_explained = np.cumsum(pca.explained_variance_ratio_)

plt.figure(figsize=(6, 6))

plt.plot(np.arange(1, total_explained.shape[0] + 1), total_explained)
plt.axhline(xmin=0, xmax=1000, y=ths, c="red", ls="--")
chosen_number = np.where(total_explained >= 0.95)[0][0] + 1
plt.axvline(x=chosen_number, ymin=0, ymax=ths, c="red", ls="--")
plt.xticks(np.arange(1, x_train.shape[1]))
plt.ylabel("total sum of  proportion  of the explained variance")
plt.xlabel("Num of components", size=14)

plt.show()


# 1. First, we create a PCA object that you fit to the training data.
# 2. Then, we create a a scatter plot where we plot the explained
#    variance ratio as a function of the number of PCA components.
# 3. We also plot the explained variance ratio as a function of the
#    number of components, but with a smooth curve.
# 4. Finally, we show the plot.

n_comp = x_train.shape[1]

explained = pca.explained_variance_ratio_

plt.figure(figsize=(6, 6))
plt.scatter(np.arange(1, n_comp + 1), explained)
plt.plot(np.arange(1, n_comp + 1), explained)
plt.title("Dependence of  variance on the number of components", size=14)
plt.xlabel("Num of components", size=14)
plt.ylabel("proportion of the explained variance", size=14)
plt.xticks(np.arange(1, x_train.shape[1]))
plt.show()


from tqdm import tqdm
from scipy.stats import norm


def shuffle_dataset(dataset):
    random_data = {}
    for col in dataset.columns:
        random_data[col] = np.random.permutation(dataset.loc[:, col].values)
    random_data = pd.DataFrame(random_data)
    return random_data


def get_variance_by_chance(dataset, n_replics, n_components):
    variance_explained_by_chance = np.zeros((n_replics, n_components))
    for i in tqdm(range(n_replics)):
        random_data = shuffle_dataset(dataset)
        random_pca = PCA(n_components=n_components)
        random_pca.fit(random_data)
        variance_explained_by_chance[i, :] = random_pca.explained_variance_ratio_
    return variance_explained_by_chance


def get_pc_variance(dataset, n_components):
    pca = PCA(n_components=n_components)
    pca.fit(dataset)
    return pca.explained_variance_ratio_


def plot_mean_and_CI(
    ax,
    values,
    label,
    ci_level=0.95,
    alpha_transparency=0.5,
    color_mean=None,
    color_shading=None,
):
    mean = values.mean(axis=0)

    std = values.std(axis=0)
    n = values.shape[1]
    se = std / np.sqrt(n)

    q_alpha = (1 - ci_level) / 2
    ci_num = np.abs(norm.ppf(q_alpha, loc=0, scale=1))

    lb = mean - ci_num * se
    ub = mean + ci_num * se

    # plot the shaded range of the confidence intervals
    ax.fill_between(
        range(mean.shape[0]), ub, lb, color=color_shading, alpha=alpha_transparency
    )
    # plot the mean on top
    ax.plot(mean, c=color_mean, lw=3, label=label)


def plot_explained_variance(ax, variance):
    ax.plot(variance, label="real", lw=3)
    ax.scatter(np.arange(0, variance.shape[0]), variance)


def plot_variance_by_change(ax, variance_by_chance):
    plot_mean_and_CI(
        ax, variance_by_chance, label="chance", color_mean="red", color_shading="red"
    )


def calc_permutat_pval(real_values, permut_values, eps=None):
    eps = eps or (1 / (permut_values.shape[0] * 10))

    p_values = np.zeros_like(real_values)
    for i in range(0, p_values.shape[0], 1):
        p_values[i] = (permut_values[:, i] >= real_values[i]).mean() + eps
    return p_values


def plot_explained_vs_chance(
    ax, explained_variance, variance_by_chance, dataset_name, step=1
):
    plot_explained_variance(ax, explained_variance)
    plot_variance_by_change(ax, variance_by_chance)

    ax.set_title(f"PCA {dataset_name}", size=25)
    ax.set_xlabel("Component number", size=15)
    ax.set_ylabel("Explained variance ration", size=15)
    ax.set_xticks(np.arange(0, explained_variance.shape[0], step))
    ax.set_xticklabels(np.arange(1, explained_variance.shape[0] + 1, step), size=10)

    ax.tick_params(labelsize=10, size=8)
    ax.set_ylim(0, explained_variance[0] + 0.1)
    ax.legend(fontsize=15)


def plot_pval_plot(ax, p_values, dataset_name, alpha_level=0.05, logscale=True, step=1):
    if logscale:
        p_values = -np.log10(p_values)
        alpha_level = -np.log10(alpha_level)

    ax.set_title(f"PC significance, {dataset_name}", size=25)
    ax.plot(p_values, lw=3)
    ax.scatter(np.arange(0, p_values.shape[0]), p_values, lw=3)

    ax.set_xlabel("Component number", size=15)
    ax.set_ylabel("-log(pvalue + eps)", size=15)
    ax.set_xticks(np.arange(0, p_values.shape[0], step))

    ax.set_xticklabels(labels=np.arange(1, p_values.shape[0] + 1, step), size=10)
    ax.tick_params(labelsize=10, size=8)

    ax.hlines(
        y=alpha_level,
        xmin=0,
        xmax=p_values.shape[0],
        color="red",
        linestyles="dashed",
        lw=3,
    )


def pca_analysis(ax1, ax2, dataset, title, n_components=None, n_replics=1000, step=1):
    n_components = n_components or dataset.shape[1]
    explained_variance = get_pc_variance(dataset, n_components)
    variance_by_chance = get_variance_by_chance(dataset, n_replics, n_components)
    p_values = calc_permutat_pval(explained_variance, variance_by_chance)
    plot_explained_vs_chance(ax1, explained_variance, variance_by_chance, title)
    plot_pval_plot(ax2, p_values, title)


np.random.seed(42)
f, (ax1, ax2) = plt.subplots(2, 1)
f.set_figheight(7)
f.set_figwidth(7)
plt.subplots_adjust(top=1.7)
pca_analysis(ax1, ax2, pd.DataFrame(x_train), "Titanic", n_replics=10, n_components=10)

100%|██████████| 10/10 [00:00<00:00, 114.46it/s]


first_component = titanic_pca.components_[0]


import seaborn as sns

sns.set_style("whitegrid")

plt.figure(figsize=(7, 7))

b = sns.barplot(
    x=first_component,
    y=features.columns,
    orient="h",
    hue=[z < 0 for z in first_component],
    palette=["blue", "red"],
)
b.legend_.remove()
plt.show()


plt.figure(figsize=(7, 7))

sns.barplot(x=np.abs(first_component), y=features.columns, orient="h", color="blue")
plt.show()


# 1. Generate x with N rows and P columns using the normal distribution.
# 2. Make a new vector with a third element that's either 0 or 3.
# 3. Append this vector to x, so that we now have a Nxp matrix.

np.random.seed(seed=42)
N = 200
P = 5

x = np.random.normal(size=[N, P])
print("x before", x.shape)
x = np.append(x, np.random.choice([0, 3], size=[N, 1]), axis=1)
print("x after", x.shape)

x before (200, 5)
x after (200, 6)


# 1. Fitting PCA to the data, and reducing it to 2 components
# 2. Transforming that data using the PCA to get the low-dimensional representation
# 3. Plotting it with matplotlib

pca = PCA(2)
low_d = pca.fit_transform(x)
plt.figure(figsize=(7, 7))
plt.scatter(low_d[:, 0], low_d[:, 1])
plt.show()


# 1. Normalize the data
# 2. Take the output from the PCA function and assign it to our variable low_d
# 3. Plot the first two components of low_d as a scatter plot.

x_scaled = StandardScaler().fit_transform(x)
low_d = pca.fit_transform(x_scaled)
plt.figure(figsize=(7, 7))
plt.scatter(low_d[:, 0], low_d[:, 1])
plt.show()


# 1. First we import the pandas Python package.
# 2. Then we read in the file that contains the RNA-seq data.
#    The RNA-seq data is stored in a tab-delimited file (.tab extension),
#    which is why we use the read_table pandas method.
#    The read_table method is named this way because
#    it reads tab-delimited data by default.
#    We want to tell the read_table method that the data is tab delimited,
#    that is why we supply the '\t' argument.
# 3. We want to tell the read_table method that the first column
#    contains our sample IDs, which is the column with index 0.
#    We do this by telling it to use the index_col argument and
#    setting that equal to 0.
# 4. We don't actually need the header information in this specific file,
#    so we don't have to tell pandas to parse the header information.
# 5. We assign the RNA-seq data to an object named rnadata

rnadataset = pd.read_table(
    "https://edunet.kea.su/repo/EduNet-web_dependencies/datasets/rnaseq_data.tab.txt",
    index_col=0,
    header=None,
)
print("dataset shape: ", rnadataset.shape)

dataset shape:  (358, 71586)


rnadataset.columns = list(rnadataset.columns[:-2]) + ["dataset", "sample type"]
rnadataset.head()


# We remove the dataset and sample type columns from the data frame and
# store the data frame in X
x = rnadataset.drop(labels=["dataset", "sample type"], axis=1)

# We store the dataset and sample type columns in the labels data frame
labels = rnadataset.loc[:, ["dataset", "sample type"]]


# Finding the top two principal components of the data
pca_decomposer = PCA(n_components=2)
pca_decomposer.fit(x)

PCA(n_components=2)

PCA(n_components=2)


# Run PCA on the features
x_reduced = pca_decomposer.transform(x)

# Display a scatterplot of the transformed dataset
plt.figure(figsize=(8, 6))
plt.title("PCA plot", size=24)
plt.xlabel("PC1", size=16)
plt.ylabel("PC2", size=16)
sns.scatterplot(x=x_reduced[:, 0], y=x_reduced[:, 1], hue=labels["sample type"]);


# Run PCA on the features
x_reduced = pca_decomposer.transform(x)

# Display a scatterplot of the transformed dataset
plt.figure(figsize=(8, 6))
plt.title("PCA plot", size=24)
plt.xlabel("PC1", size=16)
plt.ylabel("PC2", size=16)
sns.scatterplot(
    x=x_reduced[:, 0],
    y=x_reduced[:, 1],
    hue=labels["sample type"],
    style=labels["dataset"],
);


# http://conradsanderson.id.au/lfwcrop/ (LFWcrop Face Dataset, greyscale version)
# 1. Downloading the LFWcrop dataset from the website.
# 2. Unzipping the dataset from the downloaded file.
# 3. Opening the directory with the faces.

dir = "lfwcrop_grey/faces"

# http://conradsanderson.id.au/lfwcrop/ (LFWcrop Face Dataset, greyscale version)
!wget -q https://edunet.kea.su/repo/EduNet-web_dependencies/datasets/lfwcrop_grey.zip
!unzip -q lfwcrop_grey.zip


import os

plt.rcParams["axes.grid"] = False


def show_faces(imgs, titles, h=64, w=64):
    plt.figure(figsize=(16, 4))
    for i in range(min(imgs.shape[0], 5)):
        plt.subplot(1, 5, i + 1)
        plt.imshow(imgs[i].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(titles[i])


# Get first 1000 files
celebrity_photos = os.listdir(dir)[1:1001]
celebrity_imgs = [dir + "/" + photo for photo in celebrity_photos]
# Load images from disk
imgs = np.array([plt.imread(img) for img in celebrity_imgs], dtype=np.float64)
# Extract real celebrity name from file name
celebrity_names = [
    name[: name.find("0") - 1].replace("_", " ") for name in celebrity_photos
]
print(imgs[0].shape)
show_faces(imgs, celebrity_names)

(64, 64)


# Stretch to vector
x = imgs.reshape(imgs.shape[0], 64 * 64)
print(x.shape)
mean = np.mean(x, axis=0)
# Center: substract mean
centered_faces = x - mean
plt.imshow(mean.reshape(64, 64), cmap=plt.cm.gray)
plt.show()

(1000, 4096)


# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
# n_components == min(n_samples, n_features)

# 1. Load the faces data.
# 2. Apply PCA.
# 3. Extract the principal components.
# 4. Reshape the principal components so that they have the right shape.
# 5. Display the reshaped principal components.

pca_faces = sklearn.decomposition.PCA()  # 1000x4096

pca_faces.fit(centered_faces)
eigenfaces = pca_faces.components_
reshaped_eigenfaces = eigenfaces.reshape((1000, 64, 64))
eigenface_titles = ["eigenface %d" % i for i in range(reshaped_eigenfaces.shape[0])]
show_faces(reshaped_eigenfaces, eigenface_titles)


def create_embedding(img, n_components):
    # Generate embedding for first image using only 10 first components
    img = img.reshape(64 * 64) - mean
    emb = np.dot(img, eigenfaces[:n_components].T)  # (1,4096) * (4096,1)
    # print(emb,emb.shape) # 10 - 500 numbers only!

    # Recover image from embedding
    recovered_img = np.dot(emb, eigenfaces[:n_components])
    recovered_img += mean  # shift by mean
    return emb, recovered_img


# Show images recovered from embeddings of various sizes
original_img = imgs[0]
titles = []
img_list = []
for n in [10, 25, 100, 500]:
    embedding, recovered = create_embedding(original_img, n)
    img_list.append(recovered)
    titles.append(f"Components {n}")
img_list.append(original_img)
titles.append("Original")

show_faces(np.array(img_list, dtype=object), titles)


# https://scikit-learn.org/stable/auto_examples/decomposition/plot_kernel_pca.html
from sklearn.datasets import make_circles

np.random.seed(42)

# 1. Make_circles creates a data set of 400 points that form concentric circles with a gap of 50 points.
# 2. The factor parameter controls the size of the inner circles.
# 3. The noise parameter controls the amount of noise added to the data.
# 4. The result is a 360-feature dataset of concentric circles with gaps.

x, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=42)


plt.figure(figsize=(5, 5))

plt.title("Original space")
reds = y == 0
blues = y == 1

plt.scatter(x[reds, 0], x[reds, 1], c="red", s=20, edgecolor="k")
plt.scatter(x[blues, 0], x[blues, 1], c="blue", s=20, edgecolor="k")
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
plt.show()


pca = PCA()
x_pca = pca.fit_transform(x)
plt.figure(figsize=(5, 5))
plt.scatter(x_pca[reds, 0], x_pca[reds, 1], c="red", s=20, edgecolor="k")
plt.scatter(x_pca[blues, 0], x_pca[blues, 1], c="blue", s=20, edgecolor="k")
plt.title("Projection by PCA")
plt.xlabel("1st principal component")
plt.ylabel("2nd component")
plt.show()


from sklearn.decomposition import KernelPCA

# 1. Create a PCA object to perform the PCA transformation
#    using the RBF kernel (specified using kernel="rbf").
#    Setting fit_inverse_transform=True. This will make the object use the
#    transformed data from the first step when transforming new, unseen data points.
# 2. Let the PCA object fit and transform the data,
#    then get the transformed data back.

kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10)
x_kpca = kpca.fit_transform(x)

plt.figure(figsize=(5, 5))
plt.scatter(x_kpca[reds, 0], x_kpca[reds, 1], c="red", s=20, edgecolor="k")
plt.scatter(x_kpca[blues, 0], x_kpca[blues, 1], c="blue", s=20, edgecolor="k")
plt.title("Projection by KPCA")
plt.xlabel(r"1st principal component in space induced by $\phi$")
plt.ylabel("2nd component")
plt.show()


# 1. The 'kpca' variable is a KernelPCA object that is initialized with 'n_components' set to 2.
# 2. Then it applies the kernel function specified in the 'kernel' variable  and then transforms the data based on the kernel, and gets the transformed data.
# 3. Then it returns the transformed data.
# 4. Then we get the inverse transformation by simply calling "kpca.inverse_transform(x_kpca)"
# 5. Finally, we plot the transformed data.

x_back = kpca.inverse_transform(x_kpca)
plt.figure(figsize=(5, 5))
plt.scatter(x_back[reds, 0], x_back[reds, 1], c="red", s=20, edgecolor="k")
plt.scatter(x_back[blues, 0], x_back[blues, 1], c="blue", s=20, edgecolor="k")
plt.title("Original space after inverse transform")
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")

plt.tight_layout()
plt.show()


import sklearn.manifold

# 1. Firstly we reduce the dimensionality of the data to 6 features using PCA.
# 2. Then we take the first two PCA components and use this
#    as an initial approximation for the T-SNE algorithm.
# 3. Then we fit T-SNE on the data and plot the first two dimensions
#    of the T-SNE output, which are represented in green.
# 4. The visualization makes clear that there are distinct clusters in our data

x_reduced = PCA(n_components=6).fit_transform(x_scRNAseq)
model = sklearn.manifold.TSNE(
    n_components=2,
    init=x_reduced[:, 0:2],  # often use as a reasonable approximation
    perplexity=40,  # important parameter
    verbose=2,
    learning_rate="auto",
)

manifold = model.fit_transform(x_reduced)

plt.figure(figsize=(10, 5))
plt.scatter(manifold[:, 0], manifold[:, 1], c=y_scRNAseq, cmap="tab20", s=20)
plt.title("TSNE: scRNAseq", fontsize=25)
plt.xlabel("TSNE1", fontsize=22)
plt.ylabel("TSNE2", fontsize=22)
plt.show()

[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 8617 samples in 0.012s...
[t-SNE] Computed neighbors for 8617 samples in 0.543s...
[t-SNE] Computed conditional probabilities for sample 1000 / 8617
[t-SNE] Computed conditional probabilities for sample 2000 / 8617
[t-SNE] Computed conditional probabilities for sample 3000 / 8617
[t-SNE] Computed conditional probabilities for sample 4000 / 8617
[t-SNE] Computed conditional probabilities for sample 5000 / 8617
[t-SNE] Computed conditional probabilities for sample 6000 / 8617
[t-SNE] Computed conditional probabilities for sample 7000 / 8617
[t-SNE] Computed conditional probabilities for sample 8000 / 8617
[t-SNE] Computed conditional probabilities for sample 8617 / 8617
[t-SNE] Mean sigma: 0.336887
[t-SNE] Computed conditional probabilities in 0.326s
[t-SNE] Iteration 50: error = 72.8291931, gradient norm = 0.0087157 (50 iterations in 6.128s)
[t-SNE] Iteration 100: error = 72.3467102, gradient norm = 0.0023780 (50 iterations in 4.831s)
[t-SNE] Iteration 150: error = 72.2015839, gradient norm = 0.0015488 (50 iterations in 4.784s)
[t-SNE] Iteration 200: error = 72.0850143, gradient norm = 0.0012886 (50 iterations in 5.579s)
[t-SNE] Iteration 250: error = 71.9976196, gradient norm = 0.0010311 (50 iterations in 4.534s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 71.997620
[t-SNE] Iteration 300: error = 2.2893496, gradient norm = 0.0108292 (50 iterations in 5.201s)
[t-SNE] Iteration 350: error = 1.9024110, gradient norm = 0.0103034 (50 iterations in 3.778s)
[t-SNE] Iteration 400: error = 1.7143979, gradient norm = 0.0090468 (50 iterations in 3.329s)
[t-SNE] Iteration 450: error = 1.6102605, gradient norm = 0.0081399 (50 iterations in 3.981s)
[t-SNE] Iteration 500: error = 1.5437438, gradient norm = 0.0075030 (50 iterations in 2.984s)
[t-SNE] Iteration 550: error = 1.4965279, gradient norm = 0.0069944 (50 iterations in 2.938s)
[t-SNE] Iteration 600: error = 1.4613835, gradient norm = 0.0066300 (50 iterations in 3.909s)
[t-SNE] Iteration 650: error = 1.4336526, gradient norm = 0.0063218 (50 iterations in 2.845s)
[t-SNE] Iteration 700: error = 1.4119039, gradient norm = 0.0059573 (50 iterations in 2.865s)
[t-SNE] Iteration 750: error = 1.3946626, gradient norm = 0.0054747 (50 iterations in 2.852s)
[t-SNE] Iteration 800: error = 1.3813536, gradient norm = 0.0047092 (50 iterations in 3.823s)
[t-SNE] Iteration 850: error = 1.3716450, gradient norm = 0.0039969 (50 iterations in 2.870s)
[t-SNE] Iteration 900: error = 1.3644423, gradient norm = 0.0033520 (50 iterations in 2.836s)
[t-SNE] Iteration 950: error = 1.3591452, gradient norm = 0.0027038 (50 iterations in 2.801s)
[t-SNE] Iteration 1000: error = 1.3549483, gradient norm = 0.0023919 (50 iterations in 3.640s)
[t-SNE] KL divergence after 1000 iterations: 1.354948


from IPython.display import clear_output

!pip install -q umap-learn
!pip install -q --upgrade tbb
clear_output()


from umap import UMAP

# Converts the original expression matrix (scRNAseq) into a 9-dimensional PCA space
x_reduced = PCA(n_components=9).fit_transform(x_scRNAseq)

# Initializes UMAP with the PCA components
model = UMAP(
    n_components=2,
    min_dist=1,
    n_neighbors=93,
    init=x_reduced[:, 0:2],
    # it is recommended to use the first two components of PCA for initialization of UMAP and t-SNE
    n_epochs=1000,
    verbose=2,
)

# Runs the UMAP algorithm on the PCA transformed data
umap = model.fit_transform(x_reduced)
clear_output()
# Plots the results of the UMAP transformation
plt.figure(figsize=(10, 5))
plt.scatter(umap[:, 0], umap[:, 1], c=y_scRNAseq, cmap="tab20", s=20)
plt.title("UMAP: scRNAseq", fontsize=25)
plt.xlabel("UMAP1", fontsize=22)
plt.ylabel("UMAP2", fontsize=22)
plt.show()


from sklearn import datasets

digits = datasets.load_digits(n_class=10)
x = digits.data
y = digits.target
n_samples, n_features = x.shape
print(x.shape)

(1797, 64)


from sklearn import manifold


# t-SNE embedding of the digits dataset
tsne = manifold.TSNE(n_components=2, init="pca", random_state=42, learning_rate="auto")
x_tsne = tsne.fit_transform(x)


umap = UMAP(n_neighbors=5)
x_umap = umap.fit_transform(x)


from matplotlib import offsetbox


def plot_embedding(x, title=None):
    x_min, x_max = np.min(x, 0), np.max(x, 0)
    x = (x - x_min) / (x_max - x_min)  # normalization of x to (0..1) range

    plt.figure()
    ax = plt.subplot(111)
    for i in range(x.shape[0]):
        plt.text(
            x[i, 0],
            x[i, 1],
            str(y[i]),
            color=plt.cm.Set1(y[i] / 10.0),
            fontdict={"weight": "bold", "size": 9},
        )

    if hasattr(offsetbox, "AnnotationBbox"):
        # only print thumbnails with matplotlib > 1.0
        shown_imgs = np.array([[1.0, 1.0]])  # just something big
        for i in range(x.shape[0]):
            dist = np.sum((x[i] - shown_imgs) ** 2, 1)
            if np.min(dist) < 4e-3:
                # don't show points that are too close
                continue
            shown_imgs = np.r_[shown_imgs, [x[i]]]
            img_box = offsetbox.AnnotationBbox(
                offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r), x[i]
            )
            ax.add_artist(img_box)
    plt.xticks([]), plt.yticks([])
    if title is not None:
        plt.title(title)

    plt.figure()
    ax = plt.subplot(111)
    for i in range(x.shape[0]):
        plt.text(
            x[i, 0],
            x[i, 1],
            str(y[i]),
            color=plt.cm.Set1(y[i] / 10.0),
            fontdict={"weight": "bold", "size": 9},
        )
    plt.xticks([]), plt.yticks([])
    if title is not None:
        plt.title(title)


# t-SNE embedding of the digits dataset
plot_embedding(x_tsne, "t-SNE embedding of the digits")


# UMAP embedding of the digits
plot_embedding(x_umap, "UMAP embedding of the digits")


import sklearn.cluster as cluster

kmeans_labels_on_raw = cluster.KMeans(n_clusters=10, n_init="auto").fit_predict(x)


from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score


def plot_clustering_metrics(true_l, pled_l, title):
    ari = adjusted_rand_score(true_l, pled_l)
    ami = adjusted_mutual_info_score(true_l, pled_l)
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))
    plt.title(f"Clustering metrics for {title}\n\n 1.0 is best")
    width = 0.75
    ind = np.arange(2)
    ax.barh(ind, [ari, ami], width)
    ax.grid(axis="x")
    ax.set_xlim([0, 1.0])
    for i, v in enumerate([ari, ami]):
        ax.text(v + 0.01, i, f"{v:1.2f}", color="black")
    ax.set_yticks(ind)
    ax.set_yticklabels(["ARI", "AMI"], minor=False)
    plt.show()


plot_clustering_metrics(y, kmeans_labels_on_raw, "kNN on raw dataset")


kmeans_labels_on_x_tsne = cluster.KMeans(n_clusters=10, n_init="auto").fit_predict(
    x_tsne
)
kmeans_labels_on_x_umap = cluster.KMeans(n_clusters=10, n_init="auto").fit_predict(
    x_umap
)


plot_clustering_metrics(y, kmeans_labels_on_x_tsne, "kNN on t-SNE data")


plot_clustering_metrics(y, kmeans_labels_on_x_umap, "kNN on UMAP data")

	Age	Fare
0	22.0	7.2500
1	38.0	71.2833
2	26.0	7.9250
3	35.0	53.1000
4	35.0	8.0500

	Age	SibSp	Parch	Fare
331	45.5	0	0	28.5000
733	23.0	0	0	13.0000
382	32.0	0	0	7.9250
704	26.0	1	0	7.8542
813	6.0	4	2	31.2750

category	new_feature
food	3
equipment	2
food	3
food	3
equipment	2
clothes	1

category	new_feature
food	2.3
equipment	2.04
food	2.3
clothes	2.6
equipment	2.04
clothes	2.6

category	product	price	new_feature
food	pizza	30	17
equipment	hammer	140	170
food	cucumber	4	17
clothes	boots	100	60
equipment	helmet	200	170
clothes	gloves	20	60

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

	SibSp	Parch	Fare	Pclass	Sex	Embarked
118	0	1	247.5208	1	1	2
536	0	0	26.5500	1	1	0
361	1	0	27.7208	2	1	2
29	0	0	7.8958	3	1	0
55	0	0	35.5000	1	1	0

	x_1	x_2	z
0	0.261024	0.122538	0.083149
1	-0.245087	0.202270	0.100981
2	0.489471	0.882643	1.018641
3	0.368505	0.055743	0.138904
4	-0.981276	0.193832	1.000475
...	...	...	...
395	-0.617948	0.865083	1.130229
396	-0.898005	-0.210396	0.850679
397	-0.089725	0.296459	0.095939
398	0.099591	-0.320610	0.112709
399	-1.015698	-0.164124	1.058579

	Pclass	Age	SibSp	sqrt_Fare	Sex=male	Sex=female	Embarked=C	Embarked=S	...	N_cabins	Cabin_char=C	Cabin_char=n
0	3	22.0	1	2.692582	1	0	0	1	...	1	0	1
1	1	38.0	1	8.442944	0	1	1	0	...	1	1	0
2	3	26.0	0	2.815138	0	1	0	1	...	1	0	1
3	1	35.0	1	7.286975	0	1	0	1	...	1	1	0
4	3	35.0	0	2.837252	1	0	0	1	...	1	0	1

	Column	Correlation
5	Sex=male	0.541750
6	Sex=female	0.541750
0	Pclass	0.321750
19	Cabin_char=n	0.300371
4	sqrt_Fare	0.295597
10	Cabin_num	0.237024
17	Cabin_char=B	0.176650
7	Embarked=C	0.159632
16	Cabin_char=E	0.144024
8	Embarked=S	0.142371
15	Cabin_char=C	0.127315
20	Cabin_char=D	0.123186
3	Parch	0.078311
13	Cabin_char=F	0.055922
11	N_cabins	0.051495
2	SibSp	0.047602
1	Age	0.043465
12	Cabin_char=T	0.029137
9	Embarked=Q	0.006097
18	Cabin_char=A	0.005813
14	Cabin_char=G	0.005783

	Column	Rocs
6	Sex=female	0.765614
5	Sex=male	0.765614
4	sqrt_Fare	0.677138
0	Pclass	0.673802
10	Cabin_num	0.629328
19	Cabin_char=n	0.629101
8	Embarked=S	0.564660
7	Embarked=C	0.562676
3	Parch	0.558794
17	Cabin_char=B	0.540978
1	Age	0.535687
15	Cabin_char=C	0.534187
2	SibSp	0.533822
16	Cabin_char=E	0.526825
20	Cabin_char=D	0.520489
11	N_cabins	0.507450
13	Cabin_char=F	0.507429
9	Embarked=Q	0.501748
12	Cabin_char=T	0.501126
18	Cabin_char=A	0.500706
14	Cabin_char=G	0.500387

	feature_idx	cv_scores	avg_score	feature_names	ci_bound	std_dev	std_err
1	(5,)	[0.8181818181818182, 0.7482517482517482, 0.753...	0.787935	(Sex=male,)	0.040774	0.031724	0.015862
2	(2, 5)	[0.8181818181818182, 0.7622377622377622, 0.746...	0.79214	(SibSp, Sex=male)	0.043685	0.033989	0.016994
3	(0, 2, 5)	[0.8181818181818182, 0.7832167832167832, 0.746...	0.799153	(Pclass, SibSp, Sex=male)	0.039486	0.030721	0.015361
4	(0, 2, 3, 5)	[0.8181818181818182, 0.7832167832167832, 0.746...	0.799153	(Pclass, SibSp, Parch, Sex=male)	0.039486	0.030721	0.015361
5	(0, 2, 3, 5, 9)	[0.8181818181818182, 0.7832167832167832, 0.746...	0.800561	(Pclass, SibSp, Parch, Sex=male, Embarked=Q)	0.040679	0.03165	0.015825
6	(0, 2, 3, 5, 9, 11)	[0.8181818181818182, 0.7832167832167832, 0.746...	0.800561	(Pclass, SibSp, Parch, Sex=male, Embarked=Q, N...	0.040679	0.03165	0.015825
7	(0, 2, 3, 5, 9, 11, 12)	[0.8181818181818182, 0.7832167832167832, 0.746...	0.800561	(Pclass, SibSp, Parch, Sex=male, Embarked=Q, N...	0.040679	0.03165	0.015825
8	(0, 2, 3, 5, 9, 11, 12, 13)	[0.8181818181818182, 0.7832167832167832, 0.746...	0.800561	(Pclass, SibSp, Parch, Sex=male, Embarked=Q, N...	0.040679	0.03165	0.015825

	Coef
Sex=female	1.347877
Sex=male	-1.335789
Cabin_char=E	0.852875
Cabin_char=F	0.842839
Cabin_char=G	-0.637927
Cabin_char=C	-0.557529
Pclass	-0.538057
N_cabins	-0.494651
Cabin_char=n	-0.392910
Cabin_char=D	0.378228
Cabin_char=T	-0.308945
SibSp	-0.303337
Cabin_char=A	-0.287940
Embarked=C	0.216808
Embarked=S	-0.203422
Embarked=Q	-0.131855
Cabin_char=B	0.123397
sqrt_Fare	0.110342
Parch	-0.107805
Age	-0.010998
Cabin_num	0.002733

	name	imp
6	Sex=female	0.048994
5	Sex=male	0.023240
0	Pclass	0.022514
4	sqrt_Fare	0.022291
1	Age	0.010447
3	Parch	0.010223
16	Cabin_char=E	0.009609
7	Embarked=C	0.002626
2	SibSp	0.002235
8	Embarked=S	0.001844
20	Cabin_char=D	0.001229
12	Cabin_char=T	0.000000
13	Cabin_char=F	-0.000056
14	Cabin_char=G	-0.000223
11	N_cabins	-0.000279
9	Embarked=Q	-0.000782
17	Cabin_char=B	-0.000838
15	Cabin_char=C	-0.005978
18	Cabin_char=A	-0.015698
10	Cabin_num	-0.016536
19	Cabin_char=n	-0.016927

	name	imp
5	Sex=male	0.111689
6	Sex=female	0.111689
0	Pclass	0.059201
1	Age	0.023061
4	sqrt_Fare	0.015694
20	Cabin_char=D	0.009864
2	SibSp	0.008806
16	Cabin_char=E	0.007650
13	Cabin_char=F	0.004585
8	Embarked=S	0.002909
10	Cabin_num	0.001179
19	Cabin_char=n	0.001033
3	Parch	0.000848
9	Embarked=Q	0.000727
17	Cabin_char=B	0.000485
15	Cabin_char=C	0.000005
12	Cabin_char=T	0.000000
7	Embarked=C	-0.000649
18	Cabin_char=A	-0.000721
11	N_cabins	-0.003324
14	Cabin_char=G	-0.005103

	sqrt_Fare	Sex=male	Sex=female
331	5.338539	1	0
733	3.605551	1	0
382	2.815138	1	0
704	2.802535	1	0
813	5.592406	0	1
...	...	...	...
106	2.765863	0	1
270	5.567764	1	0
860	3.756102	1	0
435	10.954451	0	1
102	8.791331	1	0

	SNP0	SNP1	SNP2	SNP3	SNP4	SNP5	SNP6	SNP7	SNP8	SNP9	...	SNP99990	SNP99991	SNP99992	SNP99993	SNP99994	SNP99995	SNP99996	SNP99997	SNP99998	SNP99999
0	0	1	0	0	0	1	0	0	0	1	...	1	1	1	0	1	0	1	0	1	1
1	0	1	1	1	1	1	0	0	0	1	...	0	1	1	0	1	1	0	1	0	1
2	1	1	1	0	0	0	1	1	0	1	...	1	1	0	1	0	1	0	1	1	1
3	0	1	1	1	1	1	1	1	1	1	...	0	0	0	1	1	1	0	1	0	1
4	1	0	1	1	0	1	1	1	0	1	...	0	0	1	1	1	0	1	0	1	1

	1	2	3	4	5	6	7	8	9	10	...	71577	71578	71579	71580	71581	71582	71583	71584	dataset	sample type
0
GSM1296956	13.374975	3.536581	13.644486	3.929925	5.485977	9.363128	13.134106	4.318162	10.050190	10.605277	...	11.111910	6.889096	10.636753	6.656603	11.054070	6.914937	8.949687	8.982860	GSE53622	cancer
GSM1296957	13.555346	4.772572	14.153843	4.388201	5.412374	9.339831	13.789576	4.211175	11.242888	10.518348	...	10.998240	8.220715	10.645032	5.799432	10.951782	5.358962	8.951818	8.147058	GSE53622	normal
GSM1296958	13.396705	4.804828	13.948490	4.395992	5.627752	7.867446	13.424588	4.097212	10.568927	10.666406	...	10.498048	8.145627	11.452488	6.164146	11.492929	6.189310	9.091511	10.021106	GSE53622	cancer
GSM1296959	13.843843	4.563550	14.390648	4.697154	5.511075	8.943584	14.181927	4.766994	10.418466	10.924152	...	10.680012	8.450327	10.966135	6.482977	10.869259	6.683605	9.321499	9.278717	GSE53622	normal
GSM1296960	13.505687	4.750858	14.049400	4.476174	5.753380	8.475744	14.255647	4.344796	10.189663	10.651861	...	9.778142	7.615217	10.570247	5.861632	11.168351	6.343246	8.793520	11.562505	GSE53622	cancer

	SNP0	SNP1	SNP2	SNP3	SNP4	SNP5	SNP6	SNP7	SNP8	SNP9	...	SNP99990	SNP99991	SNP99992	SNP99993	SNP99994	SNP99995	SNP99996	SNP99997	SNP99998	SNP99999
0	0	1	0	0	0	1	0	0	0	1	...	1	1	1	0	1	0	1	0	1	1
1	0	1	1	1	1	1	0	0	0	1	...	0	1	1	0	1	1	0	1	0	1
2	1	1	1	0	0	0	1	1	0	1	...	1	1	0	1	0	1	0	1	1	1
3	0	1	1	1	1	1	1	1	1	1	...	0	0	0	1	1	1	0	1	0	1
4	1	0	1	1	0	1	1	1	0	1	...	0	0	1	1	1	0	1	0	1	1

Генерация признаков¶

Типы признаков¶

Вещественные¶

Категориальные¶

Преобразования признаков¶

Категориальных признаков¶

Label encoding¶

One-hot encoding¶

Count encoding¶

Кодирование по вещественному признаку¶

Target encoding¶

Embedding¶

Кодирование циклических категориальных признаков¶

Вещественных признаков¶

Бинаризация¶

Округление¶

Binning (Бинирование)¶

Fixed-width binning¶

Binning by Instinct¶

Adaptive Binning¶

Логарифмирование¶

Кодирование взаимодействия признаков¶

Генерация признаков при помощи модели¶

Практический пример работы с признаками¶

sklearn.preprocessing¶

Добавление в модель признаков, полученных на основе другой модели¶

Примеры данных, которые нецелесообразно отправлять в модель в сыром виде¶

Отбор признаков¶

Зачем отбирать признаки¶

Полный перебор¶

Одномерный отбор признаков¶

Формализация задачи¶

Корреляция¶

Проблемы одномерного отбора признаков¶

Жадный отбор признаков¶

ADD-DEL¶

Отбор признаков на основе моделей¶

Randomization/Permutation¶

Пример отбора признаков с помощью permutation importance¶

Boruta¶

Пример отбора признаков с помощью Boruta¶

Dropped variable importance¶

Отбор признаков — это тоже выбор гиперпарметров¶

Пример обучения на большом числе бесполезных признаков¶

Без отбора признаков¶

С неправильной процедурой отбора признаков¶

С правильной процедурой отбора признаков¶

Задача понижения размерности¶

Manifold assumption¶

PCA (Метод главных компонент)¶

Максимизация дисперсии выборки после понижения размерности¶

Пример с Титаником¶

Без стандартизации¶

Со стандартизацией¶

Как выбирать оптимальное число компонент¶

По доле объясняемой дисперсии¶

По правилу локтя¶

Перестановочный метод¶

Вклад исходных признаков в компоненты¶

Важность стандартизации¶

Пример с RNA-Seq — нахождение выбросов¶

Пример с лицами¶

Проблемы PCA¶

Интересное направление в данных может не совпадать с направлением максимальной дисперсии.¶

Выбранные оси могут вообще не подходить для нашей задачи¶

Недостатки линейного PCA¶

Kernel PCA Ядровой (нелинейный) метод главных компонент¶

Kernel trick¶

Пример¶

Методы, основанные на сохранении расстояний¶

t-SNE (t-distributed stochastic neighbor embedding)¶

Описываем расстояния в исходном пространстве¶

Описываем расстояния в пространстве низкой размерности¶

Оптимизируем низкоразмерное представление¶

Пример применения¶

Важные параметры t-SNE¶

perplexity¶

metric¶

learning_rate¶

Проблемы t-SNE¶

	SNP0	SNP1	SNP2	SNP3	SNP4	SNP5	SNP6	SNP7	SNP8	SNP9	...	SNP99990	SNP99991	SNP99992	SNP99993	SNP99994	SNP99995	SNP99996	SNP99997	SNP99998	SNP99999
0	0	1	0	0	0	1	0	0	0	1	...	1	1	1	0	1	0	1	0	1	1
1	0	1	1	1	1	1	0	0	0	1	...	0	1	1	0	1	1	0	1	0	1
2	1	1	1	0	0	0	1	1	0	1	...	1	1	0	1	0	1	0	1	1	1
3	0	1	1	1	1	1	1	1	1	1	...	0	0	0	1	1	1	0	1	0	1
4	1	0	1	1	0	1	1	1	0	1	...	0	0	1	1	1	0	1	0	1	1