k Nearest Houses¶

hogwarts_py.jpeg

Download data


1. Load data¶

1.1 Libraries¶

In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier

1.2 Data¶

In [2]:
hogwarts = pd.read_csv("super_heroes_hogwarts_v3a.csv")
hogwarts.head()
Out[2]:
ID Name Gender Race Height Publisher Alignment Weight Manipulative Resourceful ... HouseID House STR DEX CON INT WIS CHA Level HP
0 A001 A-Bomb Male Human 203.0 Marvel Comics good 441.0 10 10 ... 1 Slytherin 18 11 17 12 13 11 1 7
1 A002 Abe Sapien Male Icthyo Sapien 191.0 Dark Horse Comics good 65.0 7 7 ... 1 Slytherin 16 17 10 13 15 11 8 72
2 A004 Abomination Male Human / Radiation 203.0 Marvel Comics bad 441.0 6 8 ... 1 Slytherin 13 14 13 10 18 15 15 135
3 A009 Agent 13 Female NaN 173.0 Marvel Comics good 61.0 7 7 ... 1 Slytherin 15 18 16 16 17 10 14 140
4 A015 Alex Mercer Male Human NaN Wildstorm bad NaN 10 6 ... 1 Slytherin 14 17 13 12 10 11 9 72

5 rows × 26 columns

In [3]:
hogwarts.columns.values.tolist()
Out[3]:
['ID',
 'Name',
 'Gender',
 'Race',
 'Height',
 'Publisher',
 'Alignment',
 'Weight',
 'Manipulative',
 'Resourceful',
 'Dismissive',
 'Intelligent',
 'Trusting',
 'Loyal',
 'Stubborn',
 'Brave',
 'HouseID',
 'House',
 'STR',
 'DEX',
 'CON',
 'INT',
 'WIS',
 'CHA',
 'Level',
 'HP']
In [4]:
pd.DataFrame(hogwarts.columns.values, columns = ["Variables"])
Out[4]:
Variables
0 ID
1 Name
2 Gender
3 Race
4 Height
5 Publisher
6 Alignment
7 Weight
8 Manipulative
9 Resourceful
10 Dismissive
11 Intelligent
12 Trusting
13 Loyal
14 Stubborn
15 Brave
16 HouseID
17 House
18 STR
19 DEX
20 CON
21 INT
22 WIS
23 CHA
24 Level
25 HP

Select only the required variables.

In [5]:
hogwarts_2 = hogwarts.loc[:, "Manipulative":"House"]
hogwarts_2.head()
Out[5]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave HouseID House
0 10 10 7 6 7 7 7 9 1 Slytherin
1 7 7 6 8 6 7 6 9 1 Slytherin
2 6 8 1 6 3 3 5 2 1 Slytherin
3 7 7 1 9 7 4 6 6 1 Slytherin
4 10 6 8 3 4 4 1 8 1 Slytherin
In [6]:
hogwarts_2 = hogwarts_2.drop(columns = ["HouseID"])
hogwarts_2.head()
Out[6]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave House
0 10 10 7 6 7 7 7 9 Slytherin
1 7 7 6 8 6 7 6 9 Slytherin
2 6 8 1 6 3 3 5 2 Slytherin
3 7 7 1 9 7 4 6 6 Slytherin
4 10 6 8 3 4 4 1 8 Slytherin

Above, loc selects a contiguous range of columns by label, and HouseID is then dropped. Alternatively, list the required columns explicitly with loc.

In [7]:
hogwarts.loc[:, ["Manipulative", "Resourceful", "Dismissive", "Intelligent", 
                 "Trusting", "Loyal", "Stubborn", "Brave", "House"]]
Out[7]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave House
0 10 10 7 6 7 7 7 9 Slytherin
1 7 7 6 8 6 7 6 9 Slytherin
2 6 8 1 6 3 3 5 2 Slytherin
3 7 7 1 9 7 4 6 6 Slytherin
4 10 6 8 3 4 4 1 8 Slytherin
... ... ... ... ... ... ... ... ... ...
729 2 3 3 5 7 8 8 8 Gryffindor
730 3 3 7 7 4 4 7 8 Gryffindor
731 6 6 2 8 7 3 8 6 Gryffindor
732 4 7 2 4 6 7 9 9 Gryffindor
733 3 7 7 3 8 4 9 6 Gryffindor

734 rows × 9 columns

Or select the columns by integer position with iloc (see the note on np.r_ after the output).

In [8]:
import numpy as np
hogwarts.iloc[:, np.r_[8:16, 17]]
Out[8]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave House
0 10 10 7 6 7 7 7 9 Slytherin
1 7 7 6 8 6 7 6 9 Slytherin
2 6 8 1 6 3 3 5 2 Slytherin
3 7 7 1 9 7 4 6 6 Slytherin
4 10 6 8 3 4 4 1 8 Slytherin
... ... ... ... ... ... ... ... ... ...
729 2 3 3 5 7 8 8 8 Gryffindor
730 3 3 7 7 4 4 7 8 Gryffindor
731 6 6 2 8 7 3 8 6 Gryffindor
732 4 7 2 4 6 7 9 9 Gryffindor
733 3 7 7 3 8 4 9 6 Gryffindor

734 rows × 9 columns
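np.r_ simply concatenates slices and single indices into one integer array, so the call above picks out columns 8 to 15 (Manipulative to Brave) plus column 17 (House). A quick check of what it builds:

np.r_[8:16, 17]
# array([ 8,  9, 10, 11, 12, 13, 14, 15, 17])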

2. Data partitioning¶

2.1 Training-Validation split¶

In [9]:
trainData, validData = train_test_split(hogwarts_2, test_size = 0.4, random_state = 666)
In [10]:
trainData.head()
Out[10]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave House
650 5 5 5 9 9 4 9 8 Gryffindor
479 3 7 5 3 7 6 5 7 Hufflepuff
271 3 8 9 7 3 1 2 3 Ravenclaw
647 5 6 2 8 6 8 9 8 Gryffindor
307 6 7 10 6 7 1 4 9 Ravenclaw
In [11]:
validData.head()
Out[11]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave House
389 4 2 4 5 8 6 7 6 Hufflepuff
131 7 7 8 9 3 4 5 3 Slytherin
657 7 1 8 5 8 6 8 8 Gryffindor
421 4 9 2 4 7 8 6 9 Hufflepuff
160 6 9 5 2 4 4 1 4 Slytherin
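train_test_split draws rows at random, so the house proportions can drift a little between the two sets. If that matters, the split can be stratified on House; a sketch, not used in the rest of this notebook, with illustrative variable names:

# Stratified alternative: keep the house proportions similar in both sets
trainData_s, validData_s = train_test_split(hogwarts_2, test_size = 0.4, random_state = 666,
                                            stratify = hogwarts_2["House"])
trainData_s["House"].value_counts()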

2.2 Normalisation¶

In [12]:
predictors = ["Manipulative", "Resourceful", "Dismissive", "Intelligent", "Trusting", "Loyal", "Stubborn", "Brave"]
outcome = "House"

Fit the normalisation model (the standard scaler) on the training set only, so information from the validation set does not leak into the scaling.

In [13]:
scaler = preprocessing.StandardScaler()
In [14]:
scaler.fit(trainData[predictors])
Out[14]:
StandardScaler()
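The fitted scaler stores the training-set mean and standard deviation of each predictor and standardises each value as z = (x - mean) / std. A quick way to inspect what it learned (a sketch):

# Means and standard deviations learned from the training set
pd.DataFrame({"mean": scaler.mean_, "std": scaler.scale_}, index = predictors)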

Transform the whole data set.

In [15]:
hogwarts_2_norm = pd.DataFrame(scaler.transform(hogwarts_2[predictors]),
                              columns = predictors)
hogwarts_2_norm
Out[15]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
0 1.780586 1.691429 0.494040 0.099603 0.484763 0.511163 0.499954 1.376814
1 0.512171 0.378084 0.064440 0.993995 0.052115 0.511163 0.071144 1.376814
2 0.089366 0.815866 -2.083559 0.099603 -1.245832 -1.115685 -0.357667 -1.605314
3 0.512171 0.378084 -2.083559 1.441191 0.484763 -0.708973 0.071144 0.098759
4 1.780586 -0.059698 0.923640 -1.241985 -0.813183 -0.708973 -2.072907 0.950795
... ... ... ... ... ... ... ... ...
729 -1.601855 -1.373043 -1.224360 -0.347593 0.484763 0.917875 0.928764 0.950795
730 -1.179050 -1.373043 0.494040 0.546799 -0.813183 -0.708973 0.499954 0.950795
731 0.089366 -0.059698 -1.653959 0.993995 0.484763 -1.115685 0.928764 0.098759
732 -0.756245 0.378084 -1.653959 -0.794789 0.052115 0.511163 1.357574 1.376814
733 -1.179050 0.378084 0.494040 -1.241985 0.917412 -0.708973 1.357574 0.098759

734 rows × 8 columns

In [16]:
# Or spell out the names
pd.DataFrame(scaler.transform(hogwarts_2[predictors]),
                              columns = ["Manipulative", "Resourceful", "Dismissive", "Intelligent", "Trusting", "Loyal", "Stubborn", "Brave"])
Out[16]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
0 1.780586 1.691429 0.494040 0.099603 0.484763 0.511163 0.499954 1.376814
1 0.512171 0.378084 0.064440 0.993995 0.052115 0.511163 0.071144 1.376814
2 0.089366 0.815866 -2.083559 0.099603 -1.245832 -1.115685 -0.357667 -1.605314
3 0.512171 0.378084 -2.083559 1.441191 0.484763 -0.708973 0.071144 0.098759
4 1.780586 -0.059698 0.923640 -1.241985 -0.813183 -0.708973 -2.072907 0.950795
... ... ... ... ... ... ... ... ...
729 -1.601855 -1.373043 -1.224360 -0.347593 0.484763 0.917875 0.928764 0.950795
730 -1.179050 -1.373043 0.494040 0.546799 -0.813183 -0.708973 0.499954 0.950795
731 0.089366 -0.059698 -1.653959 0.993995 0.484763 -1.115685 0.928764 0.098759
732 -0.756245 0.378084 -1.653959 -0.794789 0.052115 0.511163 1.357574 1.376814
733 -1.179050 0.378084 0.494040 -1.241985 0.917412 -0.708973 1.357574 0.098759

734 rows × 8 columns

2.3 Generate normalised training and validation sets¶

Concatenate with the target variable from the original set.

In [17]:
hogwarts_2_norm_full = pd.concat([hogwarts_2_norm, hogwarts_2["House"]], axis = 1)
hogwarts_2_norm_full
Out[17]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave House
0 1.780586 1.691429 0.494040 0.099603 0.484763 0.511163 0.499954 1.376814 Slytherin
1 0.512171 0.378084 0.064440 0.993995 0.052115 0.511163 0.071144 1.376814 Slytherin
2 0.089366 0.815866 -2.083559 0.099603 -1.245832 -1.115685 -0.357667 -1.605314 Slytherin
3 0.512171 0.378084 -2.083559 1.441191 0.484763 -0.708973 0.071144 0.098759 Slytherin
4 1.780586 -0.059698 0.923640 -1.241985 -0.813183 -0.708973 -2.072907 0.950795 Slytherin
... ... ... ... ... ... ... ... ... ...
729 -1.601855 -1.373043 -1.224360 -0.347593 0.484763 0.917875 0.928764 0.950795 Gryffindor
730 -1.179050 -1.373043 0.494040 0.546799 -0.813183 -0.708973 0.499954 0.950795 Gryffindor
731 0.089366 -0.059698 -1.653959 0.993995 0.484763 -1.115685 0.928764 0.098759 Gryffindor
732 -0.756245 0.378084 -1.653959 -0.794789 0.052115 0.511163 1.357574 1.376814 Gryffindor
733 -1.179050 0.378084 0.494040 -1.241985 0.917412 -0.708973 1.357574 0.098759 Gryffindor

734 rows × 9 columns

Split the normalised set using the indices of the original split

In [18]:
trainNorm = hogwarts_2_norm_full.iloc[trainData.index]
trainNorm.head()
Out[18]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave House
650 -0.333440 -0.497479 -0.365160 1.441191 1.350061 -0.708973 1.357574 0.950795 Gryffindor
479 -1.179050 0.378084 -0.365160 -1.241985 0.484763 0.104451 -0.357667 0.524777 Hufflepuff
271 -1.179050 0.815866 1.353239 0.546799 -1.245832 -1.929110 -1.644097 -1.179296 Ravenclaw
647 -0.333440 -0.059698 -1.653959 0.993995 0.052115 0.917875 1.357574 0.950795 Gryffindor
307 0.089366 0.378084 1.782839 0.099603 0.484763 -1.929110 -0.786477 1.376814 Ravenclaw
In [19]:
validNorm = hogwarts_2_norm_full.iloc[validData.index]
validNorm.head()
Out[19]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave House
389 -0.756245 -1.810824 -0.794760 -0.347593 0.917412 0.104451 0.499954 0.098759 Hufflepuff
131 0.512171 0.378084 0.923640 1.441191 -1.245832 -0.708973 -0.357667 -1.179296 Slytherin
657 0.512171 -2.248606 0.923640 -0.347593 0.917412 0.104451 0.928764 0.950795 Gryffindor
421 -0.756245 1.253648 -1.653959 -0.794789 0.484763 0.917875 0.071144 1.376814 Hufflepuff
160 0.089366 1.253648 -0.365160 -1.689181 -0.813183 -0.708973 -2.072907 -0.753278 Slytherin
In [20]:
train_X = trainNorm.loc[:, "Manipulative":"Brave"]
train_X
Out[20]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
650 -0.333440 -0.497479 -0.365160 1.441191 1.350061 -0.708973 1.357574 0.950795
479 -1.179050 0.378084 -0.365160 -1.241985 0.484763 0.104451 -0.357667 0.524777
271 -1.179050 0.815866 1.353239 0.546799 -1.245832 -1.929110 -1.644097 -1.179296
647 -0.333440 -0.059698 -1.653959 0.993995 0.052115 0.917875 1.357574 0.950795
307 0.089366 0.378084 1.782839 0.099603 0.484763 -1.929110 -0.786477 1.376814
... ... ... ... ... ... ... ... ...
445 0.089366 1.253648 -1.224360 -0.794789 0.917412 0.917875 -0.357667 1.376814
414 0.934976 -0.497479 0.064440 0.099603 0.052115 0.917875 -1.215287 -1.179296
70 1.357781 0.378084 -2.083559 1.441191 -0.813183 0.917875 -0.357667 0.950795
429 1.357781 -0.059698 1.353239 -1.241985 0.484763 0.511163 0.928764 0.524777
236 -0.333440 0.378084 0.923640 0.993995 0.917412 -0.708973 -1.215287 -1.605314

440 rows × 8 columns

In [21]:
# Alternatively
trainNorm.loc[:, predictors]
Out[21]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
650 -0.333440 -0.497479 -0.365160 1.441191 1.350061 -0.708973 1.357574 0.950795
479 -1.179050 0.378084 -0.365160 -1.241985 0.484763 0.104451 -0.357667 0.524777
271 -1.179050 0.815866 1.353239 0.546799 -1.245832 -1.929110 -1.644097 -1.179296
647 -0.333440 -0.059698 -1.653959 0.993995 0.052115 0.917875 1.357574 0.950795
307 0.089366 0.378084 1.782839 0.099603 0.484763 -1.929110 -0.786477 1.376814
... ... ... ... ... ... ... ... ...
445 0.089366 1.253648 -1.224360 -0.794789 0.917412 0.917875 -0.357667 1.376814
414 0.934976 -0.497479 0.064440 0.099603 0.052115 0.917875 -1.215287 -1.179296
70 1.357781 0.378084 -2.083559 1.441191 -0.813183 0.917875 -0.357667 0.950795
429 1.357781 -0.059698 1.353239 -1.241985 0.484763 0.511163 0.928764 0.524777
236 -0.333440 0.378084 0.923640 0.993995 0.917412 -0.708973 -1.215287 -1.605314

440 rows × 8 columns

In [22]:
train_y = trainNorm["House"]
train_y
Out[22]:
650    Gryffindor
479    Hufflepuff
271     Ravenclaw
647    Gryffindor
307     Ravenclaw
          ...    
445    Hufflepuff
414    Hufflepuff
70      Slytherin
429    Hufflepuff
236     Ravenclaw
Name: House, Length: 440, dtype: object
In [23]:
valid_X = validNorm.loc[:, "Manipulative":"Brave"]
valid_X
Out[23]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
389 -0.756245 -1.810824 -0.794760 -0.347593 0.917412 0.104451 0.499954 0.098759
131 0.512171 0.378084 0.923640 1.441191 -1.245832 -0.708973 -0.357667 -1.179296
657 0.512171 -2.248606 0.923640 -0.347593 0.917412 0.104451 0.928764 0.950795
421 -0.756245 1.253648 -1.653959 -0.794789 0.484763 0.917875 0.071144 1.376814
160 0.089366 1.253648 -0.365160 -1.689181 -0.813183 -0.708973 -2.072907 -0.753278
... ... ... ... ... ... ... ... ...
49 0.089366 0.378084 -1.224360 -1.241985 0.484763 -0.302261 -1.215287 -1.605314
53 1.357781 0.815866 0.064440 -1.689181 -0.813183 -1.929110 -0.357667 -0.753278
569 0.934976 -1.373043 0.064440 -1.689181 1.350061 1.324588 0.071144 0.950795
700 -2.024660 -0.059698 -0.365160 -0.347593 1.350061 -1.522398 0.499954 1.376814
312 0.089366 -0.935261 1.353239 0.546799 -1.245832 0.104451 -0.357667 -0.753278

294 rows × 8 columns

In [24]:
# Alternatively
validNorm.loc[:, predictors]
Out[24]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
389 -0.756245 -1.810824 -0.794760 -0.347593 0.917412 0.104451 0.499954 0.098759
131 0.512171 0.378084 0.923640 1.441191 -1.245832 -0.708973 -0.357667 -1.179296
657 0.512171 -2.248606 0.923640 -0.347593 0.917412 0.104451 0.928764 0.950795
421 -0.756245 1.253648 -1.653959 -0.794789 0.484763 0.917875 0.071144 1.376814
160 0.089366 1.253648 -0.365160 -1.689181 -0.813183 -0.708973 -2.072907 -0.753278
... ... ... ... ... ... ... ... ...
49 0.089366 0.378084 -1.224360 -1.241985 0.484763 -0.302261 -1.215287 -1.605314
53 1.357781 0.815866 0.064440 -1.689181 -0.813183 -1.929110 -0.357667 -0.753278
569 0.934976 -1.373043 0.064440 -1.689181 1.350061 1.324588 0.071144 0.950795
700 -2.024660 -0.059698 -0.365160 -0.347593 1.350061 -1.522398 0.499954 1.376814
312 0.089366 -0.935261 1.353239 0.546799 -1.245832 0.104451 -0.357667 -0.753278

294 rows × 8 columns

In [25]:
valid_y = validNorm["House"]
valid_y
Out[25]:
389    Hufflepuff
131     Slytherin
657    Gryffindor
421    Hufflepuff
160     Slytherin
          ...    
49      Slytherin
53      Slytherin
569    Gryffindor
700    Gryffindor
312     Ravenclaw
Name: House, Length: 294, dtype: object
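Transforming the whole data set and then re-splitting it by index works, but the same predictor and outcome objects can also be built by transforming trainData and validData directly with the fitted scaler. A sketch of the equivalent route (the _alt names are just for illustration):

# Equivalent: scale the two partitions separately with the scaler fitted on trainData
train_X_alt = pd.DataFrame(scaler.transform(trainData[predictors]),
                           columns = predictors, index = trainData.index)
valid_X_alt = pd.DataFrame(scaler.transform(validData[predictors]),
                           columns = predictors, index = validData.index)
train_y_alt = trainData[outcome]
valid_y_alt = validData[outcome]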

3. kNN¶

3.1 k = 3¶

In [26]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
In [27]:
%%capture --no-display
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(train_X, train_y)
accuracy_score(train_y, knn.predict(train_X))
Out[27]:
0.9
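With k = 3, each prediction is a majority vote among the three nearest training characters (Euclidean distance by default). The NearestNeighbors class imported at the top does this distance search on its own; the fitted classifier's kneighbors method exposes the same information. A sketch, looking at the first validation row:

# Distances and positions (within train_X) of the 3 nearest training rows
distances, positions = knn.kneighbors(valid_X.iloc[[0]], n_neighbors = 3)
# Map the positions back to the original row labels and look up the characters
hogwarts.loc[train_X.index[positions[0]], ["Name", "House"]]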

Alternatively, use a loop to find the best k.

In [28]:
%%capture --no-display
for k in range(3, 11):
    knn_loop = KNeighborsClassifier(n_neighbors = k)
    knn_loop.fit(train_X, train_y)
    accuracy = accuracy_score(train_y, knn_loop.predict(train_X))
    print(accuracy)

Or collect the results in a data frame, which is easier to read.

In [29]:
%%capture --no-display
result = []
for k in range(3, 11):
    knn_loop = KNeighborsClassifier(n_neighbors = k)
    knn_loop.fit(train_X, train_y)
    result.append({
        "k_value": k,
        "accuracy value": accuracy_score(train_y, knn_loop.predict(train_X))
    })

result = pd.DataFrame(result)
result
Out[29]:
k_value accuracy value
0 3 0.900000
1 4 0.886364
2 5 0.897727
3 6 0.877273
4 7 0.890909
5 8 0.881818
6 9 0.884091
7 10 0.868182
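Training-set accuracy tends to favour smaller k (with k = 1 it is usually perfect), so, if preferred, the same loop can score each candidate k on the validation set instead; a sketch reusing the objects above:

# Score each candidate k on the validation set rather than the training set
result_valid = []
for k in range(3, 11):
    knn_k = KNeighborsClassifier(n_neighbors = k)
    knn_k.fit(train_X, train_y)
    result_valid.append({
        "k_value": k,
        "validation accuracy": accuracy_score(valid_y, knn_k.predict(valid_X))
    })

pd.DataFrame(result_valid)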

Get training set predictions.

In [30]:
%%capture --no-display

train_y_pred = knn.predict(train_X)
train_y_pred[:20]
Out[30]:
array(['Gryffindor', 'Hufflepuff', 'Ravenclaw', 'Gryffindor', 'Ravenclaw',
       'Hufflepuff', 'Gryffindor', 'Slytherin', 'Hufflepuff', 'Slytherin',
       'Ravenclaw', 'Slytherin', 'Hufflepuff', 'Gryffindor', 'Hufflepuff',
       'Hufflepuff', 'Hufflepuff', 'Gryffindor', 'Gryffindor',
       'Gryffindor'], dtype=object)

Get validation set predictions.

In [31]:
%%capture --no-display

valid_y_pred = knn.predict(valid_X)
valid_y_pred[:20]
Out[31]:
array(['Gryffindor', 'Ravenclaw', 'Gryffindor', 'Gryffindor', 'Slytherin',
       'Slytherin', 'Slytherin', 'Hufflepuff', 'Gryffindor', 'Hufflepuff',
       'Slytherin', 'Hufflepuff', 'Ravenclaw', 'Gryffindor', 'Hufflepuff',
       'Hufflepuff', 'Gryffindor', 'Slytherin', 'Ravenclaw', 'Slytherin'],
      dtype=object)

3.2 Training set prediction¶

In [32]:
from sklearn.metrics import confusion_matrix, accuracy_score

Confusion matrix for the training set.

In [33]:
confusion_matrix_train = confusion_matrix(train_y, train_y_pred)
confusion_matrix_train
Out[33]:
array([[103,   1,   2,   4],
       [  2, 105,   4,   2],
       [  4,   7,  81,   5],
       [  1,   7,   5, 107]], dtype=int64)

A confusion matrix that's easier to read

In [34]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

confusion_matrix_train_display = ConfusionMatrixDisplay(confusion_matrix_train, display_labels = knn.classes_)
confusion_matrix_train_display.plot()
plt.grid(False)
In [35]:
accuracy_score(train_y, train_y_pred)
Out[35]:
0.9
In [36]:
from sklearn.metrics import classification_report

print(classification_report(train_y, train_y_pred))
              precision    recall  f1-score   support

  Gryffindor       0.94      0.94      0.94       110
  Hufflepuff       0.88      0.93      0.90       113
   Ravenclaw       0.88      0.84      0.86        97
   Slytherin       0.91      0.89      0.90       120

    accuracy                           0.90       440
   macro avg       0.90      0.90      0.90       440
weighted avg       0.90      0.90      0.90       440
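As a sanity check, each precision and recall figure in the report can be read off the confusion matrix, whose rows are the true classes and columns the predicted classes in knn.classes_ order; a sketch for Gryffindor:

# Precision = TP / (TP + FP) down the predicted column; recall = TP / (TP + FN) along the true row
g = list(knn.classes_).index("Gryffindor")
tp = confusion_matrix_train[g, g]
precision_g = tp / confusion_matrix_train[:, g].sum()
recall_g = tp / confusion_matrix_train[g, :].sum()
precision_g, recall_g   # roughly 0.94 and 0.94, matching the report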

3.3 Validation set prediction¶

Confusion matrix for the validation set.

In [37]:
confusion_matrix_valid = confusion_matrix(valid_y, valid_y_pred)
confusion_matrix_valid
Out[37]:
array([[63,  6,  4,  5],
       [10, 65,  0,  1],
       [ 5,  3, 48,  3],
       [ 7,  9,  4, 61]], dtype=int64)

A confusion matrix that's easier to read

In [38]:
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt

confusion_matrix_valid_display = ConfusionMatrixDisplay(confusion_matrix_valid, display_labels = knn.classes_)
confusion_matrix_valid_display.plot()
plt.grid(False)
In [39]:
accuracy_score(valid_y, valid_y_pred)
Out[39]:
0.8061224489795918
In [40]:
print(classification_report(valid_y, valid_y_pred))
              precision    recall  f1-score   support

  Gryffindor       0.74      0.81      0.77        78
  Hufflepuff       0.78      0.86      0.82        76
   Ravenclaw       0.86      0.81      0.83        59
   Slytherin       0.87      0.75      0.81        81

    accuracy                           0.81       294
   macro avg       0.81      0.81      0.81       294
weighted avg       0.81      0.81      0.81       294

4. New padawan¶

Create the new data point from a dictionary of scores.

In [41]:
data = {"Manipulative": 8, "Resourceful": 9, "Dismissive" : 8, "Intelligent": 6, "Trusting": 6, "Loyal": 8, 
        "Stubborn": 6, "Brave": 6}

new_padawan = pd.DataFrame(data, index = [0])
new_padawan
Out[41]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
0 8 9 8 6 6 8 6 6
In [42]:
# Alternatively, use a list
pd.DataFrame([data])
Out[42]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
0 8 9 8 6 6 8 6 6

Normalise.

In [43]:
new_padawan_norm = pd.DataFrame(scaler.transform(new_padawan[predictors]),
                              columns = predictors)

new_padawan_norm
Out[43]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
0 0.934976 1.253648 0.92364 0.099603 0.052115 0.917875 0.071144 0.098759
In [44]:
%%capture --no-display

new_padawan_pred = knn.predict(new_padawan_norm)
new_padawan_pred
Out[44]:
array(['Slytherin'], dtype=object)
In [45]:
new_padawan_pred_prob = knn.predict_proba(new_padawan_norm)
new_padawan_pred_prob
Out[45]:
array([[0.        , 0.        , 0.33333333, 0.66666667]])
In [46]:
# Use the classes

pd.DataFrame(new_padawan_pred_prob,
            columns = knn.classes_)
Out[46]:
Gryffindor Hufflepuff Ravenclaw Slytherin
0 0.0 0.0 0.333333 0.666667
In [47]:
# Or specify the columns
pd.DataFrame(new_padawan_pred_prob,
            columns = ["Gryffindor", "Hufflepuff", "Ravenclaw", "Slytherin"])
Out[47]:
Gryffindor Hufflepuff Ravenclaw Slytherin
0 0.0 0.0 0.333333 0.666667

slytherin_py.png

5. Easier setup¶

5.1 Recode into 2 classes¶

In [48]:
train_X2 = train_X.copy() # for consistency
valid_X2 = valid_X.copy() # for consistency
train_y2 = train_y.copy()
valid_y2 = valid_y.copy()
In [49]:
train_X2.head()
Out[49]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
650 -0.333440 -0.497479 -0.365160 1.441191 1.350061 -0.708973 1.357574 0.950795
479 -1.179050 0.378084 -0.365160 -1.241985 0.484763 0.104451 -0.357667 0.524777
271 -1.179050 0.815866 1.353239 0.546799 -1.245832 -1.929110 -1.644097 -1.179296
647 -0.333440 -0.059698 -1.653959 0.993995 0.052115 0.917875 1.357574 0.950795
307 0.089366 0.378084 1.782839 0.099603 0.484763 -1.929110 -0.786477 1.376814
In [50]:
valid_X2.head()
Out[50]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
389 -0.756245 -1.810824 -0.794760 -0.347593 0.917412 0.104451 0.499954 0.098759
131 0.512171 0.378084 0.923640 1.441191 -1.245832 -0.708973 -0.357667 -1.179296
657 0.512171 -2.248606 0.923640 -0.347593 0.917412 0.104451 0.928764 0.950795
421 -0.756245 1.253648 -1.653959 -0.794789 0.484763 0.917875 0.071144 1.376814
160 0.089366 1.253648 -0.365160 -1.689181 -0.813183 -0.708973 -2.072907 -0.753278
In [51]:
train_y2.value_counts()
Out[51]:
Slytherin     120
Hufflepuff    113
Gryffindor    110
Ravenclaw      97
Name: House, dtype: int64
In [52]:
valid_y2.value_counts()
Out[52]:
Slytherin     81
Gryffindor    78
Hufflepuff    76
Ravenclaw     59
Name: House, dtype: int64
In [53]:
House_values = ["Gryffindor", "Hufflepuff", "Ravenclaw", "Slytherin" ]
House_2_values = ["Not Slytherin", "Not Slytherin", "Not Slytherin", "Slytherin"]
In [54]:
train_y2 = train_y.replace(House_values, House_2_values)
train_y2
Out[54]:
650    Not Slytherin
479    Not Slytherin
271    Not Slytherin
647    Not Slytherin
307    Not Slytherin
           ...      
445    Not Slytherin
414    Not Slytherin
70         Slytherin
429    Not Slytherin
236    Not Slytherin
Name: House, Length: 440, dtype: object
In [55]:
train_y2.head()
Out[55]:
650    Not Slytherin
479    Not Slytherin
271    Not Slytherin
647    Not Slytherin
307    Not Slytherin
Name: House, dtype: object
In [56]:
train_y2.value_counts()
Out[56]:
Not Slytherin    320
Slytherin        120
Name: House, dtype: int64
In [57]:
valid_y2 = valid_y.replace(House_values, House_2_values)
valid_y2
Out[57]:
389    Not Slytherin
131        Slytherin
657    Not Slytherin
421    Not Slytherin
160        Slytherin
           ...      
49         Slytherin
53         Slytherin
569    Not Slytherin
700    Not Slytherin
312    Not Slytherin
Name: House, Length: 294, dtype: object
In [58]:
valid_y2.head()
Out[58]:
389    Not Slytherin
131        Slytherin
657    Not Slytherin
421    Not Slytherin
160        Slytherin
Name: House, dtype: object
In [59]:
valid_y2.value_counts()
Out[59]:
Not Slytherin    213
Slytherin         81
Name: House, dtype: int64
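The same two-class recoding can be written without the two lists by testing directly for Slytherin; a sketch using np.where, which gives identical Series (the _alt names are only for illustration):

# Equivalent recode: everything that is not Slytherin becomes "Not Slytherin"
train_y2_alt = pd.Series(np.where(train_y == "Slytherin", "Slytherin", "Not Slytherin"),
                         index = train_y.index, name = "House")
valid_y2_alt = pd.Series(np.where(valid_y == "Slytherin", "Slytherin", "Not Slytherin"),
                         index = valid_y.index, name = "House")
train_y2_alt.value_counts()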

5.2 kNN¶

In [60]:
%%capture --no-display
result_2 = []
for k in range(3, 11):
    knn_loop = KNeighborsClassifier(n_neighbors = k)
    knn_loop.fit(train_X2, train_y2)
    result_2.append({
        "k_value_in_knn_2": k,
        "accuracy_value": accuracy_score(train_y2, knn_loop.predict(train_X2))
    })

result_2 = pd.DataFrame(result_2)
result_2
Out[60]:
k_value_in_knn_2 accuracy_value
0 3 0.945455
1 4 0.931818
2 5 0.940909
3 6 0.925000
4 7 0.931818
5 8 0.927273
6 9 0.927273
7 10 0.925000

5.2.1 kNN k = 3¶

In [61]:
%%capture --no-display

knn2_k3 = KNeighborsClassifier(n_neighbors = 3)
knn2_k3.fit(train_X2, train_y2)
accuracy_score(train_y2, knn2_k3.predict(train_X2))
Out[61]:
0.9454545454545454

Get training set prediction.

In [62]:
%%capture --no-display

train_y2_pred_k3 = knn2_k3.predict(train_X2)
train_y2_pred_k3[:20]
Out[62]:
array(['Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin'],
      dtype=object)
In [63]:
train_y2_pred_prob2_k3 = knn2_k3.predict_proba(train_X2)
train_y2_pred_prob2_k3[:20]
Out[63]:
array([[1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.66666667, 0.33333333],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [0.33333333, 0.66666667],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.66666667, 0.33333333],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.66666667, 0.33333333],
       [0.66666667, 0.33333333],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.66666667, 0.33333333]])
In [64]:
pd.DataFrame(train_y2_pred_prob2_k3, columns = knn2_k3.classes_)
Out[64]:
Not Slytherin Slytherin
0 1.000000 0.000000
1 1.000000 0.000000
2 1.000000 0.000000
3 1.000000 0.000000
4 1.000000 0.000000
... ... ...
435 0.666667 0.333333
436 1.000000 0.000000
437 0.000000 1.000000
438 1.000000 0.000000
439 1.000000 0.000000

440 rows × 2 columns

Get validation set prediction.

In [65]:
%%capture --no-display

valid_y2_pred_k3 = knn2_k3.predict(valid_X2)
valid_y2_pred_k3[:20]
Out[65]:
array(['Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin'],
      dtype=object)
In [66]:
valid_y2_pred_prob2_k3 = knn2_k3.predict_proba(valid_X2)
valid_y2_pred_prob2_k3[:20]
Out[66]:
array([[1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.33333333, 0.66666667],
       [0.33333333, 0.66666667],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.33333333, 0.66666667],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [0.33333333, 0.66666667]])
In [67]:
pd.DataFrame(valid_y2_pred_prob2_k3, columns = knn2_k3.classes_)
Out[67]:
Not Slytherin Slytherin
0 1.000000 0.000000
1 1.000000 0.000000
2 1.000000 0.000000
3 1.000000 0.000000
4 0.000000 1.000000
... ... ...
289 0.666667 0.333333
290 0.000000 1.000000
291 1.000000 0.000000
292 1.000000 0.000000
293 1.000000 0.000000

294 rows × 2 columns

5.2.2 kNN k = 5¶

In [68]:
%%capture --no-display

knn2_k5 = KNeighborsClassifier(n_neighbors = 5)
knn2_k5.fit(train_X2, train_y2)
accuracy_score(train_y2, knn2_k5.predict(train_X2))
Out[68]:
0.9409090909090909

Get training set prediction.

In [69]:
%%capture --no-display

train_y2_pred_k5 = knn2_k5.predict(train_X2)
train_y2_pred_k5[:20]
Out[69]:
array(['Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin'],
      dtype=object)
In [70]:
train_y2_pred_prob2_k5 = knn2_k5.predict_proba(train_X2)
train_y2_pred_prob2_k5[:20]
Out[70]:
array([[1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [0.8, 0.2],
       [0. , 1. ],
       [0.8, 0.2],
       [0.2, 0.8],
       [1. , 0. ],
       [0. , 1. ],
       [0.4, 0.6],
       [0.8, 0.2],
       [1. , 0. ],
       [0.6, 0.4],
       [0.8, 0.2],
       [1. , 0. ],
       [1. , 0. ],
       [0.8, 0.2]])
In [71]:
pd.DataFrame(train_y2_pred_prob2_k5, columns = knn2_k5.classes_)
Out[71]:
Not Slytherin Slytherin
0 1.0 0.0
1 1.0 0.0
2 1.0 0.0
3 1.0 0.0
4 1.0 0.0
... ... ...
435 0.8 0.2
436 0.8 0.2
437 0.2 0.8
438 0.8 0.2
439 1.0 0.0

440 rows × 2 columns

Get validation set prediction.

In [72]:
%%capture --no-display

valid_y2_pred_k5 = knn2_k5.predict(valid_X2)
valid_y2_pred_k5[:20]
Out[72]:
array(['Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin'],
      dtype=object)
In [73]:
valid_y2_pred_prob2_k5 = knn2_k5.predict_proba(valid_X2)
valid_y2_pred_prob2_k5[:20]
Out[73]:
array([[1. , 0. ],
       [0.8, 0.2],
       [1. , 0. ],
       [1. , 0. ],
       [0. , 1. ],
       [0.2, 0.8],
       [0.2, 0.8],
       [1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [0.2, 0.8],
       [1. , 0. ],
       [1. , 0. ],
       [0.8, 0.2],
       [1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [0.2, 0.8],
       [0.8, 0.2],
       [0.2, 0.8]])
In [74]:
pd.DataFrame(valid_y2_pred_prob2_k5, columns = knn2_k5.classes_)
Out[74]:
Not Slytherin Slytherin
0 1.0 0.0
1 0.8 0.2
2 1.0 0.0
3 1.0 0.0
4 0.0 1.0
... ... ...
289 0.4 0.6
290 0.0 1.0
291 1.0 0.0
292 1.0 0.0
293 1.0 0.0

294 rows × 2 columns

5.3 Confusion matrix¶

5.3.1 Confusion matrix k = 3¶

Training set.

In [75]:
confusion_matrix_train2_k3 = confusion_matrix(train_y2, train_y2_pred_k3)
confusion_matrix_train2_k3
Out[75]:
array([[309,  11],
       [ 13, 107]], dtype=int64)
In [76]:
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt

confusion_matrix_train2_k3_display = ConfusionMatrixDisplay(confusion_matrix_train2_k3, display_labels = knn2_k3.classes_)
confusion_matrix_train2_k3_display.plot()
plt.grid(False)
In [77]:
accuracy_score(train_y2, train_y2_pred_k3)
Out[77]:
0.9454545454545454
In [78]:
print(classification_report(train_y2, train_y2_pred_k3))
               precision    recall  f1-score   support

Not Slytherin       0.96      0.97      0.96       320
    Slytherin       0.91      0.89      0.90       120

     accuracy                           0.95       440
    macro avg       0.93      0.93      0.93       440
 weighted avg       0.95      0.95      0.95       440

Validation set.

In [79]:
confusion_matrix_valid2_k3 = confusion_matrix(valid_y2, valid_y2_pred_k3)
confusion_matrix_valid2_k3
Out[79]:
array([[204,   9],
       [ 20,  61]], dtype=int64)
In [80]:
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt

confusion_matrix_valid2_k3_display = ConfusionMatrixDisplay(confusion_matrix_valid2_k3, display_labels = knn2_k3.classes_)
confusion_matrix_valid2_k3_display.plot()
plt.grid(False)
In [81]:
accuracy_score(valid_y2, valid_y2_pred_k3)
Out[81]:
0.9013605442176871
In [82]:
print(classification_report(valid_y2, valid_y2_pred_k3))
               precision    recall  f1-score   support

Not Slytherin       0.91      0.96      0.93       213
    Slytherin       0.87      0.75      0.81        81

     accuracy                           0.90       294
    macro avg       0.89      0.86      0.87       294
 weighted avg       0.90      0.90      0.90       294

5.3.2 Confusion matrix k = 5¶

In [83]:
confusion_matrix_train2_k5 = confusion_matrix(train_y2, train_y2_pred_k5)
confusion_matrix_train2_k5
Out[83]:
array([[311,   9],
       [ 17, 103]], dtype=int64)
In [84]:
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt

confusion_matrix_train2_k5_display = ConfusionMatrixDisplay(confusion_matrix_train2_k5, display_labels = knn2_k5.classes_)
confusion_matrix_train2_k5_display.plot()
plt.grid(False)
In [85]:
accuracy_score(train_y2, train_y2_pred_k5)
Out[85]:
0.9409090909090909
In [86]:
print(classification_report(train_y2, train_y2_pred_k5))
               precision    recall  f1-score   support

Not Slytherin       0.95      0.97      0.96       320
    Slytherin       0.92      0.86      0.89       120

     accuracy                           0.94       440
    macro avg       0.93      0.92      0.92       440
 weighted avg       0.94      0.94      0.94       440

Validation set.

In [87]:
confusion_matrix_valid2_k5 = confusion_matrix(valid_y2, valid_y2_pred_k5)
confusion_matrix_valid2_k5
Out[87]:
array([[204,   9],
       [ 14,  67]], dtype=int64)
In [88]:
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt

confusion_matrix_valid2_k5_display = ConfusionMatrixDisplay(confusion_matrix_valid2_k5, display_labels = knn2_k5.classes_)
confusion_matrix_valid2_k5_display.plot()
plt.grid(False)
In [89]:
accuracy_score(valid_y2, valid_y2_pred_k5)
Out[89]:
0.9217687074829932
In [90]:
print(classification_report(valid_y2, valid_y2_pred_k5))
               precision    recall  f1-score   support

Not Slytherin       0.94      0.96      0.95       213
    Slytherin       0.88      0.83      0.85        81

     accuracy                           0.92       294
    macro avg       0.91      0.89      0.90       294
 weighted avg       0.92      0.92      0.92       294

Change the classification cutoff from the default 0.5 (the majority vote, labelled pred_50 below) to 0.7.

In [91]:
train_y2_pred_k5
Out[91]:
array(['Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin'],
      dtype=object)
In [92]:
train_y2_pred_k5_df = pd.DataFrame(train_y2_pred_k5, columns = ["pred_50"])
train_y2_pred_k5_df
Out[92]:
pred_50
0 Not Slytherin
1 Not Slytherin
2 Not Slytherin
3 Not Slytherin
4 Not Slytherin
... ...
435 Not Slytherin
436 Not Slytherin
437 Slytherin
438 Not Slytherin
439 Not Slytherin

440 rows × 1 columns

In [93]:
train_y2_pred_prob2_k5_df = pd.DataFrame(train_y2_pred_prob2_k5, columns = knn2_k5.classes_)
train_y2_pred_prob2_k5_df
Out[93]:
Not Slytherin Slytherin
0 1.0 0.0
1 1.0 0.0
2 1.0 0.0
3 1.0 0.0
4 1.0 0.0
... ... ...
435 0.8 0.2
436 0.8 0.2
437 0.2 0.8
438 0.8 0.2
439 1.0 0.0

440 rows × 2 columns

In [94]:
def cutoff70(prob):
    if prob < 0.7:
        return "Not Slytherin"
    elif 0.7 <= prob:
        return "Slytherin"

train_y2_pred_k5_df["pred_70"] = train_y2_pred_prob2_k5_df["Slytherin"].apply(cutoff70)
train_y2_pred_k5_df
Out[94]:
pred_50 pred_70
0 Not Slytherin Not Slytherin
1 Not Slytherin Not Slytherin
2 Not Slytherin Not Slytherin
3 Not Slytherin Not Slytherin
4 Not Slytherin Not Slytherin
... ... ...
435 Not Slytherin Not Slytherin
436 Not Slytherin Not Slytherin
437 Slytherin Slytherin
438 Not Slytherin Not Slytherin
439 Not Slytherin Not Slytherin

440 rows × 2 columns
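The apply/cutoff70 pattern can also be vectorised with np.where, the same idiom used for the new padawan later on; a sketch that produces an identical pred_70 column:

# Vectorised equivalent of cutoff70: >= 0.7 on the Slytherin probability
train_y2_pred_k5_df["pred_70"] = np.where(train_y2_pred_prob2_k5_df["Slytherin"] >= 0.7,
                                          "Slytherin", "Not Slytherin")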

In [95]:
confusion_matrix_train2_k5_70 = confusion_matrix(train_y2, train_y2_pred_k5_df["pred_70"])
confusion_matrix_train2_k5_70
Out[95]:
array([[317,   3],
       [ 40,  80]], dtype=int64)
In [96]:
confusion_matrix_train2_k5_70_display = ConfusionMatrixDisplay(confusion_matrix_train2_k5_70, 
                                                               display_labels = knn2_k5.classes_)
confusion_matrix_train2_k5_70_display.plot()
plt.grid(False)
In [97]:
valid_y2_pred_k5
Out[97]:
array(['Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin'], dtype=object)
In [98]:
valid_y2_pred_k5_df = pd.DataFrame(valid_y2_pred_k5, columns = ["pred_50"])
valid_y2_pred_k5_df
Out[98]:
pred_50
0 Not Slytherin
1 Not Slytherin
2 Not Slytherin
3 Not Slytherin
4 Slytherin
... ...
289 Slytherin
290 Slytherin
291 Not Slytherin
292 Not Slytherin
293 Not Slytherin

294 rows × 1 columns

In [99]:
valid_y2_pred_prob2_k5_df = pd.DataFrame(valid_y2_pred_prob2_k5, columns = knn2_k5.classes_)
valid_y2_pred_prob2_k5_df
Out[99]:
Not Slytherin Slytherin
0 1.0 0.0
1 0.8 0.2
2 1.0 0.0
3 1.0 0.0
4 0.0 1.0
... ... ...
289 0.4 0.6
290 0.0 1.0
291 1.0 0.0
292 1.0 0.0
293 1.0 0.0

294 rows × 2 columns

In [100]:
def cutoff70(prob):
    if prob < 0.7:
        return "Not Slytherin"
    elif 0.7 <= prob:
        return "Slytherin"

valid_y2_pred_k5_df["pred_70"] = valid_y2_pred_prob2_k5_df["Slytherin"].apply(cutoff70)
valid_y2_pred_k5_df
Out[100]:
pred_50 pred_70
0 Not Slytherin Not Slytherin
1 Not Slytherin Not Slytherin
2 Not Slytherin Not Slytherin
3 Not Slytherin Not Slytherin
4 Slytherin Slytherin
... ... ...
289 Slytherin Not Slytherin
290 Slytherin Slytherin
291 Not Slytherin Not Slytherin
292 Not Slytherin Not Slytherin
293 Not Slytherin Not Slytherin

294 rows × 2 columns

In [101]:
confusion_matrix_valid2_k5_70 = confusion_matrix(valid_y2, valid_y2_pred_k5_df["pred_70"])
confusion_matrix_valid2_k5_70
Out[101]:
array([[211,   2],
       [ 30,  51]], dtype=int64)
In [102]:
confusion_matrix_valid2_k5_70_display = ConfusionMatrixDisplay(confusion_matrix_valid2_k5_70, 
                                                               display_labels = knn2_k5.classes_)
confusion_matrix_valid2_k5_70_display.plot()
plt.grid(False)
In [103]:
print(classification_report(valid_y2, valid_y2_pred_k5_df["pred_70"]))
               precision    recall  f1-score   support

Not Slytherin       0.88      0.99      0.93       213
    Slytherin       0.96      0.63      0.76        81

     accuracy                           0.89       294
    macro avg       0.92      0.81      0.85       294
 weighted avg       0.90      0.89      0.88       294

5.4 ROC¶

In [104]:
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

5.4.1 k = 3¶

In [105]:
# check again

pd.DataFrame(valid_y2_pred_prob2_k3, columns = knn2_k3.classes_)
Out[105]:
Not Slytherin Slytherin
0 1.000000 0.000000
1 1.000000 0.000000
2 1.000000 0.000000
3 1.000000 0.000000
4 0.000000 1.000000
... ... ...
289 0.666667 0.333333
290 0.000000 1.000000
291 1.000000 0.000000
292 1.000000 0.000000
293 1.000000 0.000000

294 rows × 2 columns

Set positive class

In [106]:
fpr1, tpr1, threshold1 = metrics.roc_curve(valid_y2,  valid_y2_pred_prob2_k3[:,1], pos_label = "Slytherin")
In [107]:
import matplotlib.pyplot as plt
plt.style.use("seaborn")

plt.plot(fpr1, tpr1, linestyle = "-", color = "green", label = "Slytherin k = 3")

# roc curve for tpr = fpr (random line) 
random_probs = [0 for i in range(len(valid_y2))]

p_fpr1, p_tpr1, _ = roc_curve(valid_y2, random_probs, pos_label = "Slytherin")

plt.plot(p_fpr1, p_tpr1, linestyle = "--", color = "black", label = "Random Force")

# If desired
plt.legend()

plt.title("Sorting Hat ROC")

plt.xlabel("False Positive Rate")

plt.ylabel("True Positive Rate")

# to save the plot
# plt.savefig("whatever_name",dpi = 300)
Out[107]:
Text(0, 0.5, 'True Positive Rate')
In [108]:
from sklearn.metrics import roc_auc_score
auc_score1 = roc_auc_score(valid_y2, valid_y2_pred_prob2_k3[:,1])
auc_score1
Out[108]:
0.9335188083231901

5.4.2 k = 5¶

In [109]:
# check again

pd.DataFrame(valid_y2_pred_prob2_k5, columns = knn2_k5.classes_)
Out[109]:
Not Slytherin Slytherin
0 1.0 0.0
1 0.8 0.2
2 1.0 0.0
3 1.0 0.0
4 0.0 1.0
... ... ...
289 0.4 0.6
290 0.0 1.0
291 1.0 0.0
292 1.0 0.0
293 1.0 0.0

294 rows × 2 columns

Set positive class

In [110]:
fpr2, tpr2, threshold2 = metrics.roc_curve(valid_y2,  valid_y2_pred_prob2_k5[:,1], pos_label = "Slytherin")
In [111]:
import matplotlib.pyplot as plt
plt.style.use("seaborn")

plt.plot(fpr2, tpr2, linestyle = "-", color = "blue", label = "Slytherin k = 5")

# roc curve for tpr = fpr (random line) 
random_probs = [0 for i in range(len(valid_y2))]

p_fpr2, p_tpr2, _ = roc_curve(valid_y2, random_probs, pos_label = "Slytherin")

plt.plot(p_fpr2, p_tpr2, linestyle = "--", color = "black", label = "Random Force")

# If desired
plt.legend()

plt.title("Sorting Hat ROC")

plt.xlabel("False Positive Rate")

plt.ylabel("True Positive Rate")

# to save the plot
# plt.savefig("whatever_name",dpi = 300)
Out[111]:
Text(0, 0.5, 'True Positive Rate')
In [112]:
auc_score2 = roc_auc_score(valid_y2, valid_y2_pred_prob2_k5[:,1])
auc_score2
Out[112]:
0.966962267431751

5.4.3 Combined plot¶

In [113]:
import matplotlib.pyplot as plt
plt.style.use("seaborn")

plt.plot(fpr1, tpr1, linestyle = "-", color = "green", label = "Slytherin k = 3")
plt.plot(fpr2, tpr2, linestyle = "-", color = "blue", label = "Slytherin k = 5")

# roc curve for tpr = fpr (random line) 
random_probs = [0 for i in range(len(valid_y2))]

p_fpr_random, p_tpr_random, _ = roc_curve(valid_y2, random_probs, pos_label = "Slytherin")

plt.plot(p_fpr_random, p_tpr_random, linestyle = "--", color = "black", label = "Random Force")

# If desired
plt.legend()

plt.title("Sorting Hat ROC")

plt.xlabel("False Positive Rate")

plt.ylabel("True Positive Rate");

# to save the plot
# plt.savefig("whatever_name",dpi = 300)
In [114]:
print(auc_score1, auc_score2)
0.9335188083231901 0.966962267431751
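On scikit-learn 1.0 or newer, RocCurveDisplay.from_predictions can draw the same two curves, with the AUC values in the legend, in fewer lines; a sketch:

from sklearn.metrics import RocCurveDisplay

ax = plt.gca()
RocCurveDisplay.from_predictions(valid_y2, valid_y2_pred_prob2_k3[:, 1],
                                 pos_label = "Slytherin", name = "Slytherin k = 3", ax = ax)
RocCurveDisplay.from_predictions(valid_y2, valid_y2_pred_prob2_k5[:, 1],
                                 pos_label = "Slytherin", name = "Slytherin k = 5", ax = ax)
ax.set_title("Sorting Hat ROC")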

5.5 New padawan¶

5.5.1 Predict using k = 3¶

In [115]:
%%capture --no-display

new_padawan_pred2_k3 = knn2_k3.predict(new_padawan_norm)
new_padawan_pred2_k3
Out[115]:
array(['Slytherin'], dtype=object)
In [116]:
new_padawan_pred_prob2_k3 = knn2_k3.predict_proba(new_padawan_norm)
new_padawan_pred_prob2_k3
Out[116]:
array([[0.33333333, 0.66666667]])
In [117]:
pd.DataFrame(new_padawan_pred_prob2_k3,
            columns = knn2_k3.classes_)
Out[117]:
Not Slytherin Slytherin
0 0.333333 0.666667

5.5.2 Predict using k = 5¶

In [118]:
%%capture --no-display

new_padawan_pred2_k5 = knn2_k5.predict(new_padawan_norm)
new_padawan_pred2_k5
Out[118]:
array(['Slytherin'], dtype=object)
In [119]:
new_padawan_pred_prob2_k5 = knn2_k5.predict_proba(new_padawan_norm)
new_padawan_pred_prob2_k5
Out[119]:
array([[0.2, 0.8]])
In [120]:
new_padawan_pred_prob2_k5_df = pd.DataFrame(new_padawan_pred_prob2_k5,
                                            columns = knn2_k5.classes_)
new_padawan_pred_prob2_k5_df
Out[120]:
Not Slytherin Slytherin
0 0.2 0.8

Change the cutoff to 0.7 for the new padawan as well.

In [121]:
if new_padawan_pred_prob2_k5[0][1] > 0.7:
    print("A Slytherin, you are")
else:
    print("A Slytherin, you are not")
A Slytherin, you are
In [122]:
import numpy as np
new_padawan_pred_prob2_k5_df["New_Prediction_70"] = np.where(new_padawan_pred_prob2_k5_df["Slytherin"] >= 0.7, 
                                                     "Slytherin", "Not Slytherin")
new_padawan_pred_prob2_k5_df.head()
Out[122]:
Not Slytherin Slytherin New_Prediction_70
0 0.2 0.8 Slytherin

Still a Slytherin, you are :-)

slytherin_py.png

5.6 Optimal cutoff¶

Find the cutoff that maximises Youden's J statistic (TPR - FPR). For illustration, the cutoff is taken from the k = 3 ROC curve computed earlier (fpr1, tpr1, threshold1) and applied to the k = 5 predictions.

In [123]:
# Get the optimal index position in the array
optimal_index = np.argmax(tpr1 - fpr1)
print(optimal_index)
3
In [124]:
optimal_cutoff = threshold1[optimal_index]
optimal_cutoff
Out[124]:
0.3333333333333333
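To see what np.argmax is choosing between, the candidate cutoffs from the k = 3 curve can be laid out with their Youden's J values (J = TPR - FPR); a sketch:

# Each ROC threshold with its true/false positive rates and J statistic
pd.DataFrame({"threshold": threshold1, "fpr": fpr1, "tpr": tpr1, "J": tpr1 - fpr1})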
In [125]:
def cutoff_optimal(prob):
    if prob < optimal_cutoff:
        return "Not Slytherin"
    elif optimal_cutoff <= prob:
        return "Slytherin"

train_y2_pred_k5_df["pred_optimal"] = train_y2_pred_prob2_k5_df["Slytherin"].apply(cutoff_optimal)
train_y2_pred_k5_df
Out[125]:
pred_50 pred_70 pred_optimal
0 Not Slytherin Not Slytherin Not Slytherin
1 Not Slytherin Not Slytherin Not Slytherin
2 Not Slytherin Not Slytherin Not Slytherin
3 Not Slytherin Not Slytherin Not Slytherin
4 Not Slytherin Not Slytherin Not Slytherin
... ... ... ...
435 Not Slytherin Not Slytherin Not Slytherin
436 Not Slytherin Not Slytherin Not Slytherin
437 Slytherin Slytherin Slytherin
438 Not Slytherin Not Slytherin Not Slytherin
439 Not Slytherin Not Slytherin Not Slytherin

440 rows × 3 columns

In [126]:
confusion_matrix_train2_k5_optimal = confusion_matrix(train_y2, train_y2_pred_k5_df["pred_optimal"])
confusion_matrix_train2_k5_optimal
Out[126]:
array([[289,  31],
       [  1, 119]], dtype=int64)
In [127]:
confusion_matrix_train2_k5_optimal_display = ConfusionMatrixDisplay(confusion_matrix_train2_k5_optimal, 
                                                               display_labels = knn2_k5.classes_)
confusion_matrix_train2_k5_optimal_display.plot()
plt.grid(False)
In [128]:
print(classification_report(train_y2, train_y2_pred_k5_df["pred_optimal"]))
               precision    recall  f1-score   support

Not Slytherin       1.00      0.90      0.95       320
    Slytherin       0.79      0.99      0.88       120

     accuracy                           0.93       440
    macro avg       0.89      0.95      0.91       440
 weighted avg       0.94      0.93      0.93       440

In [129]:
#def cutoff_optimal(prob):
#    if prob < optimal_cutoff:
#        return "Not Slytherin"
#    elif optimal_cutoff <= prob:
#        return "Slytherin"

valid_y2_pred_k5_df["pred_optimal"] = valid_y2_pred_prob2_k5_df["Slytherin"].apply(cutoff_optimal)
valid_y2_pred_k5_df
Out[129]:
pred_50 pred_70 pred_optimal
0 Not Slytherin Not Slytherin Not Slytherin
1 Not Slytherin Not Slytherin Not Slytherin
2 Not Slytherin Not Slytherin Not Slytherin
3 Not Slytherin Not Slytherin Not Slytherin
4 Slytherin Slytherin Slytherin
... ... ... ...
289 Slytherin Not Slytherin Slytherin
290 Slytherin Slytherin Slytherin
291 Not Slytherin Not Slytherin Not Slytherin
292 Not Slytherin Not Slytherin Not Slytherin
293 Not Slytherin Not Slytherin Not Slytherin

294 rows × 3 columns

In [130]:
confusion_matrix_valid2_k5_optimal = confusion_matrix(valid_y2, valid_y2_pred_k5_df["pred_optimal"])
confusion_matrix_valid2_k5_optimal
Out[130]:
array([[195,  18],
       [  6,  75]], dtype=int64)
In [131]:
confusion_matrix_valid2_k5_optimal_display = ConfusionMatrixDisplay(confusion_matrix_valid2_k5_optimal, 
                                                               display_labels = knn2_k5.classes_)
confusion_matrix_valid2_k5_optimal_display.plot()
plt.grid(False)
In [132]:
print(classification_report(valid_y2, valid_y2_pred_k5_df["pred_optimal"]))
               precision    recall  f1-score   support

Not Slytherin       0.97      0.92      0.94       213
    Slytherin       0.81      0.93      0.86        81

     accuracy                           0.92       294
    macro avg       0.89      0.92      0.90       294
 weighted avg       0.93      0.92      0.92       294

In [133]:
%%capture --no-display

new_padawan_pred2_k5_v2 = knn2_k5.predict(new_padawan_norm)
new_padawan_pred2_k5_v2
Out[133]:
array(['Slytherin'], dtype=object)
In [134]:
new_padawan_pred_prob2_k5_v2 = knn2_k5.predict_proba(new_padawan_norm)
new_padawan_pred_prob2_k5_v2
Out[134]:
array([[0.2, 0.8]])
In [135]:
new_padawan_pred_prob2_k5_df_2 = pd.DataFrame(new_padawan_pred_prob2_k5_v2,
                                            columns = knn2_k5.classes_)
new_padawan_pred_prob2_k5_df_2
Out[135]:
Not Slytherin Slytherin
0 0.2 0.8
In [136]:
if new_padawan_pred_prob2_k5_v2[0][1] > optimal_cutoff:
    print("A Slytherin, you are")
else:
    print("A Slytherin, you are not")
A Slytherin, you are
In [137]:
import numpy as np
new_padawan_pred_prob2_k5_df_2["New_Optimal_Prediction"] = np.where(new_padawan_pred_prob2_k5_df_2["Slytherin"] >= optimal_cutoff, 
                                                     "Slytherin", "Not Slytherin")
new_padawan_pred_prob2_k5_df_2.head()
Out[137]:
Not Slytherin Slytherin New_Optimal_Prediction
0 0.2 0.8 Slytherin

slytherin_py.png