k Nearest Houses¶

hogwarts_py.jpeg

Download data


1. Load data¶

1.1 Libraries¶

In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier

1.2 Data¶

In [2]:
hogwarts = pd.read_csv("super_heroes_hogwarts_v3a.csv")
hogwarts.head()
Out[2]:
ID Name Gender Race Height Publisher Alignment Weight Manipulative Resourceful ... HouseID House STR DEX CON INT WIS CHA Level HP
0 A001 A-Bomb Male Human 203.0 Marvel Comics good 441.0 10 10 ... 1 Slytherin 18 11 17 12 13 11 1 7
1 A002 Abe Sapien Male Icthyo Sapien 191.0 Dark Horse Comics good 65.0 7 7 ... 1 Slytherin 16 17 10 13 15 11 8 72
2 A004 Abomination Male Human / Radiation 203.0 Marvel Comics bad 441.0 6 8 ... 1 Slytherin 13 14 13 10 18 15 15 135
3 A009 Agent 13 Female NaN 173.0 Marvel Comics good 61.0 7 7 ... 1 Slytherin 15 18 16 16 17 10 14 140
4 A015 Alex Mercer Male Human NaN Wildstorm bad NaN 10 6 ... 1 Slytherin 14 17 13 12 10 11 9 72

5 rows × 26 columns

In [3]:
hogwarts.columns.values.tolist()
Out[3]:
['ID',
 'Name',
 'Gender',
 'Race',
 'Height',
 'Publisher',
 'Alignment',
 'Weight',
 'Manipulative',
 'Resourceful',
 'Dismissive',
 'Intelligent',
 'Trusting',
 'Loyal',
 'Stubborn',
 'Brave',
 'HouseID',
 'House',
 'STR',
 'DEX',
 'CON',
 'INT',
 'WIS',
 'CHA',
 'Level',
 'HP']
In [4]:
pd.DataFrame(hogwarts.columns.values, columns = ["Variables"])
Out[4]:
Variables
0 ID
1 Name
2 Gender
3 Race
4 Height
5 Publisher
6 Alignment
7 Weight
8 Manipulative
9 Resourceful
10 Dismissive
11 Intelligent
12 Trusting
13 Loyal
14 Stubborn
15 Brave
16 HouseID
17 House
18 STR
19 DEX
20 CON
21 INT
22 WIS
23 CHA
24 Level
25 HP

Select only the required variables.

In [5]:
hogwarts_2 = hogwarts.loc[:, "Manipulative":"House"]
hogwarts_2.head()
Out[5]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave HouseID House
0 10 10 7 6 7 7 7 9 1 Slytherin
1 7 7 6 8 6 7 6 9 1 Slytherin
2 6 8 1 6 3 3 5 2 1 Slytherin
3 7 7 1 9 7 4 6 6 1 Slytherin
4 10 6 8 3 4 4 1 8 1 Slytherin
In [6]:
hogwarts_2 = hogwarts_2.drop(columns = ["HouseID"])
hogwarts_2.head()
Out[6]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave House
0 10 10 7 6 7 7 7 9 Slytherin
1 7 7 6 8 6 7 6 9 Slytherin
2 6 8 1 6 3 3 5 2 Slytherin
3 7 7 1 9 7 4 6 6 Slytherin
4 10 6 8 3 4 4 1 8 Slytherin

Above, loc selects a contiguous range of columns by label, and HouseID is then dropped. Alternatively, list the required columns explicitly with loc.

In [7]:
hogwarts.loc[:, ["Manipulative", "Resourceful", "Dismissive", "Intelligent", 
                 "Trusting", "Loyal", "Stubborn", "Brave", "House"]]
Out[7]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave House
0 10 10 7 6 7 7 7 9 Slytherin
1 7 7 6 8 6 7 6 9 Slytherin
2 6 8 1 6 3 3 5 2 Slytherin
3 7 7 1 9 7 4 6 6 Slytherin
4 10 6 8 3 4 4 1 8 Slytherin
... ... ... ... ... ... ... ... ... ...
729 2 3 3 5 7 8 8 8 Gryffindor
730 3 3 7 7 4 4 7 8 Gryffindor
731 6 6 2 8 7 3 8 6 Gryffindor
732 4 7 2 4 6 7 9 9 Gryffindor
733 3 7 7 3 8 4 9 6 Gryffindor

734 rows × 9 columns

Or select the columns by integer position with iloc (see the note on np.r_ after the output).

In [8]:
import numpy as np
hogwarts.iloc[:, np.r_[8:16, 17]]
Out[8]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave House
0 10 10 7 6 7 7 7 9 Slytherin
1 7 7 6 8 6 7 6 9 Slytherin
2 6 8 1 6 3 3 5 2 Slytherin
3 7 7 1 9 7 4 6 6 Slytherin
4 10 6 8 3 4 4 1 8 Slytherin
... ... ... ... ... ... ... ... ... ...
729 2 3 3 5 7 8 8 8 Gryffindor
730 3 3 7 7 4 4 7 8 Gryffindor
731 6 6 2 8 7 3 8 6 Gryffindor
732 4 7 2 4 6 7 9 9 Gryffindor
733 3 7 7 3 8 4 9 6 Gryffindor

734 rows × 9 columns
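np.r_ simply concatenates slices and single indices into one integer array, so the call above picks out columns 8 to 15 (Manipulative to Brave) plus column 17 (House). A quick check of what it builds:

np.r_[8:16, 17]
# array([ 8,  9, 10, 11, 12, 13, 14, 15, 17])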

2. Data partitioning¶

2.1 Training-Validation split¶

In [9]:
trainData, validData = train_test_split(hogwarts_2, test_size = 0.4, random_state = 666)
In [10]:
trainData.head()
Out[10]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave House
650 5 5 5 9 9 4 9 8 Gryffindor
479 3 7 5 3 7 6 5 7 Hufflepuff
271 3 8 9 7 3 1 2 3 Ravenclaw
647 5 6 2 8 6 8 9 8 Gryffindor
307 6 7 10 6 7 1 4 9 Ravenclaw
In [11]:
validData.head()
Out[11]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave House
389 4 2 4 5 8 6 7 6 Hufflepuff
131 7 7 8 9 3 4 5 3 Slytherin
657 7 1 8 5 8 6 8 8 Gryffindor
421 4 9 2 4 7 8 6 9 Hufflepuff
160 6 9 5 2 4 4 1 4 Slytherin
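train_test_split draws rows at random, so the house proportions can drift a little between the two sets. If that matters, the split can be stratified on House; a sketch, not used in the rest of this notebook, with illustrative variable names:

# Stratified alternative: keep the house proportions similar in both sets
trainData_s, validData_s = train_test_split(hogwarts_2, test_size = 0.4, random_state = 666,
                                            stratify = hogwarts_2["House"])
trainData_s["House"].value_counts()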

2.2 Normalisation¶

In [12]:
predictors = ["Manipulative", "Resourceful", "Dismissive", "Intelligent", "Trusting", "Loyal", "Stubborn", "Brave"]
outcome = "House"

Fit the normalisation model (the standard scaler) on the training set only, so information from the validation set does not leak into the scaling.

In [13]:
scaler = preprocessing.StandardScaler()
In [14]:
scaler.fit(trainData[predictors])
Out[14]:
StandardScaler()
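The fitted scaler stores the training-set mean and standard deviation of each predictor and standardises each value as z = (x - mean) / std. A quick way to inspect what it learned (a sketch):

# Means and standard deviations learned from the training set
pd.DataFrame({"mean": scaler.mean_, "std": scaler.scale_}, index = predictors)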

Transform the whole data set.

In [15]:
hogwarts_2_norm = pd.DataFrame(scaler.transform(hogwarts_2[predictors]),
                              columns = predictors)
hogwarts_2_norm
Out[15]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
0 1.780586 1.691429 0.494040 0.099603 0.484763 0.511163 0.499954 1.376814
1 0.512171 0.378084 0.064440 0.993995 0.052115 0.511163 0.071144 1.376814
2 0.089366 0.815866 -2.083559 0.099603 -1.245832 -1.115685 -0.357667 -1.605314
3 0.512171 0.378084 -2.083559 1.441191 0.484763 -0.708973 0.071144 0.098759
4 1.780586 -0.059698 0.923640 -1.241985 -0.813183 -0.708973 -2.072907 0.950795
... ... ... ... ... ... ... ... ...
729 -1.601855 -1.373043 -1.224360 -0.347593 0.484763 0.917875 0.928764 0.950795
730 -1.179050 -1.373043 0.494040 0.546799 -0.813183 -0.708973 0.499954 0.950795
731 0.089366 -0.059698 -1.653959 0.993995 0.484763 -1.115685 0.928764 0.098759
732 -0.756245 0.378084 -1.653959 -0.794789 0.052115 0.511163 1.357574 1.376814
733 -1.179050 0.378084 0.494040 -1.241985 0.917412 -0.708973 1.357574 0.098759

734 rows × 8 columns

In [16]:
# Or spell out the names
pd.DataFrame(scaler.transform(hogwarts_2[predictors]),
                              columns = ["Manipulative", "Resourceful", "Dismissive", "Intelligent", "Trusting", "Loyal", "Stubborn", "Brave"])
Out[16]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
0 1.780586 1.691429 0.494040 0.099603 0.484763 0.511163 0.499954 1.376814
1 0.512171 0.378084 0.064440 0.993995 0.052115 0.511163 0.071144 1.376814
2 0.089366 0.815866 -2.083559 0.099603 -1.245832 -1.115685 -0.357667 -1.605314
3 0.512171 0.378084 -2.083559 1.441191 0.484763 -0.708973 0.071144 0.098759
4 1.780586 -0.059698 0.923640 -1.241985 -0.813183 -0.708973 -2.072907 0.950795
... ... ... ... ... ... ... ... ...
729 -1.601855 -1.373043 -1.224360 -0.347593 0.484763 0.917875 0.928764 0.950795
730 -1.179050 -1.373043 0.494040 0.546799 -0.813183 -0.708973 0.499954 0.950795
731 0.089366 -0.059698 -1.653959 0.993995 0.484763 -1.115685 0.928764 0.098759
732 -0.756245 0.378084 -1.653959 -0.794789 0.052115 0.511163 1.357574 1.376814
733 -1.179050 0.378084 0.494040 -1.241985 0.917412 -0.708973 1.357574 0.098759

734 rows × 8 columns

2.3 Generate normalised training and validation sets¶

Concatenate with the target variable from the original set.

In [17]:
hogwarts_2_norm_full = pd.concat([hogwarts_2_norm, hogwarts_2["House"]], axis = 1)
hogwarts_2_norm_full
Out[17]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave House
0 1.780586 1.691429 0.494040 0.099603 0.484763 0.511163 0.499954 1.376814 Slytherin
1 0.512171 0.378084 0.064440 0.993995 0.052115 0.511163 0.071144 1.376814 Slytherin
2 0.089366 0.815866 -2.083559 0.099603 -1.245832 -1.115685 -0.357667 -1.605314 Slytherin
3 0.512171 0.378084 -2.083559 1.441191 0.484763 -0.708973 0.071144 0.098759 Slytherin
4 1.780586 -0.059698 0.923640 -1.241985 -0.813183 -0.708973 -2.072907 0.950795 Slytherin
... ... ... ... ... ... ... ... ... ...
729 -1.601855 -1.373043 -1.224360 -0.347593 0.484763 0.917875 0.928764 0.950795 Gryffindor
730 -1.179050 -1.373043 0.494040 0.546799 -0.813183 -0.708973 0.499954 0.950795 Gryffindor
731 0.089366 -0.059698 -1.653959 0.993995 0.484763 -1.115685 0.928764 0.098759 Gryffindor
732 -0.756245 0.378084 -1.653959 -0.794789 0.052115 0.511163 1.357574 1.376814 Gryffindor
733 -1.179050 0.378084 0.494040 -1.241985 0.917412 -0.708973 1.357574 0.098759 Gryffindor

734 rows × 9 columns

Split the normalised set using the indices of the original split

In [18]:
trainNorm = hogwarts_2_norm_full.iloc[trainData.index]
trainNorm.head()
Out[18]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave House
650 -0.333440 -0.497479 -0.365160 1.441191 1.350061 -0.708973 1.357574 0.950795 Gryffindor
479 -1.179050 0.378084 -0.365160 -1.241985 0.484763 0.104451 -0.357667 0.524777 Hufflepuff
271 -1.179050 0.815866 1.353239 0.546799 -1.245832 -1.929110 -1.644097 -1.179296 Ravenclaw
647 -0.333440 -0.059698 -1.653959 0.993995 0.052115 0.917875 1.357574 0.950795 Gryffindor
307 0.089366 0.378084 1.782839 0.099603 0.484763 -1.929110 -0.786477 1.376814 Ravenclaw
In [19]:
validNorm = hogwarts_2_norm_full.iloc[validData.index]
validNorm.head()
Out[19]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave House
389 -0.756245 -1.810824 -0.794760 -0.347593 0.917412 0.104451 0.499954 0.098759 Hufflepuff
131 0.512171 0.378084 0.923640 1.441191 -1.245832 -0.708973 -0.357667 -1.179296 Slytherin
657 0.512171 -2.248606 0.923640 -0.347593 0.917412 0.104451 0.928764 0.950795 Gryffindor
421 -0.756245 1.253648 -1.653959 -0.794789 0.484763 0.917875 0.071144 1.376814 Hufflepuff
160 0.089366 1.253648 -0.365160 -1.689181 -0.813183 -0.708973 -2.072907 -0.753278 Slytherin
In [20]:
train_X = trainNorm.loc[:, "Manipulative":"Brave"]
train_X
Out[20]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
650 -0.333440 -0.497479 -0.365160 1.441191 1.350061 -0.708973 1.357574 0.950795
479 -1.179050 0.378084 -0.365160 -1.241985 0.484763 0.104451 -0.357667 0.524777
271 -1.179050 0.815866 1.353239 0.546799 -1.245832 -1.929110 -1.644097 -1.179296
647 -0.333440 -0.059698 -1.653959 0.993995 0.052115 0.917875 1.357574 0.950795
307 0.089366 0.378084 1.782839 0.099603 0.484763 -1.929110 -0.786477 1.376814
... ... ... ... ... ... ... ... ...
445 0.089366 1.253648 -1.224360 -0.794789 0.917412 0.917875 -0.357667 1.376814
414 0.934976 -0.497479 0.064440 0.099603 0.052115 0.917875 -1.215287 -1.179296
70 1.357781 0.378084 -2.083559 1.441191 -0.813183 0.917875 -0.357667 0.950795
429 1.357781 -0.059698 1.353239 -1.241985 0.484763 0.511163 0.928764 0.524777
236 -0.333440 0.378084 0.923640 0.993995 0.917412 -0.708973 -1.215287 -1.605314

440 rows × 8 columns

In [21]:
# Alternatively
trainNorm.loc[:, predictors]
Out[21]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
650 -0.333440 -0.497479 -0.365160 1.441191 1.350061 -0.708973 1.357574 0.950795
479 -1.179050 0.378084 -0.365160 -1.241985 0.484763 0.104451 -0.357667 0.524777
271 -1.179050 0.815866 1.353239 0.546799 -1.245832 -1.929110 -1.644097 -1.179296
647 -0.333440 -0.059698 -1.653959 0.993995 0.052115 0.917875 1.357574 0.950795
307 0.089366 0.378084 1.782839 0.099603 0.484763 -1.929110 -0.786477 1.376814
... ... ... ... ... ... ... ... ...
445 0.089366 1.253648 -1.224360 -0.794789 0.917412 0.917875 -0.357667 1.376814
414 0.934976 -0.497479 0.064440 0.099603 0.052115 0.917875 -1.215287 -1.179296
70 1.357781 0.378084 -2.083559 1.441191 -0.813183 0.917875 -0.357667 0.950795
429 1.357781 -0.059698 1.353239 -1.241985 0.484763 0.511163 0.928764 0.524777
236 -0.333440 0.378084 0.923640 0.993995 0.917412 -0.708973 -1.215287 -1.605314

440 rows × 8 columns

In [22]:
train_y = trainNorm["House"]
train_y
Out[22]:
650    Gryffindor
479    Hufflepuff
271     Ravenclaw
647    Gryffindor
307     Ravenclaw
          ...    
445    Hufflepuff
414    Hufflepuff
70      Slytherin
429    Hufflepuff
236     Ravenclaw
Name: House, Length: 440, dtype: object
In [23]:
valid_X = validNorm.loc[:, "Manipulative":"Brave"]
valid_X
Out[23]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
389 -0.756245 -1.810824 -0.794760 -0.347593 0.917412 0.104451 0.499954 0.098759
131 0.512171 0.378084 0.923640 1.441191 -1.245832 -0.708973 -0.357667 -1.179296
657 0.512171 -2.248606 0.923640 -0.347593 0.917412 0.104451 0.928764 0.950795
421 -0.756245 1.253648 -1.653959 -0.794789 0.484763 0.917875 0.071144 1.376814
160 0.089366 1.253648 -0.365160 -1.689181 -0.813183 -0.708973 -2.072907 -0.753278
... ... ... ... ... ... ... ... ...
49 0.089366 0.378084 -1.224360 -1.241985 0.484763 -0.302261 -1.215287 -1.605314
53 1.357781 0.815866 0.064440 -1.689181 -0.813183 -1.929110 -0.357667 -0.753278
569 0.934976 -1.373043 0.064440 -1.689181 1.350061 1.324588 0.071144 0.950795
700 -2.024660 -0.059698 -0.365160 -0.347593 1.350061 -1.522398 0.499954 1.376814
312 0.089366 -0.935261 1.353239 0.546799 -1.245832 0.104451 -0.357667 -0.753278

294 rows × 8 columns

In [24]:
# Alternatively
validNorm.loc[:, predictors]
Out[24]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
389 -0.756245 -1.810824 -0.794760 -0.347593 0.917412 0.104451 0.499954 0.098759
131 0.512171 0.378084 0.923640 1.441191 -1.245832 -0.708973 -0.357667 -1.179296
657 0.512171 -2.248606 0.923640 -0.347593 0.917412 0.104451 0.928764 0.950795
421 -0.756245 1.253648 -1.653959 -0.794789 0.484763 0.917875 0.071144 1.376814
160 0.089366 1.253648 -0.365160 -1.689181 -0.813183 -0.708973 -2.072907 -0.753278
... ... ... ... ... ... ... ... ...
49 0.089366 0.378084 -1.224360 -1.241985 0.484763 -0.302261 -1.215287 -1.605314
53 1.357781 0.815866 0.064440 -1.689181 -0.813183 -1.929110 -0.357667 -0.753278
569 0.934976 -1.373043 0.064440 -1.689181 1.350061 1.324588 0.071144 0.950795
700 -2.024660 -0.059698 -0.365160 -0.347593 1.350061 -1.522398 0.499954 1.376814
312 0.089366 -0.935261 1.353239 0.546799 -1.245832 0.104451 -0.357667 -0.753278

294 rows × 8 columns

In [25]:
valid_y = validNorm["House"]
valid_y
Out[25]:
389    Hufflepuff
131     Slytherin
657    Gryffindor
421    Hufflepuff
160     Slytherin
          ...    
49      Slytherin
53      Slytherin
569    Gryffindor
700    Gryffindor
312     Ravenclaw
Name: House, Length: 294, dtype: object
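Transforming the whole data set and then re-splitting it by index works, but the same predictor and outcome objects can also be built by transforming trainData and validData directly with the fitted scaler. A sketch of the equivalent route (the _alt names are just for illustration):

# Equivalent: scale the two partitions separately with the scaler fitted on trainData
train_X_alt = pd.DataFrame(scaler.transform(trainData[predictors]),
                           columns = predictors, index = trainData.index)
valid_X_alt = pd.DataFrame(scaler.transform(validData[predictors]),
                           columns = predictors, index = validData.index)
train_y_alt = trainData[outcome]
valid_y_alt = validData[outcome]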

3. kNN¶

3.1 k = 3¶

In [26]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
In [27]:
%%capture --no-display
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(train_X, train_y)
accuracy_score(train_y, knn.predict(train_X))
Out[27]:
0.9
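With k = 3, each prediction is a majority vote among the three nearest training characters (Euclidean distance by default). The NearestNeighbors class imported at the top does this distance search on its own; the fitted classifier's kneighbors method exposes the same information. A sketch, looking at the first validation row:

# Distances and positions (within train_X) of the 3 nearest training rows
distances, positions = knn.kneighbors(valid_X.iloc[[0]], n_neighbors = 3)
# Map the positions back to the original row labels and look up the characters
hogwarts.loc[train_X.index[positions[0]], ["Name", "House"]]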

Alternatively, use a loop to find the best k.

In [28]:
%%capture --no-display
for k in range(3, 11):
    knn_loop = KNeighborsClassifier(n_neighbors = k)
    knn_loop.fit(train_X, train_y)
    accuracy = accuracy_score(train_y, knn_loop.predict(train_X))
    print(accuracy)

Or collect the results in a data frame, which is easier to read.

In [29]:
%%capture --no-display
result = []
for k in range(3, 11):
    knn_loop = KNeighborsClassifier(n_neighbors = k)
    knn_loop.fit(train_X, train_y)
    result.append({
        "k_value": k,
        "accuracy value": accuracy_score(train_y, knn_loop.predict(train_X))
    })

result = pd.DataFrame(result)
result
Out[29]:
k_value accuracy value
0 3 0.900000
1 4 0.886364
2 5 0.897727
3 6 0.877273
4 7 0.890909
5 8 0.881818
6 9 0.884091
7 10 0.868182
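Training-set accuracy tends to favour smaller k (with k = 1 it is usually perfect), so, if preferred, the same loop can score each candidate k on the validation set instead; a sketch reusing the objects above:

# Score each candidate k on the validation set rather than the training set
result_valid = []
for k in range(3, 11):
    knn_k = KNeighborsClassifier(n_neighbors = k)
    knn_k.fit(train_X, train_y)
    result_valid.append({
        "k_value": k,
        "validation accuracy": accuracy_score(valid_y, knn_k.predict(valid_X))
    })

pd.DataFrame(result_valid)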

Get training set predictions.

In [30]:
%%capture --no-display

train_y_pred = knn.predict(train_X)
train_y_pred[:20]
Out[30]:
array(['Gryffindor', 'Hufflepuff', 'Ravenclaw', 'Gryffindor', 'Ravenclaw',
       'Hufflepuff', 'Gryffindor', 'Slytherin', 'Hufflepuff', 'Slytherin',
       'Ravenclaw', 'Slytherin', 'Hufflepuff', 'Gryffindor', 'Hufflepuff',
       'Hufflepuff', 'Hufflepuff', 'Gryffindor', 'Gryffindor',
       'Gryffindor'], dtype=object)

Get validation set predictions.

In [31]:
%%capture --no-display

valid_y_pred = knn.predict(valid_X)
valid_y_pred[:20]
Out[31]:
array(['Gryffindor', 'Ravenclaw', 'Gryffindor', 'Gryffindor', 'Slytherin',
       'Slytherin', 'Slytherin', 'Hufflepuff', 'Gryffindor', 'Hufflepuff',
       'Slytherin', 'Hufflepuff', 'Ravenclaw', 'Gryffindor', 'Hufflepuff',
       'Hufflepuff', 'Gryffindor', 'Slytherin', 'Ravenclaw', 'Slytherin'],
      dtype=object)

3.2 Training set prediction¶

In [32]:
from sklearn.metrics import confusion_matrix, accuracy_score

Confusion matrix for the training set.

In [33]:
confusion_matrix_train = confusion_matrix(train_y, train_y_pred)
confusion_matrix_train
Out[33]:
array([[103,   1,   2,   4],
       [  2, 105,   4,   2],
       [  4,   7,  81,   5],
       [  1,   7,   5, 107]], dtype=int64)

A confusion matrix that's easier to read

In [34]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

confusion_matrix_train_display = ConfusionMatrixDisplay(confusion_matrix_train, display_labels = knn.classes_)
confusion_matrix_train_display.plot()
plt.grid(False)
In [35]:
accuracy_score(train_y, train_y_pred)
Out[35]:
0.9
In [36]:
from sklearn.metrics import classification_report

print(classification_report(train_y, train_y_pred))
              precision    recall  f1-score   support

  Gryffindor       0.94      0.94      0.94       110
  Hufflepuff       0.88      0.93      0.90       113
   Ravenclaw       0.88      0.84      0.86        97
   Slytherin       0.91      0.89      0.90       120

    accuracy                           0.90       440
   macro avg       0.90      0.90      0.90       440
weighted avg       0.90      0.90      0.90       440
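As a sanity check, each precision and recall figure in the report can be read off the confusion matrix, whose rows are the true classes and columns the predicted classes in knn.classes_ order; a sketch for Gryffindor:

# Precision = TP / (TP + FP) down the predicted column; recall = TP / (TP + FN) along the true row
g = list(knn.classes_).index("Gryffindor")
tp = confusion_matrix_train[g, g]
precision_g = tp / confusion_matrix_train[:, g].sum()
recall_g = tp / confusion_matrix_train[g, :].sum()
precision_g, recall_g   # roughly 0.94 and 0.94, matching the report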

3.3 Validation set prediction¶

Confusion matrix for the validation set.

In [37]:
confusion_matrix_valid = confusion_matrix(valid_y, valid_y_pred)
confusion_matrix_valid
Out[37]:
array([[63,  6,  4,  5],
       [10, 65,  0,  1],
       [ 5,  3, 48,  3],
       [ 7,  9,  4, 61]], dtype=int64)

A confusion matrix that's easier to read

In [38]:
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt

confusion_matrix_valid_display = ConfusionMatrixDisplay(confusion_matrix_valid, display_labels = knn.classes_)
confusion_matrix_valid_display.plot()
plt.grid(False)
In [39]:
accuracy_score(valid_y, valid_y_pred)
Out[39]:
0.8061224489795918
In [40]:
print(classification_report(valid_y, valid_y_pred))
              precision    recall  f1-score   support

  Gryffindor       0.74      0.81      0.77        78
  Hufflepuff       0.78      0.86      0.82        76
   Ravenclaw       0.86      0.81      0.83        59
   Slytherin       0.87      0.75      0.81        81

    accuracy                           0.81       294
   macro avg       0.81      0.81      0.81       294
weighted avg       0.81      0.81      0.81       294

4. New padawan¶

Create the new data point from a dictionary of scores.

In [41]:
data = {"Manipulative": 8, "Resourceful": 9, "Dismissive" : 8, "Intelligent": 6, "Trusting": 6, "Loyal": 8, 
        "Stubborn": 6, "Brave": 6}

new_padawan = pd.DataFrame(data, index = [0])
new_padawan
Out[41]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
0 8 9 8 6 6 8 6 6
In [42]:
# Alternatively, use a list
pd.DataFrame([data])
Out[42]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
0 8 9 8 6 6 8 6 6

Normalise.

In [43]:
new_padawan_norm = pd.DataFrame(scaler.transform(new_padawan[predictors]),
                              columns = predictors)

new_padawan_norm
Out[43]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
0 0.934976 1.253648 0.92364 0.099603 0.052115 0.917875 0.071144 0.098759
In [44]:
%%capture --no-display

new_padawan_pred = knn.predict(new_padawan_norm)
new_padawan_pred
Out[44]:
array(['Slytherin'], dtype=object)
In [45]:
new_padawan_pred_prob = knn.predict_proba(new_padawan_norm)
new_padawan_pred_prob
Out[45]:
array([[0.        , 0.        , 0.33333333, 0.66666667]])
In [46]:
# Use the classes

pd.DataFrame(new_padawan_pred_prob,
            columns = knn.classes_)
Out[46]:
Gryffindor Hufflepuff Ravenclaw Slytherin
0 0.0 0.0 0.333333 0.666667
In [47]:
# Or specify the columns
pd.DataFrame(new_padawan_pred_prob,
            columns = ["Gryffindor", "Hufflepuff", "Ravenclaw", "Slytherin"])
Out[47]:
Gryffindor Hufflepuff Ravenclaw Slytherin
0 0.0 0.0 0.333333 0.666667

slytherin_py.png

5. Easier setup¶

5.1 Recode into 2 classes¶

In [48]:
train_X2 = train_X.copy() # for consistency
valid_X2 = valid_X.copy() # for consistency
train_y2 = train_y.copy()
valid_y2 = valid_y.copy()
In [49]:
train_X2.head()
Out[49]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
650 -0.333440 -0.497479 -0.365160 1.441191 1.350061 -0.708973 1.357574 0.950795
479 -1.179050 0.378084 -0.365160 -1.241985 0.484763 0.104451 -0.357667 0.524777
271 -1.179050 0.815866 1.353239 0.546799 -1.245832 -1.929110 -1.644097 -1.179296
647 -0.333440 -0.059698 -1.653959 0.993995 0.052115 0.917875 1.357574 0.950795
307 0.089366 0.378084 1.782839 0.099603 0.484763 -1.929110 -0.786477 1.376814
In [50]:
valid_X2.head()
Out[50]:
Manipulative Resourceful Dismissive Intelligent Trusting Loyal Stubborn Brave
389 -0.756245 -1.810824 -0.794760 -0.347593 0.917412 0.104451 0.499954 0.098759
131 0.512171 0.378084 0.923640 1.441191 -1.245832 -0.708973 -0.357667 -1.179296
657 0.512171 -2.248606 0.923640 -0.347593 0.917412 0.104451 0.928764 0.950795
421 -0.756245 1.253648 -1.653959 -0.794789 0.484763 0.917875 0.071144 1.376814
160 0.089366 1.253648 -0.365160 -1.689181 -0.813183 -0.708973 -2.072907 -0.753278
In [51]:
train_y2.value_counts()
Out[51]:
Slytherin     120
Hufflepuff    113
Gryffindor    110
Ravenclaw      97
Name: House, dtype: int64
In [52]:
valid_y2.value_counts()
Out[52]:
Slytherin     81
Gryffindor    78
Hufflepuff    76
Ravenclaw     59
Name: House, dtype: int64
In [53]:
House_values = ["Gryffindor", "Hufflepuff", "Ravenclaw", "Slytherin" ]
House_2_values = ["Not Slytherin", "Not Slytherin", "Not Slytherin", "Slytherin"]
In [54]:
train_y2 = train_y.replace(House_values, House_2_values)
train_y2
Out[54]:
650    Not Slytherin
479    Not Slytherin
271    Not Slytherin
647    Not Slytherin
307    Not Slytherin
           ...      
445    Not Slytherin
414    Not Slytherin
70         Slytherin
429    Not Slytherin
236    Not Slytherin
Name: House, Length: 440, dtype: object
In [55]:
train_y2.head()
Out[55]:
650    Not Slytherin
479    Not Slytherin
271    Not Slytherin
647    Not Slytherin
307    Not Slytherin
Name: House, dtype: object
In [56]:
train_y2.value_counts()
Out[56]:
Not Slytherin    320
Slytherin        120
Name: House, dtype: int64
In [57]:
valid_y2 = valid_y.replace(House_values, House_2_values)
valid_y2
Out[57]:
389    Not Slytherin
131        Slytherin
657    Not Slytherin
421    Not Slytherin
160        Slytherin
           ...      
49         Slytherin
53         Slytherin
569    Not Slytherin
700    Not Slytherin
312    Not Slytherin
Name: House, Length: 294, dtype: object
In [58]:
valid_y2.head()
Out[58]:
389    Not Slytherin
131        Slytherin
657    Not Slytherin
421    Not Slytherin
160        Slytherin
Name: House, dtype: object
In [59]:
valid_y2.value_counts()
Out[59]:
Not Slytherin    213
Slytherin         81
Name: House, dtype: int64
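The same two-class recoding can be written without the two lists by testing directly for Slytherin; a sketch using np.where, which gives identical Series (the _alt names are only for illustration):

# Equivalent recode: everything that is not Slytherin becomes "Not Slytherin"
train_y2_alt = pd.Series(np.where(train_y == "Slytherin", "Slytherin", "Not Slytherin"),
                         index = train_y.index, name = "House")
valid_y2_alt = pd.Series(np.where(valid_y == "Slytherin", "Slytherin", "Not Slytherin"),
                         index = valid_y.index, name = "House")
train_y2_alt.value_counts()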

5.2 kNN¶

In [60]:
%%capture --no-display
result_2 = []
for k in range(3, 11):
    knn_loop = KNeighborsClassifier(n_neighbors = k)
    knn_loop.fit(train_X2, train_y2)
    result_2.append({
        "k_value_in_knn_2": k,
        "accuracy_value": accuracy_score(train_y2, knn_loop.predict(train_X2))
    })

result_2 = pd.DataFrame(result_2)
result_2
Out[60]:
k_value_in_knn_2 accuracy_value
0 3 0.945455
1 4 0.931818
2 5 0.940909
3 6 0.925000
4 7 0.931818
5 8 0.927273
6 9 0.927273
7 10 0.925000

5.2.1 kNN k = 3¶

In [61]:
%%capture --no-display

knn2_k3 = KNeighborsClassifier(n_neighbors = 3)
knn2_k3.fit(train_X2, train_y2)
accuracy_score(train_y2, knn2_k3.predict(train_X2))
Out[61]:
0.9454545454545454

Get training set prediction.

In [62]:
%%capture --no-display

train_y2_pred_k3 = knn2_k3.predict(train_X2)
train_y2_pred_k3[:20]
Out[62]:
array(['Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin'],
      dtype=object)
In [63]:
train_y2_pred_prob2_k3 = knn2_k3.predict_proba(train_X2)
train_y2_pred_prob2_k3[:20]
Out[63]:
array([[1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.66666667, 0.33333333],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [0.33333333, 0.66666667],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.66666667, 0.33333333],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.66666667, 0.33333333],
       [0.66666667, 0.33333333],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.66666667, 0.33333333]])
In [64]:
pd.DataFrame(train_y2_pred_prob2_k3, columns = knn2_k3.classes_)
Out[64]:
Not Slytherin Slytherin
0 1.000000 0.000000
1 1.000000 0.000000
2 1.000000 0.000000
3 1.000000 0.000000
4 1.000000 0.000000
... ... ...
435 0.666667 0.333333
436 1.000000 0.000000
437 0.000000 1.000000
438 1.000000 0.000000
439 1.000000 0.000000

440 rows × 2 columns

Get validation set prediction.

In [65]:
%%capture --no-display

valid_y2_pred_k3 = knn2_k3.predict(valid_X2)
valid_y2_pred_k3[:20]
Out[65]:
array(['Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin'],
      dtype=object)
In [66]:
valid_y2_pred_prob2_k3 = knn2_k3.predict_proba(valid_X2)
valid_y2_pred_prob2_k3[:20]
Out[66]:
array([[1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.33333333, 0.66666667],
       [0.33333333, 0.66666667],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.33333333, 0.66666667],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [0.33333333, 0.66666667]])
In [67]:
pd.DataFrame(valid_y2_pred_prob2_k3, columns = knn2_k3.classes_)
Out[67]:
Not Slytherin Slytherin
0 1.000000 0.000000
1 1.000000 0.000000
2 1.000000 0.000000
3 1.000000 0.000000
4 0.000000 1.000000
... ... ...
289 0.666667 0.333333
290 0.000000 1.000000
291 1.000000 0.000000
292 1.000000 0.000000
293 1.000000 0.000000

294 rows × 2 columns

5.2.2 kNN k = 5¶

In [68]:
%%capture --no-display

knn2_k5 = KNeighborsClassifier(n_neighbors = 5)
knn2_k5.fit(train_X2, train_y2)
accuracy_score(train_y2, knn2_k5.predict(train_X2))
Out[68]:
0.9409090909090909

Get training set prediction.

In [69]:
%%capture --no-display

train_y2_pred_k5 = knn2_k5.predict(train_X2)
train_y2_pred_k5[:20]
Out[69]:
array(['Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin'],
      dtype=object)
In [70]:
train_y2_pred_prob2_k5 = knn2_k5.predict_proba(train_X2)
train_y2_pred_prob2_k5[:20]
Out[70]:
array([[1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [0.8, 0.2],
       [0. , 1. ],
       [0.8, 0.2],
       [0.2, 0.8],
       [1. , 0. ],
       [0. , 1. ],
       [0.4, 0.6],
       [0.8, 0.2],
       [1. , 0. ],
       [0.6, 0.4],
       [0.8, 0.2],
       [1. , 0. ],
       [1. , 0. ],
       [0.8, 0.2]])
In [71]:
pd.DataFrame(train_y2_pred_prob2_k5, columns = knn2_k5.classes_)
Out[71]:
Not Slytherin Slytherin
0 1.0 0.0
1 1.0 0.0
2 1.0 0.0
3 1.0 0.0
4 1.0 0.0
... ... ...
435 0.8 0.2
436 0.8 0.2
437 0.2 0.8
438 0.8 0.2
439 1.0 0.0

440 rows × 2 columns

Get validation set prediction.

In [72]:
%%capture --no-display

valid_y2_pred_k5 = knn2_k5.predict(valid_X2)
valid_y2_pred_k5[:20]
Out[72]:
array(['Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin'],
      dtype=object)
In [73]:
valid_y2_pred_prob2_k5 = knn2_k5.predict_proba(valid_X2)
valid_y2_pred_prob2_k5[:20]
Out[73]:
array([[1. , 0. ],
       [0.8, 0.2],
       [1. , 0. ],
       [1. , 0. ],
       [0. , 1. ],
       [0.2, 0.8],
       [0.2, 0.8],
       [1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [0.2, 0.8],
       [1. , 0. ],
       [1. , 0. ],
       [0.8, 0.2],
       [1. , 0. ],
       [1. , 0. ],
       [1. , 0. ],
       [0.2, 0.8],
       [0.8, 0.2],
       [0.2, 0.8]])
In [74]:
pd.DataFrame(valid_y2_pred_prob2_k5, columns = knn2_k5.classes_)
Out[74]:
Not Slytherin Slytherin
0 1.0 0.0
1 0.8 0.2
2 1.0 0.0
3 1.0 0.0
4 0.0 1.0
... ... ...
289 0.4 0.6
290 0.0 1.0
291 1.0 0.0
292 1.0 0.0
293 1.0 0.0

294 rows × 2 columns

5.3 Confusion matrix¶

5.3.1 Confusion matrix k = 3¶

Training set.

In [75]:
confusion_matrix_train2_k3 = confusion_matrix(train_y2, train_y2_pred_k3)
confusion_matrix_train2_k3
Out[75]:
array([[309,  11],
       [ 13, 107]], dtype=int64)
In [76]:
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt

confusion_matrix_train2_k3_display = ConfusionMatrixDisplay(confusion_matrix_train2_k3, display_labels = knn2_k3.classes_)
confusion_matrix_train2_k3_display.plot()
plt.grid(False)
In [77]:
accuracy_score(train_y2, train_y2_pred_k3)
Out[77]:
0.9454545454545454
In [78]:
print(classification_report(train_y2, train_y2_pred_k3))
               precision    recall  f1-score   support

Not Slytherin       0.96      0.97      0.96       320
    Slytherin       0.91      0.89      0.90       120

     accuracy                           0.95       440
    macro avg       0.93      0.93      0.93       440
 weighted avg       0.95      0.95      0.95       440

Validation set.

In [79]:
confusion_matrix_valid2_k3 = confusion_matrix(valid_y2, valid_y2_pred_k3)
confusion_matrix_valid2_k3
Out[79]:
array([[204,   9],
       [ 20,  61]], dtype=int64)
In [80]:
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt

confusion_matrix_valid2_k3_display = ConfusionMatrixDisplay(confusion_matrix_valid2_k3, display_labels = knn2_k3.classes_)
confusion_matrix_valid2_k3_display.plot()
plt.grid(False)
In [81]:
accuracy_score(valid_y2, valid_y2_pred_k3)
Out[81]:
0.9013605442176871
In [82]:
print(classification_report(valid_y2, valid_y2_pred_k3))
               precision    recall  f1-score   support

Not Slytherin       0.91      0.96      0.93       213
    Slytherin       0.87      0.75      0.81        81

     accuracy                           0.90       294
    macro avg       0.89      0.86      0.87       294
 weighted avg       0.90      0.90      0.90       294

5.3.2 Confusion matrix k = 5¶

In [83]:
confusion_matrix_train2_k5 = confusion_matrix(train_y2, train_y2_pred_k5)
confusion_matrix_train2_k5
Out[83]:
array([[311,   9],
       [ 17, 103]], dtype=int64)
In [84]:
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt

confusion_matrix_train2_k5_display = ConfusionMatrixDisplay(confusion_matrix_train2_k5, display_labels = knn2_k5.classes_)
confusion_matrix_train2_k5_display.plot()
plt.grid(False)
In [85]:
accuracy_score(train_y2, train_y2_pred_k5)
Out[85]:
0.9409090909090909
In [86]:
print(classification_report(train_y2, train_y2_pred_k5))
               precision    recall  f1-score   support

Not Slytherin       0.95      0.97      0.96       320
    Slytherin       0.92      0.86      0.89       120

     accuracy                           0.94       440
    macro avg       0.93      0.92      0.92       440
 weighted avg       0.94      0.94      0.94       440

Validation set.

In [87]:
confusion_matrix_valid2_k5 = confusion_matrix(valid_y2, valid_y2_pred_k5)
confusion_matrix_valid2_k5
Out[87]:
array([[204,   9],
       [ 14,  67]], dtype=int64)
In [88]:
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt

confusion_matrix_valid2_k5_display = ConfusionMatrixDisplay(confusion_matrix_valid2_k5, display_labels = knn2_k5.classes_)
confusion_matrix_valid2_k5_display.plot()
plt.grid(False)
In [89]:
accuracy_score(valid_y2, valid_y2_pred_k5)
Out[89]:
0.9217687074829932
In [90]:
print(classification_report(valid_y2, valid_y2_pred_k5))
               precision    recall  f1-score   support

Not Slytherin       0.94      0.96      0.95       213
    Slytherin       0.88      0.83      0.85        81

     accuracy                           0.92       294
    macro avg       0.91      0.89      0.90       294
 weighted avg       0.92      0.92      0.92       294

Change the classification cutoff from the default 0.5 (the majority vote, labelled pred_50 below) to 0.7.

In [91]:
train_y2_pred_k5
Out[91]:
array(['Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin'],
      dtype=object)
In [92]:
train_y2_pred_k5_df = pd.DataFrame(train_y2_pred_k5, columns = ["pred_50"])
train_y2_pred_k5_df
Out[92]:
pred_50
0 Not Slytherin
1 Not Slytherin
2 Not Slytherin
3 Not Slytherin
4 Not Slytherin
... ...
435 Not Slytherin
436 Not Slytherin
437 Slytherin
438 Not Slytherin
439 Not Slytherin

440 rows × 1 columns

In [93]:
train_y2_pred_prob2_k5_df = pd.DataFrame(train_y2_pred_prob2_k5, columns = knn2_k5.classes_)
train_y2_pred_prob2_k5_df
Out[93]:
Not Slytherin Slytherin
0 1.0 0.0
1 1.0 0.0
2 1.0 0.0
3 1.0 0.0
4 1.0 0.0
... ... ...
435 0.8 0.2
436 0.8 0.2
437 0.2 0.8
438 0.8 0.2
439 1.0 0.0

440 rows × 2 columns

In [94]:
def cutoff70(prob):
    if prob < 0.7:
        return "Not Slytherin"
    elif 0.7 <= prob:
        return "Slytherin"

train_y2_pred_k5_df["pred_70"] = train_y2_pred_prob2_k5_df["Slytherin"].apply(cutoff70)
train_y2_pred_k5_df
Out[94]:
pred_50 pred_70
0 Not Slytherin Not Slytherin
1 Not Slytherin Not Slytherin
2 Not Slytherin Not Slytherin
3 Not Slytherin Not Slytherin
4 Not Slytherin Not Slytherin
... ... ...
435 Not Slytherin Not Slytherin
436 Not Slytherin Not Slytherin
437 Slytherin Slytherin
438 Not Slytherin Not Slytherin
439 Not Slytherin Not Slytherin

440 rows × 2 columns
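The apply/cutoff70 pattern can also be vectorised with np.where, the same idiom used for the new padawan later on; a sketch that produces an identical pred_70 column:

# Vectorised equivalent of cutoff70: >= 0.7 on the Slytherin probability
train_y2_pred_k5_df["pred_70"] = np.where(train_y2_pred_prob2_k5_df["Slytherin"] >= 0.7,
                                          "Slytherin", "Not Slytherin")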

In [95]:
confusion_matrix_train2_k5_70 = confusion_matrix(train_y2, train_y2_pred_k5_df["pred_70"])
confusion_matrix_train2_k5_70
Out[95]:
array([[317,   3],
       [ 40,  80]], dtype=int64)
In [96]:
confusion_matrix_train2_k5_70_display = ConfusionMatrixDisplay(confusion_matrix_train2_k5_70, 
                                                               display_labels = knn2_k5.classes_)
confusion_matrix_train2_k5_70_display.plot()
plt.grid(False)
In [97]:
valid_y2_pred_k5
Out[97]:
array(['Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Slytherin', 'Slytherin', 'Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Slytherin', 'Not Slytherin', 'Slytherin',
       'Not Slytherin', 'Not Slytherin', 'Not Slytherin', 'Not Slytherin',
       'Slytherin', 'Slytherin', 'Slytherin', 'Not Slytherin',
       'Not Slytherin', 'Not Slytherin'], dtype=object)
In [98]:
valid_y2_pred_k5_df = pd.DataFrame(valid_y2_pred_k5, columns = ["pred_50"])
valid_y2_pred_k5_df
Out[98]:
pred_50
0 Not Slytherin
1 Not Slytherin
2 Not Slytherin
3 Not Slytherin
4 Slytherin
... ...
289 Slytherin
290 Slytherin
291 Not Slytherin
292 Not Slytherin
293 Not Slytherin

294 rows × 1 columns

In [99]:
valid_y2_pred_prob2_k5_df = pd.DataFrame(valid_y2_pred_prob2_k5, columns = knn2_k5.classes_)
valid_y2_pred_prob2_k5_df
Out[99]:
Not Slytherin Slytherin
0 1.0 0.0
1 0.8 0.2
2 1.0 0.0
3 1.0 0.0
4 0.0 1.0
... ... ...
289 0.4 0.6
290 0.0 1.0
291 1.0 0.0
292 1.0 0.0
293 1.0 0.0

294 rows × 2 columns

In [100]:
def cutoff70(prob):
    if prob < 0.7:
        return "Not Slytherin"
    elif 0.7 <= prob:
        return "Slytherin"

valid_y2_pred_k5_df["pred_70"] = valid_y2_pred_prob2_k5_df["Slytherin"].apply(cutoff70)
valid_y2_pred_k5_df
Out[100]:
pred_50 pred_70
0 Not Slytherin Not Slytherin
1 Not Slytherin Not Slytherin
2 Not Slytherin Not Slytherin
3 Not Slytherin Not Slytherin
4 Slytherin Slytherin
... ... ...
289 Slytherin Not Slytherin
290 Slytherin Slytherin
291 Not Slytherin Not Slytherin
292 Not Slytherin Not Slytherin
293 Not Slytherin Not Slytherin

294 rows × 2 columns

In [101]:
confusion_matrix_valid2_k5_70 = confusion_matrix(valid_y2, valid_y2_pred_k5_df["pred_70"])
confusion_matrix_valid2_k5_70
Out[101]:
array([[211,   2],
       [ 30,  51]], dtype=int64)
In [102]:
confusion_matrix_valid2_k5_70_display = ConfusionMatrixDisplay(confusion_matrix_valid2_k5_70, 
                                                               display_labels = knn2_k5.classes_)
confusion_matrix_valid2_k5_70_display.plot()
plt.grid(False)
In [103]:
print(classification_report(valid_y2, valid_y2_pred_k5_df["pred_70"]))
               precision    recall  f1-score   support

Not Slytherin       0.88      0.99      0.93       213
    Slytherin       0.96      0.63      0.76        81

     accuracy                           0.89       294
    macro avg       0.92      0.81      0.85       294
 weighted avg       0.90      0.89      0.88       294

5.4 ROC¶

In [104]:
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

5.4.1 k = 3¶

In [105]:
# check again

pd.DataFrame(valid_y2_pred_prob2_k3, columns = knn2_k3.classes_)
Out[105]:
Not Slytherin Slytherin
0 1.000000 0.000000
1 1.000000 0.000000
2 1.000000 0.000000
3 1.000000 0.000000
4 0.000000 1.000000
... ... ...
289 0.666667 0.333333
290 0.000000 1.000000
291 1.000000 0.000000
292 1.000000 0.000000
293 1.000000 0.000000

294 rows × 2 columns

Set positive class

In [106]:
fpr1, tpr1, threshold1 = metrics.roc_curve(valid_y2,  valid_y2_pred_prob2_k3[:,1], pos_label = "Slytherin")
In [107]:
import matplotlib.pyplot as plt
plt.style.use("seaborn")

plt.plot(fpr1, tpr1, linestyle = "-", color = "green", label = "Slytherin k = 3")

# roc curve for tpr = fpr (random line) 
random_probs = [0 for i in range(len(valid_y2))]

p_fpr1, p_tpr1, _ = roc_curve(valid_y2, random_probs, pos_label = "Slytherin")

plt.plot(p_fpr1, p_tpr1, linestyle = "--", color = "black", label = "Random Force")

# If desired
plt.legend()

plt.title("Sorting Hat ROC")

plt.xlabel("False Positive Rate")

plt.ylabel("True Positive Rate")

# to save the plot
# plt.savefig("whatever_name",dpi = 300)
Out[107]:
Text(0, 0.5, 'True Positive Rate')
In [108]:
from sklearn.metrics import roc_auc_score
auc_score1 = roc_auc_score(valid_y2, valid_y2_pred_prob2_k3[:,1])
auc_score1
Out[108]:
0.9335188083231901

5.4.2 k = 5¶

In [109]:
# check again

pd.DataFrame(valid_y2_pred_prob2_k5, columns = knn2_k5.classes_)
Out[109]:
Not Slytherin Slytherin
0 1.0 0.0
1 0.8 0.2
2 1.0 0.0
3 1.0 0.0
4 0.0 1.0
... ... ...
289 0.4 0.6
290 0.0 1.0
291 1.0 0.0
292 1.0 0.0
293 1.0 0.0

294 rows × 2 columns

Set positive class

In [110]:
fpr2, tpr2, threshold2 = metrics.roc_curve(valid_y2,  valid_y2_pred_prob2_k5[:,1], pos_label = "Slytherin")
In [111]:
import matplotlib.pyplot as plt
plt.style.use("seaborn")

plt.plot(fpr2, tpr2, linestyle = "-", color = "blue", label = "Slytherin k = 5")

# roc curve for tpr = fpr (random line) 
random_probs = [0 for i in range(len(valid_y2))]

p_fpr2, p_tpr2, _ = roc_curve(valid_y2, random_probs, pos_label = "Slytherin")

plt.plot(p_fpr2, p_tpr2, linestyle = "--", color = "black", label = "Random Force")

# If desired
plt.legend()

plt.title("Sorting Hat ROC")

plt.xlabel("False Positive Rate")

plt.ylabel("True Positive Rate")

# to save the plot
# plt.savefig("whatever_name",dpi = 300)
Out[111]:
Text(0, 0.5, 'True Positive Rate')
In [112]:
auc_score2 = roc_auc_score(valid_y2, valid_y2_pred_prob2_k5[:,1])
auc_score2
Out[112]:
0.966962267431751

5.4.3 Combined plot¶

In [113]:
import matplotlib.pyplot as plt
plt.style.use("seaborn")

plt.plot(fpr1, tpr1, linestyle = "-", color = "green", label = "Slytherin k = 3")
plt.plot(fpr2, tpr2, linestyle = "-", color = "blue", label = "Slytherin k = 5")

# roc curve for tpr = fpr (random line) 
random_probs = [0 for i in range(len(valid_y2))]

p_fpr_random, p_tpr_random, _ = roc_curve(valid_y2, random_probs, pos_label = "Slytherin")

plt.plot(p_fpr_random, p_tpr_random, linestyle = "--", color = "black", label = "Random Force")

# If desired
plt.legend()

plt.title("Sorting Hat ROC")

plt.xlabel("False Positive Rate")

plt.ylabel("True Positive Rate");

# to save the plot
# plt.savefig("whatever_name",dpi = 300)
In [114]:
print(auc_score1, auc_score2)
0.9335188083231901 0.966962267431751
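On scikit-learn 1.0 or newer, RocCurveDisplay.from_predictions can draw the same two curves, with the AUC values in the legend, in fewer lines; a sketch:

from sklearn.metrics import RocCurveDisplay

ax = plt.gca()
RocCurveDisplay.from_predictions(valid_y2, valid_y2_pred_prob2_k3[:, 1],
                                 pos_label = "Slytherin", name = "Slytherin k = 3", ax = ax)
RocCurveDisplay.from_predictions(valid_y2, valid_y2_pred_prob2_k5[:, 1],
                                 pos_label = "Slytherin", name = "Slytherin k = 5", ax = ax)
ax.set_title("Sorting Hat ROC")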

5.5 New padawan¶

5.5.1 Predict using k = 3¶

In [115]:
%%capture --no-display

new_padawan_pred2_k3 = knn2_k3.predict(new_padawan_norm)
new_padawan_pred2_k3
Out[115]:
array(['Slytherin'], dtype=object)
In [116]:
new_padawan_pred_prob2_k3 = knn2_k3.predict_proba(new_padawan_norm)
new_padawan_pred_prob2_k3
Out[116]:
array([[0.33333333, 0.66666667]])
In [117]:
pd.DataFrame(new_padawan_pred_prob2_k3,
            columns = knn2_k3.classes_)
Out[117]:
Not Slytherin Slytherin
0 0.333333 0.666667

5.5.2 Predict using k = 5¶

In [118]:
%%capture --no-display

new_padawan_pred2_k5 = knn2_k5.predict(new_padawan_norm)
new_padawan_pred2_k5
Out[118]:
array(['Slytherin'], dtype=object)
In [119]:
new_padawan_pred_prob2_k5 = knn2_k5.predict_proba(new_padawan_norm)
new_padawan_pred_prob2_k5
Out[119]:
array([[0.2, 0.8]])
In [120]:
new_padawan_pred_prob2_k5_df = pd.DataFrame(new_padawan_pred_prob2_k5,
                                            columns = knn2_k5.classes_)
new_padawan_pred_prob2_k5_df
Out[120]:
Not Slytherin Slytherin
0 0.2 0.8

Change the cutoff to 0.7 for the new padawan as well.

In [121]:
if new_padawan_pred_prob2_k5[0][1] > 0.7:
    print("A Slytherin, you are")
else:
    print("A Slytherin, you are not")
A Slytherin, you are
In [122]:
import numpy as np
new_padawan_pred_prob2_k5_df["New_Prediction_70"] = np.where(new_padawan_pred_prob2_k5_df["Slytherin"] >= 0.7, 
                                                     "Slytherin", "Not Slytherin")
new_padawan_pred_prob2_k5_df.head()
Out[122]:
Not Slytherin Slytherin New_Prediction_70
0 0.2 0.8 Slytherin

Still a Slytherin, you are :-)

slytherin_py.png

5.6 Optimal cutoff¶

Find the cutoff that maximises Youden's J statistic (TPR - FPR). For illustration, the cutoff is taken from the k = 3 ROC curve computed earlier (fpr1, tpr1, threshold1) and applied to the k = 5 predictions.

In [123]:
# Get the optimal index position in the array
optimal_index = np.argmax(tpr1 - fpr1)
print(optimal_index)
3
In [124]:
optimal_cutoff = threshold1[optimal_index]
optimal_cutoff
Out[124]:
0.3333333333333333
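To see what np.argmax is choosing between, the candidate cutoffs from the k = 3 curve can be laid out with their Youden's J values (J = TPR - FPR); a sketch:

# Each ROC threshold with its true/false positive rates and J statistic
pd.DataFrame({"threshold": threshold1, "fpr": fpr1, "tpr": tpr1, "J": tpr1 - fpr1})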
In [125]:
def cutoff_optimal(prob):
    if prob < optimal_cutoff:
        return "Not Slytherin"
    elif optimal_cutoff <= prob:
        return "Slytherin"

train_y2_pred_k5_df["pred_optimal"] = train_y2_pred_prob2_k5_df["Slytherin"].apply(cutoff_optimal)
train_y2_pred_k5_df
Out[125]:
pred_50 pred_70 pred_optimal
0 Not Slytherin Not Slytherin Not Slytherin
1 Not Slytherin Not Slytherin Not Slytherin
2 Not Slytherin Not Slytherin Not Slytherin
3 Not Slytherin Not Slytherin Not Slytherin
4 Not Slytherin Not Slytherin Not Slytherin
... ... ... ...
435 Not Slytherin Not Slytherin Not Slytherin
436 Not Slytherin Not Slytherin Not Slytherin
437 Slytherin Slytherin Slytherin
438 Not Slytherin Not Slytherin Not Slytherin
439 Not Slytherin Not Slytherin Not Slytherin

440 rows × 3 columns

In [126]:
confusion_matrix_train2_k5_optimal = confusion_matrix(train_y2, train_y2_pred_k5_df["pred_optimal"])
confusion_matrix_train2_k5_optimal
Out[126]:
array([[289,  31],
       [  1, 119]], dtype=int64)
In [127]:
confusion_matrix_train2_k5_optimal_display = ConfusionMatrixDisplay(confusion_matrix_train2_k5_optimal, 
                                                               display_labels = knn2_k5.classes_)
confusion_matrix_train2_k5_optimal_display.plot()
plt.grid(False)
In [128]:
print(classification_report(train_y2, train_y2_pred_k5_df["pred_optimal"]))
               precision    recall  f1-score   support

Not Slytherin       1.00      0.90      0.95       320
    Slytherin       0.79      0.99      0.88       120

     accuracy                           0.93       440
    macro avg       0.89      0.95      0.91       440
 weighted avg       0.94      0.93      0.93       440

In [129]:
#def cutoff_optimal(prob):
#    if prob < optimal_cutoff:
#        return "Not Slytherin"
#    elif optimal_cutoff <= prob:
#        return "Slytherin"

valid_y2_pred_k5_df["pred_optimal"] = valid_y2_pred_prob2_k5_df["Slytherin"].apply(cutoff_optimal)
valid_y2_pred_k5_df
Out[129]:
pred_50 pred_70 pred_optimal
0 Not Slytherin Not Slytherin Not Slytherin
1 Not Slytherin Not Slytherin Not Slytherin
2 Not Slytherin Not Slytherin Not Slytherin
3 Not Slytherin Not Slytherin Not Slytherin
4 Slytherin Slytherin Slytherin
... ... ... ...
289 Slytherin Not Slytherin Slytherin
290 Slytherin Slytherin Slytherin
291 Not Slytherin Not Slytherin Not Slytherin
292 Not Slytherin Not Slytherin Not Slytherin
293 Not Slytherin Not Slytherin Not Slytherin

294 rows × 3 columns

In [130]:
confusion_matrix_valid2_k5_optimal = confusion_matrix(valid_y2, valid_y2_pred_k5_df["pred_optimal"])
confusion_matrix_valid2_k5_optimal
Out[130]:
array([[195,  18],
       [  6,  75]], dtype=int64)
In [131]:
confusion_matrix_valid2_k5_optimal_display = ConfusionMatrixDisplay(confusion_matrix_valid2_k5_optimal, 
                                                               display_labels = knn2_k5.classes_)
confusion_matrix_valid2_k5_optimal_display.plot()
plt.grid(False)
In [132]:
print(classification_report(valid_y2, valid_y2_pred_k5_df["pred_optimal"]))
               precision    recall  f1-score   support

Not Slytherin       0.97      0.92      0.94       213
    Slytherin       0.81      0.93      0.86        81

     accuracy                           0.92       294
    macro avg       0.89      0.92      0.90       294
 weighted avg       0.93      0.92      0.92       294

In [133]:
%%capture --no-display

new_padawan_pred2_k5_v2 = knn2_k5.predict(new_padawan_norm)
new_padawan_pred2_k5_v2
Out[133]:
array(['Slytherin'], dtype=object)
In [134]:
new_padawan_pred_prob2_k5_v2 = knn2_k5.predict_proba(new_padawan_norm)
new_padawan_pred_prob2_k5_v2
Out[134]:
array([[0.2, 0.8]])
In [135]:
new_padawan_pred_prob2_k5_df_2 = pd.DataFrame(new_padawan_pred_prob2_k5_v2,
                                            columns = knn2_k5.classes_)
new_padawan_pred_prob2_k5_df_2
Out[135]:
Not Slytherin Slytherin
0 0.2 0.8
In [136]:
if new_padawan_pred_prob2_k5_v2[0][1] > optimal_cutoff:
    print("A Slytherin, you are")
else:
    print("A Slytherin, you are not")
A Slytherin, you are
In [137]:
import numpy as np
new_padawan_pred_prob2_k5_df_2["New_Optimal_Prediction"] = np.where(new_padawan_pred_prob2_k5_df_2["Slytherin"] >= optimal_cutoff, 
                                                     "Slytherin", "Not Slytherin")
new_padawan_pred_prob2_k5_df_2.head()
Out[137]:
Not Slytherin Slytherin New_Optimal_Prediction
0 0.2 0.8 Slytherin

slytherin_py.png