import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier


# Load the raw character sheet and preview its first five rows.
hogwarts = pd.read_csv(filepath_or_buffer="super_heroes_hogwarts_v3a.csv")
hogwarts.head(n=5)


# List every column name as a plain Python list.
hogwarts.columns.tolist()

['ID',
 'Name',
 'Gender',
 'Race',
 'Height',
 'Publisher',
 'Alignment',
 'Weight',
 'Manipulative',
 'Resourceful',
 'Dismissive',
 'Intelligent',
 'Trusting',
 'Loyal',
 'Stubborn',
 'Brave',
 'HouseID',
 'House',
 'STR',
 'DEX',
 'CON',
 'INT',
 'WIS',
 'CHA',
 'Level',
 'HP']


# Show the column names as a one-column table, one variable per row.
pd.DataFrame(hogwarts.columns.values, columns = ["Variables"])


# Label-based column slice from "Manipulative" through "House"; with .loc both ends are included.
hogwarts_2 = hogwarts.loc[:, "Manipulative":"House"]
hogwarts_2.head()


# Drop "HouseID", leaving the eight trait columns plus the "House" label.
hogwarts_2 = hogwarts_2.drop(columns = ["HouseID"])
hogwarts_2.head()


# The same selection written as an explicit list of column names (displayed only, not stored).
hogwarts.loc[:, ["Manipulative", "Resourceful", "Dismissive", "Intelligent", 
                 "Trusting", "Loyal", "Stubborn", "Brave", "House"]]


import numpy as np
# Positional version: np.r_ joins the range 8..15 with the single position 17.
# NOTE(review): the positions match the column listing printed for this CSV; confirm if the file layout changes.
hogwarts.iloc[:, np.r_[8:16, 17]]


# Hold out 40% of the rows for validation; random_state fixes the shuffle so the split is repeatable.
trainData, validData = train_test_split(hogwarts_2, test_size = 0.4, random_state = 666)


trainData.head()


validData.head()


# Feature columns used by the classifier, and the name of the label column.
predictors = ["Manipulative", "Resourceful", "Dismissive", "Intelligent", "Trusting", "Loyal", "Stubborn", "Brave"]
outcome = "House"


scaler = preprocessing.StandardScaler()


# Fit the scaler on the training rows only; this fitted scaler is what later cells call transform on.
scaler.fit(trainData[predictors])

StandardScaler()


# Standardise every row with the scaler fitted on the training data.
# The frame is rebuilt from the transformed values, which would otherwise give
# it a fresh 0..n-1 index. Passing the original row labels back in keeps the
# later concat with the "House" column, and the train/validation row look-ups,
# aligned with hogwarts_2 even when its index is not the default one.
hogwarts_2_norm = pd.DataFrame(scaler.transform(hogwarts_2[predictors]),
                               columns = ["Manipulative", "Resourceful", "Dismissive", "Intelligent", "Trusting", "Loyal", "Stubborn", "Brave"],
                               index = hogwarts_2.index)
hogwarts_2_norm


# Alternatively, reuse the `predictors` list for the column names instead of spelling them out (displayed only, not stored).

pd.DataFrame(scaler.transform(hogwarts_2[predictors]),
                              columns = predictors)


# Attach the "House" label next to the scaled predictors; axis = 1 joins column-wise and matches rows by index label.
hogwarts_2_norm_full = pd.concat([hogwarts_2_norm, hogwarts_2["House"]], axis = 1)
hogwarts_2_norm_full


# Pick the training rows by index label. trainData.index holds labels carried
# over from hogwarts_2, so label-based .loc is the matching accessor;
# positional .iloc returned the same rows only while labels equalled positions.
trainNorm = hogwarts_2_norm_full.loc[trainData.index]
trainNorm.head()


# Pick the validation rows by index label. validData.index holds labels carried
# over from hogwarts_2, so label-based .loc is the matching accessor;
# positional .iloc returned the same rows only while labels equalled positions.
validNorm = hogwarts_2_norm_full.loc[validData.index]
validNorm.head()


# Training predictors: the label slice "Manipulative" through "Brave", both ends included.
train_X = trainNorm.loc[:, "Manipulative":"Brave"]
train_X


# Alternatively, select the same columns through the `predictors` list (displayed only).

trainNorm.loc[:, predictors]


# Training target: the house recorded for each training row.
train_y = trainNorm["House"]
train_y

650    Gryffindor
479    Hufflepuff
271     Ravenclaw
647    Gryffindor
307     Ravenclaw
          ...    
445    Hufflepuff
414    Hufflepuff
70      Slytherin
429    Hufflepuff
236     Ravenclaw
Name: House, Length: 440, dtype: object


# Validation predictors: the label slice "Manipulative" through "Brave", both ends included.
valid_X = validNorm.loc[:, "Manipulative":"Brave"]
valid_X


# Alternatively, select the same columns through the `predictors` list (displayed only).

validNorm.loc[:, predictors]


# Validation target: the house recorded for each validation row.
valid_y = validNorm["House"]
valid_y

389    Hufflepuff
131     Slytherin
657    Gryffindor
421    Hufflepuff
160     Slytherin
          ...    
49      Slytherin
53      Slytherin
569    Gryffindor
700    Gryffindor
312     Ravenclaw
Name: House, Length: 294, dtype: object


from sklearn.neighbors import KNeighborsClassifier


%%capture --no-display

# Fit a 3-nearest-neighbour classifier on the scaled training predictors.
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(train_X, train_y)

KNeighborsClassifier(n_neighbors=3)


%%capture --no-display

# Predict back on the training rows themselves and preview the first 20 labels.
train_y_pred = knn.predict(train_X)
train_y_pred[:20]

array(['Gryffindor', 'Hufflepuff', 'Ravenclaw', 'Gryffindor', 'Ravenclaw',
       'Hufflepuff', 'Gryffindor', 'Slytherin', 'Hufflepuff', 'Slytherin',
       'Ravenclaw', 'Slytherin', 'Hufflepuff', 'Gryffindor', 'Hufflepuff',
       'Hufflepuff', 'Hufflepuff', 'Gryffindor', 'Gryffindor',
       'Gryffindor'], dtype=object)


# To silence the FutureWarning printed below, make the following magic the first line of this cell:

# %%capture --no-display

# Predict the house of every validation row and preview the first 20 labels.
valid_y_pred = knn.predict(valid_X)
valid_y_pred[:20]

C:\Users\byeo\Anaconda3\lib\site-packages\sklearn\neighbors\_classification.py:228: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)

array(['Gryffindor', 'Ravenclaw', 'Gryffindor', 'Gryffindor', 'Slytherin',
       'Slytherin', 'Slytherin', 'Hufflepuff', 'Gryffindor', 'Hufflepuff',
       'Slytherin', 'Hufflepuff', 'Ravenclaw', 'Gryffindor', 'Hufflepuff',
       'Hufflepuff', 'Gryffindor', 'Slytherin', 'Ravenclaw', 'Slytherin'],
      dtype=object)


from sklearn.metrics import confusion_matrix, accuracy_score


# Cross-tabulate true (first argument) against predicted (second argument) houses on the training set.
confusion_matrix_train = confusion_matrix(train_y, train_y_pred)
confusion_matrix_train

array([[103,   1,   2,   4],
       [  2, 105,   4,   2],
       [  4,   7,  81,   5],
       [  1,   7,   5, 107]], dtype=int64)


from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Draw the training confusion matrix, labelling the axes with the classes stored on the fitted model.
confusion_matrix_train_display = ConfusionMatrixDisplay(
    confusion_matrix = confusion_matrix_train,
    display_labels = knn.classes_,
)
confusion_matrix_train_display.plot()
plt.grid(False)  # switch the grid off for this figure


# Share of training rows whose predicted house equals the true house.
accuracy_score(y_true = train_y, y_pred = train_y_pred)

0.9


from sklearn.metrics import classification_report

# Print per-house precision, recall, F1 and support for the training predictions.
print(classification_report(train_y, train_y_pred))

              precision    recall  f1-score   support

  Gryffindor       0.94      0.94      0.94       110
  Hufflepuff       0.88      0.93      0.90       113
   Ravenclaw       0.88      0.84      0.86        97
   Slytherin       0.91      0.89      0.90       120

    accuracy                           0.90       440
   macro avg       0.90      0.90      0.90       440
weighted avg       0.90      0.90      0.90       440


# Import the submodule explicitly. A bare `import sklearn` only exposes
# `sklearn.metrics` if something else has already imported that submodule,
# so the attribute access below would depend on earlier cells having run.
import sklearn.metrics
# output_dict=True returns the report as a nested dict instead of formatted text.
training_report_1 = sklearn.metrics.classification_report(train_y, train_y_pred, output_dict=True)
training_report_1

{'Gryffindor': {'precision': 0.9363636363636364,
  'recall': 0.9363636363636364,
  'f1-score': 0.9363636363636364,
  'support': 110},
 'Hufflepuff': {'precision': 0.875,
  'recall': 0.9292035398230089,
  'f1-score': 0.9012875536480688,
  'support': 113},
 'Ravenclaw': {'precision': 0.8804347826086957,
  'recall': 0.8350515463917526,
  'f1-score': 0.8571428571428571,
  'support': 97},
 'Slytherin': {'precision': 0.9067796610169492,
  'recall': 0.8916666666666667,
  'f1-score': 0.8991596638655461,
  'support': 120},
 'accuracy': 0.9,
 'macro avg': {'precision': 0.8996445199973204,
  'recall': 0.8980713473112661,
  'f1-score': 0.8984884277550271,
  'support': 440},
 'weighted avg': {'precision': 0.9002062118979032,
  'recall': 0.9,
  'f1-score': 0.8997443417476237,
  'support': 440}}


# Tabulate the nested report dict: one column per top-level key, one row per metric.
training_report_1_df = pd.DataFrame(data = training_report_1)
training_report_1_df

# Optional CSV export:
# training_report_1_df.to_csv("training_report_1_df.csv")


# Cross-tabulate true (first argument) against predicted (second argument) houses on the validation set.
confusion_matrix_valid = confusion_matrix(valid_y, valid_y_pred)
confusion_matrix_valid

array([[63,  6,  4,  5],
       [10, 65,  0,  1],
       [ 5,  3, 48,  3],
       [ 7,  9,  4, 61]], dtype=int64)


# ConfusionMatrixDisplay and plt come from the imports made in the training-matrix cell.

# Draw the validation confusion matrix, labelling the axes with the classes stored on the fitted model.
confusion_matrix_valid_display = ConfusionMatrixDisplay(
    confusion_matrix = confusion_matrix_valid,
    display_labels = knn.classes_,
)
confusion_matrix_valid_display.plot()
plt.grid(False)  # switch the grid off for this figure


# Share of validation rows whose predicted house equals the true house.
accuracy_score(y_true = valid_y, y_pred = valid_y_pred)

0.8061224489795918


# Print per-house precision, recall, F1 and support for the validation predictions.
print(classification_report(valid_y, valid_y_pred))

              precision    recall  f1-score   support

  Gryffindor       0.74      0.81      0.77        78
  Hufflepuff       0.78      0.86      0.82        76
   Ravenclaw       0.86      0.81      0.83        59
   Slytherin       0.87      0.75      0.81        81

    accuracy                           0.81       294
   macro avg       0.81      0.81      0.81       294
weighted avg       0.81      0.81      0.81       294


# Import the submodule here so this cell runs on its own. A bare
# `import sklearn` only exposes `sklearn.metrics` if something else has
# already imported that submodule.
import sklearn.metrics

# output_dict=True returns the report as a nested dict instead of formatted text.
valid_report_1 = sklearn.metrics.classification_report(valid_y, valid_y_pred, output_dict=True)
valid_report_1

{'Gryffindor': {'precision': 0.7411764705882353,
  'recall': 0.8076923076923077,
  'f1-score': 0.7730061349693251,
  'support': 78},
 'Hufflepuff': {'precision': 0.7831325301204819,
  'recall': 0.8552631578947368,
  'f1-score': 0.8176100628930818,
  'support': 76},
 'Ravenclaw': {'precision': 0.8571428571428571,
  'recall': 0.8135593220338984,
  'f1-score': 0.8347826086956522,
  'support': 59},
 'Slytherin': {'precision': 0.8714285714285714,
  'recall': 0.7530864197530864,
  'f1-score': 0.8079470198675497,
  'support': 81},
 'accuracy': 0.8061224489795918,
 'macro avg': {'precision': 0.8132201073200365,
  'recall': 0.8074003018435074,
  'f1-score': 0.8083364566064022,
  'support': 294},
 'weighted avg': {'precision': 0.8111802035788498,
  'recall': 0.8061224489795918,
  'f1-score': 0.8065602919380835,
  'support': 294}}


# Tabulate the nested report dict: one column per top-level key, one row per metric.
valid_report_1_df = pd.DataFrame(data = valid_report_1)
valid_report_1_df

# Optional CSV export:
# valid_report_1_df.to_csv("valid_report_1_df.csv")


# Trait scores for one new student, keyed by predictor name.
data = dict(
    Manipulative=8,
    Resourceful=9,
    Dismissive=8,
    Intelligent=6,
    Trusting=6,
    Loyal=8,
    Stubborn=6,
    Brave=6,
)

# All values are scalars, so an explicit index is needed to get a one-row frame.
new_padawan = pd.DataFrame(data, index=[0])
new_padawan


# Alternatively, wrap the dict in a list so it is read as a single record.
pd.DataFrame([data])


# Scale the new student with the scaler fitted on the training data, so the values
# are on the same scale as the data the classifier was trained on.
new_padawan_norm = pd.DataFrame(scaler.transform(new_padawan[predictors]),
                              columns = ["Manipulative", "Resourceful", "Dismissive", "Intelligent", "Trusting", "Loyal", "Stubborn", "Brave"])

new_padawan_norm


# Alternatively, pass the frame as-is and reuse `predictors` for the column names (displayed only).

pd.DataFrame(scaler.transform(new_padawan),
                              columns = predictors)


# Predicted house for the new student from the four-house model.
new_padawan_pred = knn.predict(new_padawan_norm)
new_padawan_pred

C:\Users\byeo\Anaconda3\lib\site-packages\sklearn\neighbors\_classification.py:228: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)

array(['Slytherin'], dtype=object)


# Class probabilities for the new student: one row, one column per house.
new_padawan_pred_prob = knn.predict_proba(new_padawan_norm)
new_padawan_pred_prob

array([[0.        , 0.        , 0.33333333, 0.66666667]])


# Label the probability columns with the classes stored on the fitted model.

pd.DataFrame(new_padawan_pred_prob,
            columns = knn.classes_)


# Or spell the column names out by hand.
# NOTE(review): the hand-written names are assumed to be in the same order as knn.classes_.
pd.DataFrame(new_padawan_pred_prob,
            columns = ["Gryffindor", "Hufflepuff", "Ravenclaw", "Slytherin"])


# Take copies for the two-class exercise so the four-class objects are not modified.
train_X2 = train_X.copy() # for consistency
valid_X2 = valid_X.copy() # for consistency
train_y2 = train_y.copy()
valid_y2 = valid_y.copy()


train_X2.head()


valid_X2.head()


# Number of training rows per house before recoding.
train_y2.value_counts()

Slytherin     120
Hufflepuff    113
Gryffindor    110
Ravenclaw      97
Name: House, dtype: int64


# Number of validation rows per house before recoding.
valid_y2.value_counts()

Slytherin     81
Gryffindor    78
Hufflepuff    76
Ravenclaw     59
Name: House, dtype: int64


# The four house names to be recoded.
House_values = ["Gryffindor", "Hufflepuff", "Ravenclaw", "Slytherin"]
# Replacement codes aligned position-by-position with House_values:
# "1" marks Slytherin and "0" marks every other house.
House_2_values = ["1" if house == "Slytherin" else "0" for house in House_values]


# Recode the training labels through an explicit house -> code lookup built from the two aligned lists.
train_y2 = train_y.replace(dict(zip(House_values, House_2_values)))
train_y2

650    0
479    0
271    0
647    0
307    0
      ..
445    0
414    0
70     1
429    0
236    0
Name: House, Length: 440, dtype: object


# Preview the first recoded training labels.
train_y2.head()

650    0
479    0
271    0
647    0
307    0
Name: House, dtype: object


# Number of training rows per recoded class ("1" is Slytherin, "0" is any other house).
train_y2.value_counts()

0    320
1    120
Name: House, dtype: int64


# Recode the validation labels through an explicit house -> code lookup built from the two aligned lists.
valid_y2 = valid_y.replace(dict(zip(House_values, House_2_values)))
valid_y2

389    0
131    1
657    0
421    0
160    1
      ..
49     1
53     1
569    0
700    0
312    0
Name: House, Length: 294, dtype: object


# Preview the first recoded validation labels.
valid_y2.head()

389    0
131    1
657    0
421    0
160    1
Name: House, dtype: object


# Number of validation rows per recoded class ("1" is Slytherin, "0" is any other house).
valid_y2.value_counts()

0    213
1     81
Name: House, dtype: int64


# Two-class model (Slytherin vs. every other house) using the 7 nearest neighbours.
knn2 = KNeighborsClassifier(n_neighbors = 7)
knn2.fit(train_X2, train_y2)
# Accuracy on the same rows the model was fitted on.
accuracy_score(train_y2, knn2.predict(train_X2))

C:\Users\byeo\Anaconda3\lib\site-packages\sklearn\neighbors\_classification.py:228: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)

0.9318181818181818


# Predicted class for every training row; preview the first 20.
train_y2_pred = knn2.predict(train_X2)
train_y2_pred[:20]

C:\Users\byeo\Anaconda3\lib\site-packages\sklearn\neighbors\_classification.py:228: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)

array(['0', '0', '0', '0', '0', '0', '0', '1', '0', '1', '0', '1', '1',
       '0', '0', '0', '0', '0', '0', '0'], dtype=object)


# Class probabilities for every training row (one column per class); preview the first 20 rows.
train_y2_pred_prob2 = knn2.predict_proba(train_X2)
train_y2_pred_prob2[:20]

array([[1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.71428571, 0.28571429],
       [1.        , 0.        ],
       [0.71428571, 0.28571429],
       [0.        , 1.        ],
       [0.85714286, 0.14285714],
       [0.28571429, 0.71428571],
       [1.        , 0.        ],
       [0.14285714, 0.85714286],
       [0.42857143, 0.57142857],
       [0.85714286, 0.14285714],
       [1.        , 0.        ],
       [0.57142857, 0.42857143],
       [0.85714286, 0.14285714],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.85714286, 0.14285714]])


# Alternatively, view the training probabilities as a table whose columns are named after the model's classes.

pd.DataFrame(train_y2_pred_prob2, columns = knn2.classes_)


# Predicted class for every validation row; preview the first 20.
valid_y2_pred = knn2.predict(valid_X2)
valid_y2_pred[:20]

C:\Users\byeo\Anaconda3\lib\site-packages\sklearn\neighbors\_classification.py:228: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)

array(['0', '0', '0', '0', '1', '1', '1', '0', '0', '0', '1', '0', '0',
       '0', '0', '0', '0', '1', '0', '1'], dtype=object)


# Class probabilities for every validation row (one column per class); preview the first 6 rows.
valid_y2_pred_prob2 = knn2.predict_proba(valid_X2)
valid_y2_pred_prob2[:6]

array([[1.        , 0.        ],
       [0.85714286, 0.14285714],
       [1.        , 0.        ],
       [0.85714286, 0.14285714],
       [0.        , 1.        ],
       [0.14285714, 0.85714286]])


# Alternatively, view the validation probabilities as a table whose columns are named after the model's classes.

pd.DataFrame(valid_y2_pred_prob2, columns = knn2.classes_)


# Cross-tabulate true (first argument) against predicted (second argument) classes on the training set.
confusion_matrix_train2 = confusion_matrix(train_y2, train_y2_pred)
confusion_matrix_train2

array([[309,  11],
       [ 19, 101]], dtype=int64)


# ConfusionMatrixDisplay and plt come from the imports made in an earlier cell.

# Draw the two-class training confusion matrix, labelling the axes with the classes stored on the fitted model.
confusion_matrix_train2_display = ConfusionMatrixDisplay(
    confusion_matrix = confusion_matrix_train2,
    display_labels = knn2.classes_,
)
confusion_matrix_train2_display.plot()
plt.grid(False)  # switch the grid off for this figure


# Share of training rows whose predicted class equals the true class.
accuracy_score(y_true = train_y2, y_pred = train_y2_pred)

0.9318181818181818


# Print per-class precision, recall, F1 and support for the two-class training predictions.
print(classification_report(train_y2, train_y2_pred))

              precision    recall  f1-score   support

           0       0.94      0.97      0.95       320
           1       0.90      0.84      0.87       120

    accuracy                           0.93       440
   macro avg       0.92      0.90      0.91       440
weighted avg       0.93      0.93      0.93       440


# Cross-tabulate true (first argument) against predicted (second argument) classes on the validation set.
confusion_matrix_valid2 = confusion_matrix(valid_y2, valid_y2_pred)
confusion_matrix_valid2

array([[203,  10],
       [ 15,  66]], dtype=int64)


# ConfusionMatrixDisplay and plt come from the imports made in an earlier cell.

# Draw the two-class validation confusion matrix, labelling the axes with the classes stored on the fitted model.
confusion_matrix_valid2_display = ConfusionMatrixDisplay(
    confusion_matrix = confusion_matrix_valid2,
    display_labels = knn2.classes_,
)
confusion_matrix_valid2_display.plot()
plt.grid(False)  # switch the grid off for this figure


# Share of validation rows whose predicted class equals the true class.
accuracy_score(y_true = valid_y2, y_pred = valid_y2_pred)

0.9149659863945578


# Print per-class precision, recall, F1 and support for the two-class validation predictions.
print(classification_report(valid_y2, valid_y2_pred))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       213
           1       0.87      0.81      0.84        81

    accuracy                           0.91       294
   macro avg       0.90      0.88      0.89       294
weighted avg       0.91      0.91      0.91       294


from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve


# Check the validation probabilities again, with columns named after the model's classes.

pd.DataFrame(valid_y2_pred_prob2, columns = knn2.classes_)


# Or give the two columns readable names by hand.
# NOTE(review): assumes the first column is class "0" and the second is class "1", as the recoding implies.
pd.DataFrame(valid_y2_pred_prob2,
            columns = ["Not Slytherin", "Slytherin"])


# ROC points for the validation set: the score is the second probability column and "1" is the positive label.
fpr, tpr, threshold = metrics.roc_curve(valid_y2,  valid_y2_pred_prob2[:,1], pos_label = "1")


import matplotlib.pyplot as plt

# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8" and later
# releases removed the old "seaborn" name. Use the new name when this
# installation offers it and fall back to the old one otherwise.
roc_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(roc_style)

# ROC curve of the two-class kNN model on the validation set.
plt.plot(fpr, tpr, linestyle = "-", color = "green", label = "Slytherin kNN")

# Reference line for tpr = fpr: give every validation row the same constant score.
random_probs = [0] * len(valid_y2)

p_fpr, p_tpr, _ = roc_curve(valid_y2, random_probs, pos_label = "1")

plt.plot(p_fpr, p_tpr, linestyle = "--", color = "black", label = "Random Force")

# If desired
plt.legend()

plt.title("Sorting Hat ROC")

plt.xlabel("False Positive Rate")

plt.ylabel("True Positive Rate")

# to save the plot
# plt.savefig("whatever_name",dpi = 300)

Text(0, 0.5, 'True Positive Rate')


from sklearn.metrics import roc_auc_score
# Area under the validation ROC curve, scored on the second probability column.
auc = roc_auc_score(valid_y2, valid_y2_pred_prob2[:,1])
auc

0.9729322436677679


# Predicted class for the new student from the two-class model ("1" is Slytherin).
new_padawan_pred2 = knn2.predict(new_padawan_norm)
new_padawan_pred2

C:\Users\byeo\Anaconda3\lib\site-packages\sklearn\neighbors\_classification.py:228: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)

array(['1'], dtype=object)


# Class probabilities for the new student from the two-class model: one row, one column per class.
new_padawan_pred_prob2 = knn2.predict_proba(new_padawan_norm)
new_padawan_pred_prob2

array([[0.14285714, 0.85714286]])


# Show the same probabilities as a table whose columns are named after the model's classes.
pd.DataFrame(new_padawan_pred_prob2,
            columns = knn2.classes_)

	Manipulative	Resourceful	Dismissive	Intelligent	Trusting	Loyal	Stubborn	Brave
0	1.780586	1.691429	0.494040	0.099603	0.484763	0.511163	0.499954	1.376814
1	0.512171	0.378084	0.064440	0.993995	0.052115	0.511163	0.071144	1.376814
2	0.089366	0.815866	-2.083559	0.099603	-1.245832	-1.115685	-0.357667	-1.605314
3	0.512171	0.378084	-2.083559	1.441191	0.484763	-0.708973	0.071144	0.098759
4	1.780586	-0.059698	0.923640	-1.241985	-0.813183	-0.708973	-2.072907	0.950795
...	...	...	...	...	...	...	...	...
729	-1.601855	-1.373043	-1.224360	-0.347593	0.484763	0.917875	0.928764	0.950795
730	-1.179050	-1.373043	0.494040	0.546799	-0.813183	-0.708973	0.499954	0.950795
731	0.089366	-0.059698	-1.653959	0.993995	0.484763	-1.115685	0.928764	0.098759
732	-0.756245	0.378084	-1.653959	-0.794789	0.052115	0.511163	1.357574	1.376814
733	-1.179050	0.378084	0.494040	-1.241985	0.917412	-0.708973	1.357574	0.098759

	Manipulative	Resourceful	Dismissive	Intelligent	Trusting	Loyal	Stubborn	Brave
0	1.780586	1.691429	0.494040	0.099603	0.484763	0.511163	0.499954	1.376814
1	0.512171	0.378084	0.064440	0.993995	0.052115	0.511163	0.071144	1.376814
2	0.089366	0.815866	-2.083559	0.099603	-1.245832	-1.115685	-0.357667	-1.605314
3	0.512171	0.378084	-2.083559	1.441191	0.484763	-0.708973	0.071144	0.098759
4	1.780586	-0.059698	0.923640	-1.241985	-0.813183	-0.708973	-2.072907	0.950795
...	...	...	...	...	...	...	...	...
729	-1.601855	-1.373043	-1.224360	-0.347593	0.484763	0.917875	0.928764	0.950795
730	-1.179050	-1.373043	0.494040	0.546799	-0.813183	-0.708973	0.499954	0.950795
731	0.089366	-0.059698	-1.653959	0.993995	0.484763	-1.115685	0.928764	0.098759
732	-0.756245	0.378084	-1.653959	-0.794789	0.052115	0.511163	1.357574	1.376814
733	-1.179050	0.378084	0.494040	-1.241985	0.917412	-0.708973	1.357574	0.098759

	Manipulative	Resourceful	Dismissive	Intelligent	Trusting	Loyal	Stubborn	Brave	House
0	1.780586	1.691429	0.494040	0.099603	0.484763	0.511163	0.499954	1.376814	Slytherin
1	0.512171	0.378084	0.064440	0.993995	0.052115	0.511163	0.071144	1.376814	Slytherin
2	0.089366	0.815866	-2.083559	0.099603	-1.245832	-1.115685	-0.357667	-1.605314	Slytherin
3	0.512171	0.378084	-2.083559	1.441191	0.484763	-0.708973	0.071144	0.098759	Slytherin
4	1.780586	-0.059698	0.923640	-1.241985	-0.813183	-0.708973	-2.072907	0.950795	Slytherin
...	...	...	...	...	...	...	...	...	...
729	-1.601855	-1.373043	-1.224360	-0.347593	0.484763	0.917875	0.928764	0.950795	Gryffindor
730	-1.179050	-1.373043	0.494040	0.546799	-0.813183	-0.708973	0.499954	0.950795	Gryffindor
731	0.089366	-0.059698	-1.653959	0.993995	0.484763	-1.115685	0.928764	0.098759	Gryffindor
732	-0.756245	0.378084	-1.653959	-0.794789	0.052115	0.511163	1.357574	1.376814	Gryffindor
733	-1.179050	0.378084	0.494040	-1.241985	0.917412	-0.708973	1.357574	0.098759	Gryffindor

	Manipulative	Resourceful	Dismissive	Intelligent	Trusting	Loyal	Stubborn	Brave	House
650	-0.333440	-0.497479	-0.365160	1.441191	1.350061	-0.708973	1.357574	0.950795	Gryffindor
479	-1.179050	0.378084	-0.365160	-1.241985	0.484763	0.104451	-0.357667	0.524777	Hufflepuff
271	-1.179050	0.815866	1.353239	0.546799	-1.245832	-1.929110	-1.644097	-1.179296	Ravenclaw
647	-0.333440	-0.059698	-1.653959	0.993995	0.052115	0.917875	1.357574	0.950795	Gryffindor
307	0.089366	0.378084	1.782839	0.099603	0.484763	-1.929110	-0.786477	1.376814	Ravenclaw

	Manipulative	Resourceful	Dismissive	Intelligent	Trusting	Loyal	Stubborn	Brave	House
389	-0.756245	-1.810824	-0.794760	-0.347593	0.917412	0.104451	0.499954	0.098759	Hufflepuff
131	0.512171	0.378084	0.923640	1.441191	-1.245832	-0.708973	-0.357667	-1.179296	Slytherin
657	0.512171	-2.248606	0.923640	-0.347593	0.917412	0.104451	0.928764	0.950795	Gryffindor
421	-0.756245	1.253648	-1.653959	-0.794789	0.484763	0.917875	0.071144	1.376814	Hufflepuff
160	0.089366	1.253648	-0.365160	-1.689181	-0.813183	-0.708973	-2.072907	-0.753278	Slytherin

k Nearest Hogwarts

1. Load data

1.1 Libraries

1.2 Data

2. Data partitioning

2.1 Training-Validation split

2.2 Normalisation

2.3 Generate normalised training and validation sets

3. kNN

3.1 k = 3

3.2 Training set confusion matrix

3.3 Validation set confusion matrix

4. New padawan

5. Easier set up

5.1 Recode into 2 classes

5.2 kNN

5.3 Confusion matrix

5.4 ROC

5.5 New padawan

	ID	Name	Gender	Race	Height	Publisher	Alignment	Weight	Manipulative	Resourceful	...	HouseID	House	STR	DEX	CON	INT	WIS	CHA	Level	HP
0	A001	A-Bomb	Male	Human	203.0	Marvel Comics	good	441.0	10	10	...	1	Slytherin	18	11	17	12	13	11	1	7
1	A002	Abe Sapien	Male	Icthyo Sapien	191.0	Dark Horse Comics	good	65.0	7	7	...	1	Slytherin	16	17	10	13	15	11	8	72
2	A004	Abomination	Male	Human / Radiation	203.0	Marvel Comics	bad	441.0	6	8	...	1	Slytherin	13	14	13	10	18	15	15	135
3	A009	Agent 13	Female	NaN	173.0	Marvel Comics	good	61.0	7	7	...	1	Slytherin	15	18	16	16	17	10	14	140
4	A015	Alex Mercer	Male	Human	NaN	Wildstorm	bad	NaN	10	6	...	1	Slytherin	14	17	13	12	10	11	9	72

	Variables
0	ID
1	Name
2	Gender
3	Race
4	Height
5	Publisher
6	Alignment
7	Weight
8	Manipulative
9	Resourceful
10	Dismissive
11	Intelligent
12	Trusting
13	Loyal
14	Stubborn
15	Brave
16	HouseID
17	House
18	STR
19	DEX
20	CON
21	INT
22	WIS
23	CHA
24	Level
25	HP

	Gryffindor	Hufflepuff	Ravenclaw	Slytherin	accuracy	macro avg	weighted avg
precision	0.936364	0.875000	0.880435	0.906780	0.9	0.899645	0.900206
recall	0.936364	0.929204	0.835052	0.891667	0.9	0.898071	0.900000
f1-score	0.936364	0.901288	0.857143	0.899160	0.9	0.898488	0.899744
support	110.000000	113.000000	97.000000	120.000000	0.9	440.000000	440.000000

	Gryffindor	Hufflepuff	Ravenclaw	Slytherin	accuracy	macro avg	weighted avg
precision	0.741176	0.783133	0.857143	0.871429	0.806122	0.813220	0.811180
recall	0.807692	0.855263	0.813559	0.753086	0.806122	0.807400	0.806122
f1-score	0.773006	0.817610	0.834783	0.807947	0.806122	0.808336	0.806560
support	78.000000	76.000000	59.000000	81.000000	0.806122	294.000000	294.000000

	0	1
0	1.000000	0.000000
1	1.000000	0.000000
2	1.000000	0.000000
3	1.000000	0.000000
4	0.714286	0.285714
...	...	...
435	0.857143	0.142857
436	0.857143	0.142857
437	0.142857	0.857143
438	0.714286	0.285714
439	1.000000	0.000000