If I'm Being Honest...¶

Data for demo

Back to Spell Book

1. Load data¶

1.1 Libraries¶

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

1.2 Load data¶

Sometimes we need to specify the encoding encoding = "ISO-8859-1" encoding = "utf-8"

In [2]:
songs_df = pd.read_csv("american_idol_songs_v8.csv", encoding = "ISO-8859-1")

songs_df.head()
Out[2]:
No Song_Title Artiste Song_Avg_Rtg Year Avg_Song_Age Advance Bottom Elimination Expectation Artiste_Rating
0 1 Stuff Like That There Bette Midler 95.0 1991 11.0 1 0 0 20.5 55.5
1 2 In A Dream Badlands 94.0 1991 14.0 1 0 0 24.4 94.0
2 3 Build Me Up Buttercup The Foundations 93.0 1969 34.0 1 0 0 26.2 93.0
3 4 Hemorrhage (In My Hands) Fuel 92.0 2000 6.0 1 0 0 29.4 92.0
4 5 Solitaire Carpenters 92.0 1974 29.0 1 0 0 25.2 68.5

1.3 Prepare data¶

In [3]:
songs_df.dtypes
Out[3]:
No                  int64
Song_Title         object
Artiste            object
Song_Avg_Rtg      float64
Year                int64
Avg_Song_Age      float64
Advance             int64
Bottom              int64
Elimination         int64
Expectation       float64
Artiste_Rating    float64
dtype: object

Set target variable as category

In [4]:
songs_df.Advance = songs_df.Advance.astype("category")

Filter the variables for the spell

In [5]:
pd.DataFrame(songs_df.columns.values, columns = ["Variables"])
Out[5]:
Variables
0 No
1 Song_Title
2 Artiste
3 Song_Avg_Rtg
4 Year
5 Avg_Song_Age
6 Advance
7 Bottom
8 Elimination
9 Expectation
10 Artiste_Rating

Filter data for spell

In [6]:
songs_df_filtered = songs_df.iloc[:, [3, 5, 9, 10, 6]]
songs_df_filtered.head()
Out[6]:
Song_Avg_Rtg Avg_Song_Age Expectation Artiste_Rating Advance
0 95.0 11.0 20.5 55.5 1
1 94.0 14.0 24.4 94.0 1
2 93.0 34.0 26.2 93.0 1
3 92.0 6.0 29.4 92.0 1
4 92.0 29.0 25.2 68.5 1
In [7]:
songs_df_filtered.dtypes
Out[7]:
Song_Avg_Rtg       float64
Avg_Song_Age       float64
Expectation        float64
Artiste_Rating     float64
Advance           category
dtype: object
In [8]:
songs_df_filtered["Advance"].value_counts()
Out[8]:
1    1339
0     287
Name: Advance, dtype: int64
In [9]:
songs_df_filtered["Advance"].value_counts()[1]
Out[9]:
1339

1.4 Explore data¶

In [10]:
import plotly.express as px
fig_1 = px.scatter_3d(songs_df_filtered,
                      x = "Song_Avg_Rtg",
                      y = "Avg_Song_Age",
                      z = "Artiste_Rating",
                      color = "Advance",
                      symbol = "Advance",
                      category_orders = {"Advance": [1,0]},
                      labels = {"Song_Avg_Rtg": "Average Song Rating",
                                "Avg_Song_Age": "Average Song Age",
                                "Artiste_Rating": "Artiste Rating"},
                      title = "Characteristics of Songs that Advance")
fig_1.show()

2. Training-Validation Split¶

In [11]:
X = songs_df_filtered.drop(columns = ["Advance"])
y = songs_df_filtered["Advance"].astype("category")
In [12]:
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size = 0.3, random_state = 666)
In [13]:
train_X.head()
Out[13]:
Song_Avg_Rtg Avg_Song_Age Expectation Artiste_Rating
699 50.0 43.0 13.000 50.0
916 42.0 29.0 -14.800 42.0
1343 16.0 20.5 -0.637 14.7
868 44.0 46.0 -11.700 54.0
645 52.5 4.3 2.800 52.5
In [14]:
train_y.head()
Out[14]:
699     1
916     1
1343    0
868     1
645     1
Name: Advance, dtype: category
Categories (2, int64): [0, 1]
In [15]:
valid_X.head()
Out[15]:
Song_Avg_Rtg Avg_Song_Age Expectation Artiste_Rating
67 82.0 51.0 18.3 82.0
348 65.0 2.0 29.5 48.0
1396 10.0 5.0 -32.9 10.0
356 65.0 23.0 4.5 74.8
768 47.5 26.0 -14.7 64.9
In [16]:
valid_y.head()
Out[16]:
67      1
348     1
1396    0
356     1
768     0
Name: Advance, dtype: category
Categories (2, int64): [0, 1]
In [17]:
len(train_X)
Out[17]:
1138
In [18]:
len(train_y)
Out[18]:
1138
In [19]:
len(valid_X)
Out[19]:
488
In [20]:
len(valid_y)
Out[20]:
488

3. Logistic Regression¶

In [21]:
logreg = LogisticRegression()

# fit the model with data
logreg_model = logreg.fit(train_X, train_y)
In [22]:
print("The intercept is", logreg_model.intercept_)
print("The coefficients are", logreg_model.coef_)

coef_df = pd.DataFrame({"Coefficient": logreg_model.coef_[0]}, index = train_X.columns)
coef_df
The intercept is [-0.55674839]
The coefficients are [[ 0.0432835   0.0155768   0.00112524 -0.00348794]]
Out[22]:
Coefficient
Song_Avg_Rtg 0.043284
Avg_Song_Age 0.015577
Expectation 0.001125
Artiste_Rating -0.003488

3.1 Predictions¶

In [23]:
train_y_pred = logreg_model.predict(train_X)
train_y_pred[0:9]
Out[23]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)
In [24]:
valid_y_pred = logreg_model.predict(valid_X)
valid_y_pred[0:9]
Out[24]:
array([1, 1, 0, 1, 1, 1, 1, 1, 1], dtype=int64)

3.2 Probabilities¶

In [25]:
train_y_pred_prob = logreg_model.predict_proba(train_X)
train_y_pred_prob[0:9]
Out[25]:
array([[0.10741132, 0.89258868],
       [0.17512397, 0.82487603],
       [0.40056235, 0.59943765],
       [0.13438348, 0.86561652],
       [0.1676116 , 0.8323884 ],
       [0.10212483, 0.89787517],
       [0.09516593, 0.90483407],
       [0.15540659, 0.84459341],
       [0.17910164, 0.82089836]])
In [26]:
valid_y_pred_prob = logreg_model.predict_proba(valid_X)
valid_y_pred_prob[0:9]
Out[26]:
array([[0.02870674, 0.97129326],
       [0.10399469, 0.89600531],
       [0.52944816, 0.47055184],
       [0.08634425, 0.91365575],
       [0.15958667, 0.84041333],
       [0.30933336, 0.69066664],
       [0.12642922, 0.87357078],
       [0.04267718, 0.95732282],
       [0.09931677, 0.90068323]])

4. Model Evaluation¶

In [27]:
from sklearn.metrics import confusion_matrix, accuracy_score

confusion_matrix_train = confusion_matrix(train_y, train_y_pred)
confusion_matrix_train
Out[27]:
array([[ 16, 184],
       [  4, 934]], dtype=int64)
In [28]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

confusion_matrix_train_display = ConfusionMatrixDisplay(confusion_matrix_train, display_labels = logreg_model.classes_)
confusion_matrix_train_display.plot()
plt.grid(False)
In [29]:
accuracy_train = accuracy_score(train_y, train_y_pred)
accuracy_train
Out[29]:
0.8347978910369068
In [30]:
from sklearn.metrics import classification_report

print(classification_report(train_y, train_y_pred))
              precision    recall  f1-score   support

           0       0.80      0.08      0.15       200
           1       0.84      1.00      0.91       938

    accuracy                           0.83      1138
   macro avg       0.82      0.54      0.53      1138
weighted avg       0.83      0.83      0.77      1138

In [31]:
# from sklearn.metrics import confusion_matrix, accuracy_score

confusion_matrix_valid = confusion_matrix(valid_y, valid_y_pred)
confusion_matrix_valid
Out[31]:
array([[  8,  79],
       [  5, 396]], dtype=int64)
In [32]:
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt

confusion_matrix_valid_display = ConfusionMatrixDisplay(confusion_matrix_valid, display_labels = logreg_model.classes_)
confusion_matrix_valid_display.plot()
plt.grid(False)
In [33]:
# from sklearn.metrics import classification_report

print(classification_report(valid_y, valid_y_pred))
              precision    recall  f1-score   support

           0       0.62      0.09      0.16        87
           1       0.83      0.99      0.90       401

    accuracy                           0.83       488
   macro avg       0.72      0.54      0.53       488
weighted avg       0.79      0.83      0.77       488

In [34]:
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
In [35]:
fpr1, tpr1, thresh1 = roc_curve(valid_y, valid_y_pred_prob[:,1], pos_label = 1)
In [36]:
import matplotlib.pyplot as plt
plt.style.use("seaborn")

plt.plot(fpr1, tpr1, linestyle = '-', color = "red", label = "Songs to Advance")

# roc curve for tpr = fpr (random line) 
random_probs = [0 for i in range(len(valid_y))]

p_fpr, p_tpr, _ = roc_curve(valid_y, random_probs, pos_label = 1)

plt.plot(p_fpr, p_tpr, linestyle = '--', color = "green", label = "Random Advancement")

# If desired
plt.legend()

plt.title("Songs to Advance ROC")

plt.xlabel("False Positive Rate")

plt.ylabel("True Positive Rate")

# to save the plot
# plt.savefig("whatever_name",dpi = 300)
Out[36]:
Text(0, 0.5, 'True Positive Rate')
In [37]:
from sklearn.metrics import roc_auc_score
In [38]:
auc = roc_auc_score(valid_y, valid_y_pred_prob[:,1])
auc
Out[38]:
0.7319918594318802

5. New Songs¶

New songs

In [39]:
new_songs_df = pd.read_csv("new_songs_v2.csv", encoding = "ISO-8859-1")

new_songs_df
Out[39]:
No Song_Title Artiste Song_Avg_Rtg Year Avg_Song_Age Advance Bottom Elimination Expectation Artiste_Rating Comments
0 6661 Walk With Me In Hell Lamb of God 96 2004 19 NaN NaN NaN 42 90 Classic song from a legendary band
1 6662 The Watcher Arch Enemy 90 2022 1 NaN NaN NaN 36 90 Fantastic song from a legendary band
2 6663 Frantic Metallica 6 2003 20 NaN NaN NaN 2 100 Zzz song from a legendary band
In [40]:
new_songs_df_filtered = new_songs_df.iloc[:, [3, 5, 9, 10]]
new_songs_df_filtered
Out[40]:
Song_Avg_Rtg Avg_Song_Age Expectation Artiste_Rating
0 96 19 42 90
1 90 1 36 90
2 6 20 2 100
In [41]:
new_songs_prediction = logreg_model.predict(new_songs_df_filtered)
new_songs_prediction
Out[41]:
array([1, 1, 0], dtype=int64)
In [42]:
new_songs_prediction_prob = logreg_model.predict_proba(new_songs_df_filtered)
new_songs_prediction_prob
Out[42]:
array([[0.02588734, 0.97411266],
       [0.04390043, 0.95609957],
       [0.58225918, 0.41774082]])
In [43]:
new_songs_prediction_df = pd.DataFrame(new_songs_prediction, columns = ["Prediction"])
new_songs_prediction_df
Out[43]:
Prediction
0 1
1 1
2 0
In [44]:
pd.concat((new_songs_df, new_songs_prediction_df), axis = 1)
Out[44]:
No Song_Title Artiste Song_Avg_Rtg Year Avg_Song_Age Advance Bottom Elimination Expectation Artiste_Rating Comments Prediction
0 6661 Walk With Me In Hell Lamb of God 96 2004 19 NaN NaN NaN 42 90 Classic song from a legendary band 1
1 6662 The Watcher Arch Enemy 90 2022 1 NaN NaN NaN 36 90 Fantastic song from a legendary band 1
2 6663 Frantic Metallica 6 2003 20 NaN NaN NaN 2 100 Zzz song from a legendary band 0

6. Multicollinearity¶

In [45]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_df = pd.DataFrame()

vif_df["features"] = train_X.columns
vif_df["VIF"] = [variance_inflation_factor(train_X.values, i) for i in range(train_X.shape[1])]

print(vif_df)
         features        VIF
0    Song_Avg_Rtg  16.706265
1    Avg_Song_Age   2.483684
2     Expectation   1.350685
3  Artiste_Rating  16.696281
In [46]:
train_X.corr()
Out[46]:
Song_Avg_Rtg Avg_Song_Age Expectation Artiste_Rating
Song_Avg_Rtg 1.000000 0.106034 0.710542 0.598677
Avg_Song_Age 0.106034 1.000000 0.053974 0.176771
Expectation 0.710542 0.053974 1.000000 0.400120
Artiste_Rating 0.598677 0.176771 0.400120 1.000000

6.1. Modified Logistic Regression Model¶

In [47]:
train_X_1a = train_X.drop(columns = ["Artiste_Rating"])
train_X_1a.head()
Out[47]:
Song_Avg_Rtg Avg_Song_Age Expectation
699 50.0 43.0 13.000
916 42.0 29.0 -14.800
1343 16.0 20.5 -0.637
868 44.0 46.0 -11.700
645 52.5 4.3 2.800
In [48]:
train_y_1a = train_y.copy()
train_y_1a.head()
Out[48]:
699     1
916     1
1343    0
868     1
645     1
Name: Advance, dtype: category
Categories (2, int64): [0, 1]
In [49]:
valid_X_1a = valid_X.drop(columns = ["Artiste_Rating"])
valid_X_1a.head()
Out[49]:
Song_Avg_Rtg Avg_Song_Age Expectation
67 82.0 51.0 18.3
348 65.0 2.0 29.5
1396 10.0 5.0 -32.9
356 65.0 23.0 4.5
768 47.5 26.0 -14.7
In [50]:
valid_y_1a = valid_y.copy()
valid_y_1a.head()
Out[50]:
67      1
348     1
1396    0
356     1
768     0
Name: Advance, dtype: category
Categories (2, int64): [0, 1]
In [51]:
logreg = LogisticRegression()

# fit the model with data
logreg_model_1a = logreg.fit(train_X_1a, train_y_1a)
In [52]:
print("The intercept is", logreg_model_1a.intercept_)
print("The coefficients are", logreg_model_1a.coef_)

coef_df_1a = pd.DataFrame({"Coefficient": logreg_model_1a.coef_[0]}, index = train_X_1a.columns)
coef_df_1a
The intercept is [-0.64169234]
The coefficients are [[0.04145236 0.01510463 0.00136229]]
Out[52]:
Coefficient
Song_Avg_Rtg 0.041452
Avg_Song_Age 0.015105
Expectation 0.001362

6.2 Predictions (Modified Model)¶

In [53]:
train_y_1a_pred = logreg_model_1a.predict(train_X_1a)
train_y_1a_pred[0:9]
Out[53]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)
In [54]:
valid_y_1a_pred = logreg_model_1a.predict(valid_X_1a)
valid_y_1a_pred[0:9]
Out[54]:
array([1, 1, 0, 1, 1, 1, 1, 1, 1], dtype=int64)

6.3 Probabilities (Modified Model)¶

In [55]:
train_y_pred_1a_prob = logreg_model_1a.predict_proba(train_X_1a)
train_y_pred_1a_prob[0:9]
Out[55]:
array([[0.10927828, 0.89072172],
       [0.17987727, 0.82012273],
       [0.41816214, 0.58183786],
       [0.13457655, 0.86542345],
       [0.16751801, 0.83248199],
       [0.09937026, 0.90062974],
       [0.10617309, 0.89382691],
       [0.15548014, 0.84451986],
       [0.1807682 , 0.8192318 ]])
In [56]:
valid_y_pred_1a_prob = logreg_model_1a.predict_proba(valid_X_1a)
valid_y_pred_1a_prob[0:9]
Out[56]:
array([[0.02785027, 0.97214973],
       [0.10687109, 0.89312891],
       [0.54895882, 0.45104118],
       [0.08269752, 0.91730248],
       [0.15446664, 0.84553336],
       [0.30828082, 0.69171918],
       [0.12509386, 0.87490614],
       [0.03705495, 0.96294505],
       [0.10126542, 0.89873458]])

6.4. Model Evaluation (Modified Model)¶

In [57]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_df_1a = pd.DataFrame()

vif_df_1a["features"] = train_X_1a.columns
vif_df_1a["VIF"] = [variance_inflation_factor(train_X_1a.values, i) for i in range(train_X_1a.shape[1])]

print(vif_df_1a)
       features       VIF
0  Song_Avg_Rtg  2.484737
1  Avg_Song_Age  2.364796
2   Expectation  1.109415
In [58]:
from sklearn.metrics import confusion_matrix, accuracy_score

confusion_matrix_train_1a = confusion_matrix(train_y_1a, train_y_1a_pred)
confusion_matrix_train_1a
Out[58]:
array([[ 14, 186],
       [  8, 930]], dtype=int64)
In [59]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

confusion_matrix_train_1a_display = ConfusionMatrixDisplay(confusion_matrix_train_1a, display_labels = logreg_model_1a.classes_)
confusion_matrix_train_1a_display.plot()
plt.grid(False)
In [60]:
accuracy_train_1a = accuracy_score(train_y_1a, train_y_1a_pred)
accuracy_train_1a
Out[60]:
0.8295254833040422
In [61]:
from sklearn.metrics import classification_report

print(classification_report(train_y_1a, train_y_1a_pred))
              precision    recall  f1-score   support

           0       0.64      0.07      0.13       200
           1       0.83      0.99      0.91       938

    accuracy                           0.83      1138
   macro avg       0.73      0.53      0.52      1138
weighted avg       0.80      0.83      0.77      1138

In [62]:
# from sklearn.metrics import confusion_matrix, accuracy_score

confusion_matrix_valid_1a = confusion_matrix(valid_y_1a, valid_y_1a_pred)
confusion_matrix_valid_1a
Out[62]:
array([[  8,  79],
       [  5, 396]], dtype=int64)
In [63]:
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt

confusion_matrix_valid_1a_display = ConfusionMatrixDisplay(confusion_matrix_valid_1a, display_labels = logreg_model_1a.classes_)
confusion_matrix_valid_1a_display.plot()
plt.grid(False)
In [64]:
# from sklearn.metrics import classification_report

print(classification_report(valid_y_1a, valid_y_1a_pred))
              precision    recall  f1-score   support

           0       0.62      0.09      0.16        87
           1       0.83      0.99      0.90       401

    accuracy                           0.83       488
   macro avg       0.72      0.54      0.53       488
weighted avg       0.79      0.83      0.77       488

In [65]:
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
In [66]:
fpr1a, tpr1a, thresh1a = roc_curve(valid_y_1a, valid_y_pred_1a_prob[:,1], pos_label = 1)
In [67]:
import matplotlib.pyplot as plt
plt.style.use("seaborn")

plt.plot(fpr1a, tpr1a, linestyle = '-', color = "red", label = "Songs to Advance")

# roc curve for tpr = fpr (random line) 
random_probs = [0 for i in range(len(valid_y_1a))]

p_fpr, p_tpr, _ = roc_curve(valid_y_1a, random_probs, pos_label = 1)

plt.plot(p_fpr, p_tpr, linestyle = '--', color = "green", label = "Random Advancement")

# If desired
plt.legend()

plt.title("Songs to Advance ROC (Modified Variables)")

plt.xlabel("False Positive Rate")

plt.ylabel("True Positive Rate")

# to save the plot
# plt.savefig("whatever_name",dpi = 300)
Out[67]:
Text(0, 0.5, 'True Positive Rate')
In [68]:
from sklearn.metrics import roc_auc_score
In [69]:
auc = roc_auc_score(valid_y_1a, valid_y_pred_1a_prob[:,1])
auc
Out[69]:
0.7337403617393299

6.5. New Songs (Modified Model)¶

New songs

In [70]:
# If not already
# new_songs_df = pd.read_csv("new_songs_v2.csv", encoding = "ISO-8859-1")
new_songs_df
Out[70]:
No Song_Title Artiste Song_Avg_Rtg Year Avg_Song_Age Advance Bottom Elimination Expectation Artiste_Rating Comments
0 6661 Walk With Me In Hell Lamb of God 96 2004 19 NaN NaN NaN 42 90 Classic song from a legendary band
1 6662 The Watcher Arch Enemy 90 2022 1 NaN NaN NaN 36 90 Fantastic song from a legendary band
2 6663 Frantic Metallica 6 2003 20 NaN NaN NaN 2 100 Zzz song from a legendary band
In [71]:
new_songs_df_filtered_1a = new_songs_df.iloc[:, [3, 5, 9]]
new_songs_df_filtered_1a
Out[71]:
Song_Avg_Rtg Avg_Song_Age Expectation
0 96 19 42
1 90 1 36
2 6 20 2
In [72]:
new_songs_prediction_1a = logreg_model_1a.predict(new_songs_df_filtered_1a)
new_songs_prediction_1a
Out[72]:
array([1, 1, 0], dtype=int64)
In [73]:
new_songs_prediction_prob_1a = logreg_model_1a.predict_proba(new_songs_df_filtered_1a)
new_songs_prediction_prob_1a
Out[73]:
array([[0.02455597, 0.97544403],
       [0.04096668, 0.95903332],
       [0.52202598, 0.47797402]])
In [74]:
new_songs_prediction_df_1a = pd.DataFrame(new_songs_prediction_1a, columns = ["Prediction"])
new_songs_prediction_df_1a
Out[74]:
Prediction
0 1
1 1
2 0
In [75]:
new_songs_prediction_prob_df_1a = pd.DataFrame(new_songs_prediction_prob_1a, columns = ["Fail Probability", "Advance Probability"])
new_songs_prediction_prob_df_1a
Out[75]:
Fail Probability Advance Probability
0 0.024556 0.975444
1 0.040967 0.959033
2 0.522026 0.477974
In [76]:
pd.concat((new_songs_df, new_songs_prediction_df_1a, new_songs_prediction_prob_df_1a), axis = 1)
Out[76]:
No Song_Title Artiste Song_Avg_Rtg Year Avg_Song_Age Advance Bottom Elimination Expectation Artiste_Rating Comments Prediction Fail Probability Advance Probability
0 6661 Walk With Me In Hell Lamb of God 96 2004 19 NaN NaN NaN 42 90 Classic song from a legendary band 1 0.024556 0.975444
1 6662 The Watcher Arch Enemy 90 2022 1 NaN NaN NaN 36 90 Fantastic song from a legendary band 1 0.040967 0.959033
2 6663 Frantic Metallica 6 2003 20 NaN NaN NaN 2 100 Zzz song from a legendary band 0 0.522026 0.477974

wrath2.jpg

7. Data Imbalance¶

7.1 Balance data¶

In [77]:
train_X.head()
Out[77]:
Song_Avg_Rtg Avg_Song_Age Expectation Artiste_Rating
699 50.0 43.0 13.000 50.0
916 42.0 29.0 -14.800 42.0
1343 16.0 20.5 -0.637 14.7
868 44.0 46.0 -11.700 54.0
645 52.5 4.3 2.800 52.5
In [78]:
import pandas as pd
train_df = pd.concat((train_X, train_y), axis = 1)
train_df.head()
train_df["Advance"].value_counts()
Out[78]:
1    938
0    200
Name: Advance, dtype: int64
In [79]:
# Class count
count_class_0, count_class_1 = train_df["Advance"].value_counts()
print(count_class_0)
print(count_class_1)
print(len(train_df))
938
200
1138
In [80]:
# Filter
df_class_0 = train_df[train_df["Advance"] == 0]
df_class_1 = train_df[train_df["Advance"] == 1]
In [81]:
df_class_0["Advance"].value_counts()
Out[81]:
0    200
1      0
Name: Advance, dtype: int64
In [82]:
df_class_1["Advance"].value_counts()
Out[82]:
1    938
0      0
Name: Advance, dtype: int64
In [83]:
len(train_df)*0.5
Out[83]:
569.0
In [84]:
df_class_0_over = df_class_0.sample(int(len(train_df)*0.5), replace = True)
df_class_0_over
df_class_0_over["Advance"].value_counts()
Out[84]:
0    569
1      0
Name: Advance, dtype: int64
In [85]:
df_class_1_under = df_class_1.sample(int(len(train_df)*0.5), replace = True)
df_class_1_under
df_class_1_under["Advance"].value_counts()
Out[85]:
1    569
0      0
Name: Advance, dtype: int64
In [86]:
train_df_balanced = pd.concat([df_class_0_over, df_class_1_under], axis = 0)
train_df_balanced.head()
Out[86]:
Song_Avg_Rtg Avg_Song_Age Expectation Artiste_Rating Advance
873 43.00 16.0 -17.000 43.00 0
1442 49.77 10.0 -0.637 75.20 0
419 62.00 2.0 -0.637 48.10 0
1418 4.00 32.0 -51.400 49.00 0
1535 49.77 0.0 -0.637 50.88 0
In [87]:
train_df_balanced["Advance"].value_counts()
Out[87]:
0    569
1    569
Name: Advance, dtype: int64
In [88]:
train_df_balanced.head()
Out[88]:
Song_Avg_Rtg Avg_Song_Age Expectation Artiste_Rating Advance
873 43.00 16.0 -17.000 43.00 0
1442 49.77 10.0 -0.637 75.20 0
419 62.00 2.0 -0.637 48.10 0
1418 4.00 32.0 -51.400 49.00 0
1535 49.77 0.0 -0.637 50.88 0
In [89]:
train_X_2 = train_df_balanced.drop(columns = ["Advance"])
train_y_2 = train_df_balanced["Advance"].astype("category")

7.2 Logistic Regression with Balanced Data¶

In [90]:
logreg_2 = LogisticRegression()

# fit the model with data
logreg_2_model = logreg_2.fit(train_X_2, train_y_2)
In [91]:
print("The intercept is", logreg_2_model.intercept_)
print("The coefficients are", logreg_2_model.coef_)

coef_df_2 = pd.DataFrame({"Coefficient": logreg_2_model.coef_[0]}, index = train_X_2.columns)
coef_df_2
The intercept is [-1.87418204]
The coefficients are [[ 0.0335975   0.02094234  0.00442436 -0.00086124]]
Out[91]:
Coefficient
Song_Avg_Rtg 0.033597
Avg_Song_Age 0.020942
Expectation 0.004424
Artiste_Rating -0.000861
In [92]:
train_y_pred_2 = logreg_2_model.predict(train_X_2)
train_y_pred_2[0:9]
Out[92]:
array([0, 0, 1, 0, 0, 0, 1, 0, 0], dtype=int64)
In [93]:
valid_y_pred_2 = logreg_2_model.predict(valid_X)
valid_y_pred_2[0:9]
Out[93]:
array([1, 1, 0, 1, 1, 0, 1, 1, 1], dtype=int64)
In [94]:
train_y_pred_prob_2 = logreg_2_model.predict_proba(train_X_2)
train_y_pred_prob_2[0:9]
Out[94]:
array([[0.55148188, 0.44851812],
       [0.51504425, 0.48495575],
       [0.44855688, 0.55144312],
       [0.79236661, 0.20763339],
       [0.56184988, 0.43815012],
       [0.54848761, 0.45151239],
       [0.47668553, 0.52331447],
       [0.5710294 , 0.4289706 ],
       [0.5152582 , 0.4847418 ]])
In [95]:
valid_y_pred_prob_2 = logreg_2_model.predict_proba(valid_X)
valid_y_pred_prob_2[0:9]
Out[95]:
array([[0.12355196, 0.87644804],
       [0.39156972, 0.60843028],
       [0.83028793, 0.16971207],
       [0.32151514, 0.67848486],
       [0.46374698, 0.53625302],
       [0.65058663, 0.34941337],
       [0.3772033 , 0.6227967 ],
       [0.18367156, 0.81632844],
       [0.37432433, 0.62567567]])

7.3 Model Evaluation with Balanced Data¶

In [96]:
#from sklearn.metrics import confusion_matrix, accuracy_score

confusion_matrix_train_2 = confusion_matrix(train_y_2, train_y_pred_2)
confusion_matrix_train_2
Out[96]:
array([[407, 162],
       [219, 350]], dtype=int64)
In [97]:
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
#import matplotlib.pyplot as plt

confusion_matrix_train_display_2 = ConfusionMatrixDisplay(confusion_matrix_train_2, display_labels = logreg_2_model.classes_)
confusion_matrix_train_display_2.plot()
plt.grid(False)
In [98]:
accuracy_train_2 = accuracy_score(train_y_2, train_y_pred_2)
accuracy_train_2
Out[98]:
0.6652021089630932
In [99]:
#from sklearn.metrics import classification_report

print(classification_report(train_y_2, train_y_pred_2))
              precision    recall  f1-score   support

           0       0.65      0.72      0.68       569
           1       0.68      0.62      0.65       569

    accuracy                           0.67      1138
   macro avg       0.67      0.67      0.66      1138
weighted avg       0.67      0.67      0.66      1138

In [100]:
# from sklearn.metrics import confusion_matrix, accuracy_score

confusion_matrix_valid_2 = confusion_matrix(valid_y, valid_y_pred_2)
confusion_matrix_valid_2
Out[100]:
array([[ 67,  20],
       [148, 253]], dtype=int64)
In [101]:
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt

confusion_matrix_valid_display_2 = ConfusionMatrixDisplay(confusion_matrix_valid_2, display_labels = logreg_2_model.classes_)
confusion_matrix_valid_display_2.plot()
plt.grid(False)
In [102]:
# from sklearn.metrics import classification_report

print(classification_report(valid_y, valid_y_pred_2))
              precision    recall  f1-score   support

           0       0.31      0.77      0.44        87
           1       0.93      0.63      0.75       401

    accuracy                           0.66       488
   macro avg       0.62      0.70      0.60       488
weighted avg       0.82      0.66      0.70       488

In [103]:
#from sklearn import metrics
#import matplotlib.pyplot as plt
#from sklearn.metrics import roc_curve
In [104]:
fpr1, tpr1, thresh1 = roc_curve(valid_y, valid_y_pred_prob_2[:,1], pos_label = 1)
In [105]:
import matplotlib.pyplot as plt
plt.style.use("seaborn")

plt.plot(fpr1, tpr1, linestyle = '-', color = "red", label = "Songs to Advance")

# roc curve for tpr = fpr (random line) 
random_probs = [0 for i in range(len(valid_y))]

p_fpr, p_tpr, _ = roc_curve(valid_y, random_probs, pos_label = 1)

plt.plot(p_fpr, p_tpr, linestyle = '--', color = "green", label = "Random Advancement")

# If desired
plt.legend()

plt.title("Songs to Advance ROC (Balanced)")

plt.xlabel("False Positive Rate")

plt.ylabel("True Positive Rate")

# to save the plot
# plt.savefig("whatever_name",dpi = 300)
Out[105]:
Text(0, 0.5, 'True Positive Rate')
In [106]:
#from sklearn.metrics import roc_auc_score
In [107]:
auc_2 = roc_auc_score(valid_y, valid_y_pred_prob_2[:,1])
auc_2
Out[107]:
0.736664086909164

7.4 Prediction with Balanced Data Model¶

In [108]:
new_songs_prediction_2 = logreg_2_model.predict(new_songs_df_filtered)
new_songs_prediction_2
Out[108]:
array([1, 1, 0], dtype=int64)
In [109]:
new_songs_prediction_prob_2 = logreg_2_model.predict_proba(new_songs_df_filtered)
new_songs_prediction_prob_2
Out[109]:
array([[0.1350072 , 0.8649928 ],
       [0.22230228, 0.77769772],
       [0.79101029, 0.20898971]])

Be careful about the order

In [110]:
new_songs_prediction_prob_df_2 = pd.DataFrame(new_songs_prediction_prob_2, columns = ["Fail Probability", "Advance Probability"])
new_songs_prediction_prob_df_2
Out[110]:
Fail Probability Advance Probability
0 0.135007 0.864993
1 0.222302 0.777698
2 0.791010 0.208990
In [111]:
new_songs_prediction_df_2 = pd.DataFrame(new_songs_prediction_2, columns = ["Prediction"])
new_songs_prediction_df_2
Out[111]:
Prediction
0 1
1 1
2 0
In [112]:
pd.concat((new_songs_df, new_songs_prediction_prob_df_2, new_songs_prediction_df_2), axis = 1)
Out[112]:
No Song_Title Artiste Song_Avg_Rtg Year Avg_Song_Age Advance Bottom Elimination Expectation Artiste_Rating Comments Fail Probability Advance Probability Prediction
0 6661 Walk With Me In Hell Lamb of God 96 2004 19 NaN NaN NaN 42 90 Classic song from a legendary band 0.135007 0.864993 1
1 6662 The Watcher Arch Enemy 90 2022 1 NaN NaN NaN 36 90 Fantastic song from a legendary band 0.222302 0.777698 1
2 6663 Frantic Metallica 6 2003 20 NaN NaN NaN 2 100 Zzz song from a legendary band 0.791010 0.208990 0

8. More Stuff¶

8.1 Multicollinearity (again)¶

In [113]:
from statsmodels.stats.outliers_influence import variance_inflation_factor 

Create VIF dataframe, then calculate VIF for each variable

In [114]:
vif_data = pd.DataFrame() 
vif_data["feature"] = train_X_2.columns 
  
vif_data["VIF"] = [variance_inflation_factor(train_X_2.values, i) 
                          for i in range(len(train_X_2.columns))] 
  
vif_data
Out[114]:
feature VIF
0 Song_Avg_Rtg 13.957249
1 Avg_Song_Age 2.285344
2 Expectation 1.353987
3 Artiste_Rating 14.626662
In [115]:
train_X_2.corr()
Out[115]:
Song_Avg_Rtg Avg_Song_Age Expectation Artiste_Rating
Song_Avg_Rtg 1.000000 0.136549 0.657376 0.583989
Avg_Song_Age 0.136549 1.000000 0.081988 0.178075
Expectation 0.657376 0.081988 1.000000 0.331929
Artiste_Rating 0.583989 0.178075 0.331929 1.000000

Remove artiste rating

In [116]:
train_X_2.head()
Out[116]:
Song_Avg_Rtg Avg_Song_Age Expectation Artiste_Rating
873 43.00 16.0 -17.000 43.00
1442 49.77 10.0 -0.637 75.20
419 62.00 2.0 -0.637 48.10
1418 4.00 32.0 -51.400 49.00
1535 49.77 0.0 -0.637 50.88
In [117]:
train_X_3 = train_X_2.iloc[:,0:3]
train_X_3.head()
Out[117]:
Song_Avg_Rtg Avg_Song_Age Expectation
873 43.00 16.0 -17.000
1442 49.77 10.0 -0.637
419 62.00 2.0 -0.637
1418 4.00 32.0 -51.400
1535 49.77 0.0 -0.637
In [118]:
vif_data_3 = pd.DataFrame() 
vif_data_3["feature"] = train_X_3.columns 
  
vif_data_3["VIF"] = [variance_inflation_factor(train_X_3.values, i) 
                          for i in range(len(train_X_3.columns))] 
  
vif_data_3
Out[118]:
feature VIF
0 Song_Avg_Rtg 2.167299
1 Avg_Song_Age 2.182946
2 Expectation 1.044139

8.2 Another Model¶

In [119]:
train_y_3 = train_y_2
valid_X_3 = valid_X.iloc[:, 0:3]
valid_y_3 = valid_y
In [120]:
logreg_3 = LogisticRegression(penalty = "l2", C = 666)

# fit the model with data
logreg_3_model = logreg_3.fit(train_X_3, train_y_3)
In [121]:
print("The intercept is", logreg_3_model.intercept_)
print("The coefficients are", logreg_3_model.coef_)

coef_df_3 = pd.DataFrame({"Coefficient": logreg_3_model.coef_[0]}, index = train_X_3.columns)
coef_df_3
The intercept is [-1.89646513]
The coefficients are [[0.0331629  0.02086211 0.00450225]]
Out[121]:
Coefficient
Song_Avg_Rtg 0.033163
Avg_Song_Age 0.020862
Expectation 0.004502

8.3 Model Evaluation with New Model¶

In [122]:
train_y_pred_3 = logreg_3_model.predict(train_X_3)
train_y_pred_3[0:9]
Out[122]:
array([0, 0, 1, 0, 0, 0, 1, 0, 0], dtype=int64)
In [123]:
valid_y_pred_3 = logreg_3_model.predict(valid_X_3)
valid_y_pred_3[0:9]
Out[123]:
array([1, 1, 0, 1, 1, 0, 1, 1, 1], dtype=int64)
In [124]:
train_y_pred_prob_3 = logreg_3_model.predict_proba(train_X_3)
train_y_pred_prob_3[0:9]
Out[124]:
array([[0.55310029, 0.44689971],
       [0.51004731, 0.48995269],
       [0.45053962, 0.54946038],
       [0.79045028, 0.20954972],
       [0.561885  , 0.438115  ],
       [0.54642327, 0.45357673],
       [0.48260282, 0.51739718],
       [0.56999484, 0.43000516],
       [0.51455868, 0.48544132]])
In [125]:
valid_y_pred_prob_3 = logreg_3_model.predict_proba(valid_X_3)
valid_y_pred_prob_3[0:9]
Out[125]:
array([[0.12246934, 0.87753066],
       [0.39325181, 0.60674819],
       [0.83322383, 0.16677617],
       [0.31881743, 0.68118257],
       [0.46132636, 0.53867364],
       [0.6503909 , 0.3496091 ],
       [0.37628605, 0.62371395],
       [0.17806823, 0.82193177],
       [0.3754707 , 0.6245293 ]])
In [126]:
#from sklearn.metrics import confusion_matrix, accuracy_score

confusion_matrix_train_3 = confusion_matrix(train_y_3, train_y_pred_3)
confusion_matrix_train_3
Out[126]:
array([[407, 162],
       [218, 351]], dtype=int64)
In [127]:
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
#import matplotlib.pyplot as plt

confusion_matrix_train_display_3 = ConfusionMatrixDisplay(confusion_matrix_train_3, 
                                                          display_labels = logreg_3_model.classes_)
confusion_matrix_train_display_3.plot()
plt.grid(False)
In [128]:
accuracy_train_3 = accuracy_score(train_y_3, train_y_pred_3)
accuracy_train_3
Out[128]:
0.6660808435852372
In [129]:
#from sklearn.metrics import classification_report

print(classification_report(train_y_3, train_y_pred_3))
              precision    recall  f1-score   support

           0       0.65      0.72      0.68       569
           1       0.68      0.62      0.65       569

    accuracy                           0.67      1138
   macro avg       0.67      0.67      0.67      1138
weighted avg       0.67      0.67      0.67      1138

In [130]:
# from sklearn.metrics import confusion_matrix, accuracy_score

confusion_matrix_valid_3 = confusion_matrix(valid_y_3, valid_y_pred_3)
confusion_matrix_valid_3
Out[130]:
array([[ 66,  21],
       [149, 252]], dtype=int64)
In [131]:
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt

confusion_matrix_valid_display_3 = ConfusionMatrixDisplay(confusion_matrix_valid_3, 
                                                          display_labels = logreg_3_model.classes_)
confusion_matrix_valid_display_3.plot()
plt.grid(False)
In [132]:
# from sklearn.metrics import classification_report

print(classification_report(valid_y_3, valid_y_pred_3))
              precision    recall  f1-score   support

           0       0.31      0.76      0.44        87
           1       0.92      0.63      0.75       401

    accuracy                           0.65       488
   macro avg       0.62      0.69      0.59       488
weighted avg       0.81      0.65      0.69       488

In [133]:
#from sklearn import metrics
#import matplotlib.pyplot as plt
#from sklearn.metrics import roc_curve
In [134]:
#Get the fpr, tpr and thresholds (i.e. cutoffs)
fpr1, tpr1, thresh1 = roc_curve(valid_y_3, valid_y_pred_prob_3[:,1], pos_label = 1)
In [135]:
import matplotlib.pyplot as plt
plt.style.use("seaborn")

plt.plot(fpr1, tpr1, linestyle = '-', color = "blue", label = "Songs to Advance")

# roc curve for tpr = fpr (random line) 
random_probs = [0 for i in range(len(valid_y_3))]

p_fpr, p_tpr, _ = roc_curve(valid_y, random_probs, pos_label = 1)

plt.plot(p_fpr, p_tpr, linestyle = '--', color = "green", label = "Random Advancement")

# If desired
plt.legend()

plt.title("Songs to Advance ROC (Balanced) with Regularisation")

plt.xlabel("False Positive Rate")

plt.ylabel("True Positive Rate")

# to save the plot
# plt.savefig("whatever_name",dpi = 300)
Out[135]:
Text(0, 0.5, 'True Positive Rate')
In [136]:
#from sklearn.metrics import roc_auc_score
In [137]:
auc_3 = roc_auc_score(valid_y_3, valid_y_pred_prob_3[:,1])
auc_3
Out[137]:
0.737409350187749

8.4 Optimum Cutoff¶

In [138]:
# Get the optimal index position in the array
optimal_index = np.argmax(tpr1 - fpr1)
print(optimal_index)
50
In [139]:
optimal_cutoff = thresh1[optimal_index]
optimal_cutoff
Out[139]:
0.48131717567164345
In [140]:
#Thresh (threshold) = cutoff = [0,1] because it is the probability threshold. 
#But scikit learn adds +1 to the last number in the threshold array to cover the full range [0, 1]. 
thresh1[0:10]
Out[140]:
array([1.9208441 , 0.9208441 , 0.87259103, 0.87114783, 0.81221349,
       0.81179973, 0.777871  , 0.77731994, 0.76689277, 0.76640133])

New confusion matrix for training set

In [141]:
train_y_pred_prob_3a = (train_y_pred_prob_3[:, 1] > optimal_cutoff)
confusion_matrix_train_3a = confusion_matrix(train_y_3, train_y_pred_prob_3a)
confusion_matrix_train_3a
Out[141]:
array([[370, 199],
       [202, 367]], dtype=int64)
In [142]:
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
#import matplotlib.pyplot as plt

confusion_matrix_train_display_3a = ConfusionMatrixDisplay(confusion_matrix_train_3a, 
                                                          display_labels = logreg_3_model.classes_)
confusion_matrix_train_display_3a.plot()
plt.grid(False)
In [143]:
def optimum_cutoff_pred_fn(prob):
    if prob > optimal_cutoff:
        return 1
    elif prob <= optimal_cutoff:
        return 0

train_y_pred_prob_3a_df = pd.DataFrame(train_y_pred_prob_3a, columns = ["train_prob_3a"])
train_y_pred_prob_3a_df.head()

train_y_pred_3a = train_y_pred_prob_3a_df["train_prob_3a"].apply(optimum_cutoff_pred_fn)
print(classification_report(train_y_3, train_y_pred_3a))
              precision    recall  f1-score   support

           0       0.65      0.65      0.65       569
           1       0.65      0.64      0.65       569

    accuracy                           0.65      1138
   macro avg       0.65      0.65      0.65      1138
weighted avg       0.65      0.65      0.65      1138

New confusion matrix for validation set

In [144]:
valid_y_pred_prob_3a = (valid_y_pred_prob_3[:, 1] > optimal_cutoff)
confusion_matrix_valid_3a = confusion_matrix(valid_y_3, valid_y_pred_prob_3a)
confusion_matrix_valid_3a
Out[144]:
array([[ 65,  22],
       [133, 268]], dtype=int64)
In [145]:
#from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
#import matplotlib.pyplot as plt

confusion_matrix_valid_display_3a = ConfusionMatrixDisplay(confusion_matrix_valid_3a, 
                                                          display_labels = logreg_3_model.classes_)
confusion_matrix_valid_display_3a.plot()
plt.grid(False)
In [146]:
valid_y_pred_prob_3a_df = pd.DataFrame(valid_y_pred_prob_3a, columns = ["valid_prob_3a"])
valid_y_pred_prob_3a_df.head()

valid_y_pred_3a = valid_y_pred_prob_3a_df["valid_prob_3a"].apply(optimum_cutoff_pred_fn)
print(classification_report(valid_y_3, valid_y_pred_3a))
              precision    recall  f1-score   support

           0       0.33      0.75      0.46        87
           1       0.92      0.67      0.78       401

    accuracy                           0.68       488
   macro avg       0.63      0.71      0.62       488
weighted avg       0.82      0.68      0.72       488

8.5 Prediction with New Model¶

In [147]:
new_songs_df_filtered_3 = new_songs_df_filtered.iloc[:, 0:3]
new_songs_prediction_3 = logreg_3_model.predict(new_songs_df_filtered_3)
new_songs_prediction_3
Out[147]:
array([1, 1, 0], dtype=int64)
In [148]:
new_songs_prediction_prob_3 = logreg_3_model.predict_proba(new_songs_df_filtered_3)
new_songs_prediction_prob_3
Out[148]:
array([[0.13323568, 0.86676432],
       [0.2190624 , 0.7809376 ],
       [0.78095512, 0.21904488]])

Be careful about the order

In [149]:
new_songs_prediction_prob_df_3 = pd.DataFrame(new_songs_prediction_prob_3, 
                                              columns = ["Fail Probability", "Advance Probability"])
new_songs_prediction_prob_df_3
Out[149]:
Fail Probability Advance Probability
0 0.133236 0.866764
1 0.219062 0.780938
2 0.780955 0.219045
In [150]:
new_songs_prediction_df_3 = pd.DataFrame(new_songs_prediction_3, columns = ["Prediction"])
new_songs_prediction_df_3
Out[150]:
Prediction
0 1
1 1
2 0

Prediction using the optimum cutoff No change in the predictions

In [151]:
new_songs_with_optimum_prediction = pd.concat((new_songs_df, new_songs_prediction_prob_df_3, new_songs_prediction_df_3), axis = 1)
new_songs_with_optimum_prediction
Out[151]:
No Song_Title Artiste Song_Avg_Rtg Year Avg_Song_Age Advance Bottom Elimination Expectation Artiste_Rating Comments Fail Probability Advance Probability Prediction
0 6661 Walk With Me In Hell Lamb of God 96 2004 19 NaN NaN NaN 42 90 Classic song from a legendary band 0.133236 0.866764 1
1 6662 The Watcher Arch Enemy 90 2022 1 NaN NaN NaN 36 90 Fantastic song from a legendary band 0.219062 0.780938 1
2 6663 Frantic Metallica 6 2003 20 NaN NaN NaN 2 100 Zzz song from a legendary band 0.780955 0.219045 0
In [152]:
new_songs_with_optimum_prediction["Optimum_Prediction"] = new_songs_with_optimum_prediction["Advance Probability"].apply(optimum_cutoff_pred_fn)
new_songs_with_optimum_prediction
Out[152]:
No Song_Title Artiste Song_Avg_Rtg Year Avg_Song_Age Advance Bottom Elimination Expectation Artiste_Rating Comments Fail Probability Advance Probability Prediction Optimum_Prediction
0 6661 Walk With Me In Hell Lamb of God 96 2004 19 NaN NaN NaN 42 90 Classic song from a legendary band 0.133236 0.866764 1 1
1 6662 The Watcher Arch Enemy 90 2022 1 NaN NaN NaN 36 90 Fantastic song from a legendary band 0.219062 0.780938 1 1
2 6663 Frantic Metallica 6 2003 20 NaN NaN NaN 2 100 Zzz song from a legendary band 0.780955 0.219045 0 0

The choice is clear :-)¶