Super Smash Parseltongue¶

smash_logo.png

Download data

Back to the Spell Book

1. Import¶

In [1]:
import pandas as pd
from pandas import DataFrame
In [2]:
smash_df = pd.read_csv("smash_3.csv")
smash_df.head()
Out[2]:
character game regular_fall fast_fall weight walk_speed run_speed full_hop_height short_hop_height double_hop_height before after expertise
0 mario Super Mario 1.50 2.400 98 1.155 1.760 36.33 17.54 36.33 3 8 Amateur
1 donkey kong Donkey Kong 1.63 2.608 127 1.365 1.873 34.00 17.30 35.50 8 10 Expert
2 link Zelda 1.60 3.040 104 1.247 1.534 27.80 13.38 29.00 10 7 Expert
3 samus Metroid 1.33 2.168 108 1.115 1.654 37.00 18.00 37.00 6 9 Expert
4 dark samus Metroid 1.33 2.168 108 1.115 1.654 37.00 18.00 37.00 2 5 Amateur

2. Description¶

In [3]:
smash_df.shape
Out[3]:
(72, 13)
In [4]:
len(smash_df["game"])
Out[4]:
72

Unique values.

In [5]:
smash_df["game"] = smash_df["game"].astype("category")
smash_df["game"].info
Out[5]:
<bound method Series.info of 0         Super Mario
1         Donkey Kong
2               Zelda
3             Metroid
4             Metroid
           ...       
67    Animal Crossing
68            Pokemon
69        Super Mario
70       Dragon Quest
71      Banjo Kazooie
Name: game, Length: 72, dtype: category
Categories (30, object): ['Animal Crossing', 'Banjo Kazooie', 'Bayonetta', 'Castlevania', ..., 'Super Mario', 'Wii', 'Xenoblade', 'Zelda']>
In [6]:
smash_df["game"].value_counts()
Out[6]:
Super Mario        10
Pokemon             7
Zelda               6
Fire Emblem         6
Wii                 4
Kirby               4
Metroid             4
Donkey Kong         3
Animal Crossing     2
Icarus              2
Castlevania         2
Street Fighter      2
Star Fox            2
F-Zero              2
Punch Out           1
Xenoblade           1
Bayonetta           1
Stack Up            1
Splatoon            1
Sonic               1
Earthbound          1
Nintendo            1
Mother              1
Banjo Kazooie       1
Metal Gear          1
Mega Man            1
Dragon Quest        1
Duck Hunt           1
Final Fantasy       1
Pikmin              1
Name: game, dtype: int64
In [7]:
smash_df["game"].value_counts(sort = True)
Out[7]:
Super Mario        10
Pokemon             7
Zelda               6
Fire Emblem         6
Wii                 4
Kirby               4
Metroid             4
Donkey Kong         3
Animal Crossing     2
Icarus              2
Castlevania         2
Street Fighter      2
Star Fox            2
F-Zero              2
Punch Out           1
Xenoblade           1
Bayonetta           1
Stack Up            1
Splatoon            1
Sonic               1
Earthbound          1
Nintendo            1
Mother              1
Banjo Kazooie       1
Metal Gear          1
Mega Man            1
Dragon Quest        1
Duck Hunt           1
Final Fantasy       1
Pikmin              1
Name: game, dtype: int64
In [8]:
smash_df["game"].value_counts(sort = True, normalize = True)
Out[8]:
Super Mario        0.138889
Pokemon            0.097222
Zelda              0.083333
Fire Emblem        0.083333
Wii                0.055556
Kirby              0.055556
Metroid            0.055556
Donkey Kong        0.041667
Animal Crossing    0.027778
Icarus             0.027778
Castlevania        0.027778
Street Fighter     0.027778
Star Fox           0.027778
F-Zero             0.027778
Punch Out          0.013889
Xenoblade          0.013889
Bayonetta          0.013889
Stack Up           0.013889
Splatoon           0.013889
Sonic              0.013889
Earthbound         0.013889
Nintendo           0.013889
Mother             0.013889
Banjo Kazooie      0.013889
Metal Gear         0.013889
Mega Man           0.013889
Dragon Quest       0.013889
Duck Hunt          0.013889
Final Fantasy      0.013889
Pikmin             0.013889
Name: game, dtype: float64
In [9]:
smash_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   character          72 non-null     object  
 1   game               72 non-null     category
 2   regular_fall       72 non-null     float64 
 3   fast_fall          72 non-null     float64 
 4   weight             72 non-null     int64   
 5   walk_speed         72 non-null     float64 
 6   run_speed          72 non-null     float64 
 7   full_hop_height    70 non-null     float64 
 8   short_hop_height   70 non-null     float64 
 9   double_hop_height  70 non-null     float64 
 10  before             72 non-null     int64   
 11  after              72 non-null     int64   
 12  expertise          72 non-null     object  
dtypes: category(1), float64(7), int64(3), object(2)
memory usage: 8.2+ KB
In [10]:
smash_df.describe()
Out[10]:
regular_fall fast_fall weight walk_speed run_speed full_hop_height short_hop_height double_hop_height before after
count 72.000000 72.000000 72.00000 72.000000 72.000000 70.000000 70.000000 70.000000 72.000000 72.000000
mean 1.600069 2.546306 96.25000 1.104171 1.806428 33.211864 16.027429 34.440000 6.319444 7.722222
std 0.222869 0.353823 13.93218 0.213718 0.391566 5.112580 2.302452 6.379117 2.582860 1.576430
min 0.980000 1.568000 62.00000 0.620000 1.180000 19.790000 11.260000 19.790000 2.000000 5.000000
25% 1.472500 2.240000 88.00000 0.924000 1.591250 30.885375 14.500000 30.782500 4.000000 7.000000
50% 1.620000 2.592000 95.50000 1.133000 1.722500 33.000000 16.225000 33.500000 6.000000 8.000000
75% 1.770000 2.820000 104.50000 1.259000 1.979750 35.575000 17.482500 36.832500 8.250000 9.000000
max 2.100000 3.360000 135.00000 1.575000 3.850000 50.510000 22.110000 57.350000 10.000000 10.000000
In [11]:
smash_df["expertise"] = smash_df["expertise"].astype("category")
In [12]:
smash_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   character          72 non-null     object  
 1   game               72 non-null     category
 2   regular_fall       72 non-null     float64 
 3   fast_fall          72 non-null     float64 
 4   weight             72 non-null     int64   
 5   walk_speed         72 non-null     float64 
 6   run_speed          72 non-null     float64 
 7   full_hop_height    70 non-null     float64 
 8   short_hop_height   70 non-null     float64 
 9   double_hop_height  70 non-null     float64 
 10  before             72 non-null     int64   
 11  after              72 non-null     int64   
 12  expertise          72 non-null     category
dtypes: category(2), float64(7), int64(3), object(1)
memory usage: 7.9+ KB
In [13]:
smash_df.describe(include = "category")
Out[13]:
game expertise
count 72 72
unique 30 2
top Super Mario Expert
freq 10 45

Rename columns.

In [14]:
smash_df = smash_df.rename(columns = {"full_hop_height": "FULL_HOP_HEIGHT_RENAME"})
smash_df.head()
Out[14]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
0 mario Super Mario 1.50 2.400 98 1.155 1.760 36.33 17.54 36.33 3 8 Amateur
1 donkey kong Donkey Kong 1.63 2.608 127 1.365 1.873 34.00 17.30 35.50 8 10 Expert
2 link Zelda 1.60 3.040 104 1.247 1.534 27.80 13.38 29.00 10 7 Expert
3 samus Metroid 1.33 2.168 108 1.115 1.654 37.00 18.00 37.00 6 9 Expert
4 dark samus Metroid 1.33 2.168 108 1.115 1.654 37.00 18.00 37.00 2 5 Amateur

3. Filtering¶

3.1 Single Criterion¶

In [15]:
zelda_df = smash_df[smash_df["game"] == "Zelda"]
zelda_df
Out[15]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
2 link Zelda 1.60 3.040 104 1.247 1.534 27.80 13.38 29.00 10 7 Expert
16 sheik Zelda 1.75 2.800 78 1.470 2.420 39.00 18.75 40.00 6 5 Expert
17 zelda Zelda 1.35 2.160 85 0.914 1.430 31.55 15.24 31.55 4 7 Amateur
22 young link Zelda 1.80 2.880 88 1.260 1.749 33.66 16.26 33.66 7 10 Expert
23 ganondorf Zelda 1.65 2.640 118 0.767 1.340 25.49 12.24 26.00 10 8 Expert
42 toon link Zelda 1.38 2.208 91 1.288 1.906 33.80 16.32 33.80 3 7 Amateur
In [16]:
zelda_df.shape
Out[16]:
(6, 13)
In [17]:
zelda_df.describe()
Out[17]:
regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after
count 6.000000 6.000000 6.000000 6.000000 6.000000 6.000000 6.00000 6.000000 6.000000 6.000000
mean 1.588333 2.621333 94.000000 1.157667 1.729833 31.883333 15.36500 32.335000 6.666667 7.333333
std 0.187127 0.362804 14.546477 0.262673 0.396977 4.798678 2.31917 4.785912 2.943920 1.632993
min 1.350000 2.160000 78.000000 0.767000 1.340000 25.490000 12.24000 26.000000 3.000000 5.000000
25% 1.435000 2.316000 85.750000 0.997250 1.456000 28.737500 13.84500 29.637500 4.500000 7.000000
50% 1.625000 2.720000 89.500000 1.253500 1.641500 32.605000 15.75000 32.605000 6.500000 7.000000
75% 1.725000 2.860000 100.750000 1.281000 1.866750 33.765000 16.30500 33.765000 9.250000 7.750000
max 1.800000 3.040000 118.000000 1.470000 2.420000 39.000000 18.75000 40.000000 10.000000 10.000000

Another way

In [18]:
zelda_df_2 = smash_df.query("game == 'Zelda'")
zelda_df_2
Out[18]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
2 link Zelda 1.60 3.040 104 1.247 1.534 27.80 13.38 29.00 10 7 Expert
16 sheik Zelda 1.75 2.800 78 1.470 2.420 39.00 18.75 40.00 6 5 Expert
17 zelda Zelda 1.35 2.160 85 0.914 1.430 31.55 15.24 31.55 4 7 Amateur
22 young link Zelda 1.80 2.880 88 1.260 1.749 33.66 16.26 33.66 7 10 Expert
23 ganondorf Zelda 1.65 2.640 118 0.767 1.340 25.49 12.24 26.00 10 8 Expert
42 toon link Zelda 1.38 2.208 91 1.288 1.906 33.80 16.32 33.80 3 7 Amateur
In [19]:
zelda_df_2.shape
Out[19]:
(6, 13)
In [20]:
zelda_df_2.describe()
Out[20]:
regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after
count 6.000000 6.000000 6.000000 6.000000 6.000000 6.000000 6.00000 6.000000 6.000000 6.000000
mean 1.588333 2.621333 94.000000 1.157667 1.729833 31.883333 15.36500 32.335000 6.666667 7.333333
std 0.187127 0.362804 14.546477 0.262673 0.396977 4.798678 2.31917 4.785912 2.943920 1.632993
min 1.350000 2.160000 78.000000 0.767000 1.340000 25.490000 12.24000 26.000000 3.000000 5.000000
25% 1.435000 2.316000 85.750000 0.997250 1.456000 28.737500 13.84500 29.637500 4.500000 7.000000
50% 1.625000 2.720000 89.500000 1.253500 1.641500 32.605000 15.75000 32.605000 6.500000 7.000000
75% 1.725000 2.860000 100.750000 1.281000 1.866750 33.765000 16.30500 33.765000 9.250000 7.750000
max 1.800000 3.040000 118.000000 1.470000 2.420000 39.000000 18.75000 40.000000 10.000000 10.000000
In [21]:
round(zelda_df["run_speed"].mean(),5)
Out[21]:
1.72983

Using a list.

In [22]:
values = ["Donkey Kong", "Animal Crossing"]
dk_ac_df = smash_df[smash_df["game"].isin(values)]
dk_ac_df
Out[22]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
1 donkey kong Donkey Kong 1.63 2.608 127 1.365 1.873 34.00 17.30 35.50 8 10 Expert
35 diddy kong Donkey Kong 1.75 2.800 90 1.313 2.006 41.21 19.90 41.21 10 9 Expert
44 villager Animal Crossing 1.32 2.112 92 1.092 1.397 32.50 15.69 32.50 8 9 Expert
66 king k rool Donkey Kong 1.70 2.720 133 0.903 1.485 33.00 13.50 32.61 3 8 Amateur
67 isabelle Animal Crossing 1.30 2.080 88 1.140 1.480 32.50 15.69 32.50 9 7 Expert

Another method

In [23]:
dk_ac_df_2 = smash_df.query("game == 'Donkey Kong' | game == 'Animal Crossing'")
dk_ac_df_2
Out[23]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
1 donkey kong Donkey Kong 1.63 2.608 127 1.365 1.873 34.00 17.30 35.50 8 10 Expert
35 diddy kong Donkey Kong 1.75 2.800 90 1.313 2.006 41.21 19.90 41.21 10 9 Expert
44 villager Animal Crossing 1.32 2.112 92 1.092 1.397 32.50 15.69 32.50 8 9 Expert
66 king k rool Donkey Kong 1.70 2.720 133 0.903 1.485 33.00 13.50 32.61 3 8 Amateur
67 isabelle Animal Crossing 1.30 2.080 88 1.140 1.480 32.50 15.69 32.50 9 7 Expert

3.2 Multiple Criteria¶

In [24]:
expert_metroid_df = smash_df[(smash_df["game"] == "Metroid") & (smash_df["expertise"] == "Expert")]
expert_metroid_df
Out[24]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
3 samus Metroid 1.33 2.168 108 1.115 1.654 37.0 18.00 37.0 6 9 Expert
31 zero suit samus Metroid 1.70 2.720 80 1.470 2.310 44.5 21.35 44.2 10 10 Expert
In [25]:
expert_metroid_df_2 = smash_df.query("game == 'Metroid' & expertise == 'Expert'")
expert_metroid_df_2                  
Out[25]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
3 samus Metroid 1.33 2.168 108 1.115 1.654 37.0 18.00 37.0 6 9 Expert
31 zero suit samus Metroid 1.70 2.720 80 1.470 2.310 44.5 21.35 44.2 10 10 Expert
In [26]:
fast_supermario_df = smash_df[(smash_df["game"] == "Super Mario") & (smash_df["run_speed"] > 2)]
fast_supermario_df
Out[26]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
5 yoshi Super Mario 1.29 2.064 104 1.208 2.046 36.09 14.43 51.56 5 9 Amateur
In [27]:
fast_supermario_df_2 = smash_df.query("game == 'Super Mario' & run_speed > 2")
fast_supermario_df_2
Out[27]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
5 yoshi Super Mario 1.29 2.064 104 1.208 2.046 36.09 14.43 51.56 5 9 Amateur

Another example.

In [28]:
fast_df = smash_df[(smash_df["fast_fall"] > 3) & (smash_df["run_speed"] > 2)]
fast_df
Out[28]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
7 fox Star Fox 2.10 3.36 77 1.523 2.402 35.0 16.40 37.0 6 8 Expert
47 little mac Punch Out 1.95 3.12 87 1.386 2.464 26.0 12.53 26.0 6 8 Expert
In [29]:
fast_df_2 = smash_df.query("fast_fall > 3 & run_speed > 2")
fast_df_2
Out[29]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
7 fox Star Fox 2.10 3.36 77 1.523 2.402 35.0 16.40 37.0 6 8 Expert
47 little mac Punch Out 1.95 3.12 87 1.386 2.464 26.0 12.53 26.0 6 8 Expert

3.3 Columns¶

Get variable names. If we already know which columns to filter, just use iloc.

In [30]:
smash_df.columns.values.tolist()
Out[30]:
['character',
 'game',
 'regular_fall',
 'fast_fall',
 'weight',
 'walk_speed',
 'run_speed',
 'FULL_HOP_HEIGHT_RENAME',
 'short_hop_height',
 'double_hop_height',
 'before',
 'after',
 'expertise']

Get indices.

In [31]:
index_dictionary = {}
for col in smash_df.columns:
    index_dictionary[col] = smash_df.columns.get_loc(col)
print(index_dictionary)
{'character': 0, 'game': 1, 'regular_fall': 2, 'fast_fall': 3, 'weight': 4, 'walk_speed': 5, 'run_speed': 6, 'FULL_HOP_HEIGHT_RENAME': 7, 'short_hop_height': 8, 'double_hop_height': 9, 'before': 10, 'after': 11, 'expertise': 12}

Convert to df

In [32]:
index_dictionary_df = pd.DataFrame(index_dictionary, index=[0])
index_dictionary_df
Out[32]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
0 0 1 2 3 4 5 6 7 8 9 10 11 12

Transpose

In [33]:
index_dictionary_df_transposed = index_dictionary_df.transpose()
index_dictionary_df_transposed
Out[33]:
0
character 0
game 1
regular_fall 2
fast_fall 3
weight 4
walk_speed 5
run_speed 6
FULL_HOP_HEIGHT_RENAME 7
short_hop_height 8
double_hop_height 9
before 10
after 11
expertise 12

If it is too long to display, try:

print(index_dictionary_df_transposed.to_string())

In [34]:
print(index_dictionary_df_transposed.to_string())
                         0
character                0
game                     1
regular_fall             2
fast_fall                3
weight                   4
walk_speed               5
run_speed                6
FULL_HOP_HEIGHT_RENAME   7
short_hop_height         8
double_hop_height        9
before                  10
after                   11
expertise               12
In [35]:
smash_df.iloc[:,3:8]
Out[35]:
fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME
0 2.400 98 1.155 1.760 36.33
1 2.608 127 1.365 1.873 34.00
2 3.040 104 1.247 1.534 27.80
3 2.168 108 1.115 1.654 37.00
4 2.168 108 1.115 1.654 37.00
... ... ... ... ... ...
67 2.080 88 1.140 1.480 32.50
68 2.816 116 0.620 1.180 31.60
69 2.730 112 0.760 1.720 37.40
70 2.512 101 0.980 1.840 NaN
71 2.816 106 1.060 2.180 NaN

72 rows × 5 columns

In [36]:
smash_df.iloc[:,[0, 3, 5, 7, 9]]
Out[36]:
character fast_fall walk_speed FULL_HOP_HEIGHT_RENAME double_hop_height
0 mario 2.400 1.155 36.33 36.33
1 donkey kong 2.608 1.365 34.00 35.50
2 link 3.040 1.247 27.80 29.00
3 samus 2.168 1.115 37.00 37.00
4 dark samus 2.168 1.115 37.00 37.00
... ... ... ... ... ...
67 isabelle 2.080 1.140 32.50 32.50
68 incineroar 2.816 0.620 31.60 32.80
69 piranha plant 2.730 0.760 37.40 38.52
70 hero 2.512 0.980 NaN NaN
71 banjo and kazooie 2.816 1.060 NaN NaN

72 rows × 5 columns

In [37]:
smash_df_1 = smash_df.iloc[:, [0, 5, 6, 12]]
smash_df_1
Out[37]:
character walk_speed run_speed expertise
0 mario 1.155 1.760 Amateur
1 donkey kong 1.365 1.873 Expert
2 link 1.247 1.534 Expert
3 samus 1.115 1.654 Expert
4 dark samus 1.115 1.654 Amateur
... ... ... ... ...
67 isabelle 1.140 1.480 Expert
68 incineroar 0.620 1.180 Expert
69 piranha plant 0.760 1.720 Expert
70 hero 0.980 1.840 Expert
71 banjo and kazooie 1.060 2.180 Expert

72 rows × 4 columns

In [38]:
smash_df_2 = smash_df.loc[:, "character":"weight"]
smash_df_2
Out[38]:
character game regular_fall fast_fall weight
0 mario Super Mario 1.50 2.400 98
1 donkey kong Donkey Kong 1.63 2.608 127
2 link Zelda 1.60 3.040 104
3 samus Metroid 1.33 2.168 108
4 dark samus Metroid 1.33 2.168 108
... ... ... ... ... ...
67 isabelle Animal Crossing 1.30 2.080 88
68 incineroar Pokemon 1.76 2.816 116
69 piranha plant Super Mario 1.95 2.730 112
70 hero Dragon Quest 1.57 2.512 101
71 banjo and kazooie Banjo Kazooie 1.76 2.816 106

72 rows × 5 columns

In [39]:
smash_df_3 = smash_df.loc[:, ["character", "game", "regular_fall", "fast_fall", "weight", "expertise", 
                              "run_speed", "FULL_HOP_HEIGHT_RENAME"]]
smash_df_3
Out[39]:
character game regular_fall fast_fall weight expertise run_speed FULL_HOP_HEIGHT_RENAME
0 mario Super Mario 1.50 2.400 98 Amateur 1.760 36.33
1 donkey kong Donkey Kong 1.63 2.608 127 Expert 1.873 34.00
2 link Zelda 1.60 3.040 104 Expert 1.534 27.80
3 samus Metroid 1.33 2.168 108 Expert 1.654 37.00
4 dark samus Metroid 1.33 2.168 108 Amateur 1.654 37.00
... ... ... ... ... ... ... ... ...
67 isabelle Animal Crossing 1.30 2.080 88 Expert 1.480 32.50
68 incineroar Pokemon 1.76 2.816 116 Expert 1.180 31.60
69 piranha plant Super Mario 1.95 2.730 112 Expert 1.720 37.40
70 hero Dragon Quest 1.57 2.512 101 Expert 1.840 NaN
71 banjo and kazooie Banjo Kazooie 1.76 2.816 106 Expert 2.180 NaN

72 rows × 8 columns

For more customised ranges, use numpy.

Notice ranges take 1 less.

In [40]:
import numpy as np
In [41]:
smash_df_4 = smash_df.iloc[:, np.r_[0, 5, 6:13]]
smash_df_4.head()
Out[41]:
character walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
0 mario 1.155 1.760 36.33 17.54 36.33 3 8 Amateur
1 donkey kong 1.365 1.873 34.00 17.30 35.50 8 10 Expert
2 link 1.247 1.534 27.80 13.38 29.00 10 7 Expert
3 samus 1.115 1.654 37.00 18.00 37.00 6 9 Expert
4 dark samus 1.115 1.654 37.00 18.00 37.00 2 5 Amateur

Notice ranges take 1 less.

In [42]:
smash_df_5 = smash_df.iloc[:, np.r_[0:5, 12, 6:8]]
smash_df_5.head()
Out[42]:
character game regular_fall fast_fall weight expertise run_speed FULL_HOP_HEIGHT_RENAME
0 mario Super Mario 1.50 2.400 98 Amateur 1.760 36.33
1 donkey kong Donkey Kong 1.63 2.608 127 Expert 1.873 34.00
2 link Zelda 1.60 3.040 104 Expert 1.534 27.80
3 samus Metroid 1.33 2.168 108 Expert 1.654 37.00
4 dark samus Metroid 1.33 2.168 108 Amateur 1.654 37.00

3.4 Rows¶

First 5 rows. Notice difference between loc and iloc

In [43]:
smash_df.loc[0:4]
Out[43]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
0 mario Super Mario 1.50 2.400 98 1.155 1.760 36.33 17.54 36.33 3 8 Amateur
1 donkey kong Donkey Kong 1.63 2.608 127 1.365 1.873 34.00 17.30 35.50 8 10 Expert
2 link Zelda 1.60 3.040 104 1.247 1.534 27.80 13.38 29.00 10 7 Expert
3 samus Metroid 1.33 2.168 108 1.115 1.654 37.00 18.00 37.00 6 9 Expert
4 dark samus Metroid 1.33 2.168 108 1.115 1.654 37.00 18.00 37.00 2 5 Amateur
In [44]:
smash_df.iloc[0:5]
Out[44]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
0 mario Super Mario 1.50 2.400 98 1.155 1.760 36.33 17.54 36.33 3 8 Amateur
1 donkey kong Donkey Kong 1.63 2.608 127 1.365 1.873 34.00 17.30 35.50 8 10 Expert
2 link Zelda 1.60 3.040 104 1.247 1.534 27.80 13.38 29.00 10 7 Expert
3 samus Metroid 1.33 2.168 108 1.115 1.654 37.00 18.00 37.00 6 9 Expert
4 dark samus Metroid 1.33 2.168 108 1.115 1.654 37.00 18.00 37.00 2 5 Amateur
In [45]:
smash_df.iloc[0:4]
Out[45]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
0 mario Super Mario 1.50 2.400 98 1.155 1.760 36.33 17.54 36.33 3 8 Amateur
1 donkey kong Donkey Kong 1.63 2.608 127 1.365 1.873 34.00 17.30 35.50 8 10 Expert
2 link Zelda 1.60 3.040 104 1.247 1.534 27.80 13.38 29.00 10 7 Expert
3 samus Metroid 1.33 2.168 108 1.115 1.654 37.00 18.00 37.00 6 9 Expert
In [46]:
smash_df.iloc[0:4, 5:8]
Out[46]:
walk_speed run_speed FULL_HOP_HEIGHT_RENAME
0 1.155 1.760 36.33
1 1.365 1.873 34.00
2 1.247 1.534 27.80
3 1.115 1.654 37.00
In [47]:
smash_df.iloc[0:4]["double_hop_height"]
Out[47]:
0    36.33
1    35.50
2    29.00
3    37.00
Name: double_hop_height, dtype: float64
In [48]:
smash_df.iloc[3,0:4]
Out[48]:
character         samus
game            Metroid
regular_fall       1.33
fast_fall         2.168
Name: 3, dtype: object

3.5 Concatenate¶

In [49]:
pd.concat((smash_df.iloc[:, 0:2], smash_df.iloc[:, 7:9]), axis = 1)
Out[49]:
character game FULL_HOP_HEIGHT_RENAME short_hop_height
0 mario Super Mario 36.33 17.54
1 donkey kong Donkey Kong 34.00 17.30
2 link Zelda 27.80 13.38
3 samus Metroid 37.00 18.00
4 dark samus Metroid 37.00 18.00
... ... ... ... ...
67 isabelle Animal Crossing 32.50 15.69
68 incineroar Pokemon 31.60 14.40
69 piranha plant Super Mario 37.40 17.50
70 hero Dragon Quest NaN NaN
71 banjo and kazooie Banjo Kazooie NaN NaN

72 rows × 4 columns

In [50]:
smash_left = pd.concat((smash_df.iloc[0:2], smash_df.iloc[7:9]), axis = 0)
smash_left
Out[50]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
0 mario Super Mario 1.50 2.400 98 1.155 1.760 36.33 17.54 36.33 3 8 Amateur
1 donkey kong Donkey Kong 1.63 2.608 127 1.365 1.873 34.00 17.30 35.50 8 10 Expert
7 fox Star Fox 2.10 3.360 77 1.523 2.402 35.00 16.40 37.00 6 8 Expert
8 pikachu Pokemon 1.55 2.480 79 1.302 2.039 35.50 17.12 35.50 8 9 Expert
In [51]:
smash_right = pd.concat((smash_df.iloc[7:10], smash_df.iloc[50:52]), axis = 0)
smash_right
Out[51]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
7 fox Star Fox 2.10 3.360 77 1.523 2.402 35.0 16.40 37.00 6 8 Expert
8 pikachu Pokemon 1.55 2.480 79 1.302 2.039 35.5 17.12 35.50 8 9 Expert
9 luigi Super Mario 1.32 2.112 97 1.134 1.650 44.0 19.98 41.31 7 10 Expert
50 mii swordfighter Wii 1.55 2.480 100 1.070 1.580 28.3 12.30 28.30 8 7 Expert
51 mii gunner Wii 1.45 2.320 104 1.000 1.370 30.9 14.60 32.30 7 8 Expert

Left join

In [52]:
smash_merge_1 = pd.merge(smash_left.iloc[:,0:3], smash_right.iloc[:, 0:3], how = "left", on = "game")
smash_merge_1
Out[52]:
character_x game regular_fall_x character_y regular_fall_y
0 mario Super Mario 1.50 luigi 1.32
1 donkey kong Donkey Kong 1.63 NaN NaN
2 fox Star Fox 2.10 fox 2.10
3 pikachu Pokemon 1.55 pikachu 1.55

Right join

In [53]:
smash_merge_2 = pd.merge(smash_left.iloc[:,0:3], smash_right.iloc[:, 0:3], how = "right", on = "game")
smash_merge_2
Out[53]:
character_x game regular_fall_x character_y regular_fall_y
0 fox Star Fox 2.10 fox 2.10
1 pikachu Pokemon 1.55 pikachu 1.55
2 mario Super Mario 1.50 luigi 1.32
3 NaN Wii NaN mii swordfighter 1.55
4 NaN Wii NaN mii gunner 1.45

Inner join

In [54]:
smash_merge_3 = pd.merge(smash_left.iloc[:,0:3], smash_right.iloc[:, 0:3], how = "inner", on = "game")
smash_merge_3
Out[54]:
character_x game regular_fall_x character_y regular_fall_y
0 mario Super Mario 1.50 luigi 1.32
1 fox Star Fox 2.10 fox 2.10
2 pikachu Pokemon 1.55 pikachu 1.55

Outer join

In [55]:
smash_merge_4 = pd.merge(smash_left.iloc[:,0:3], smash_right.iloc[:, 0:3], how = "outer", on = "game")
smash_merge_4
Out[55]:
character_x game regular_fall_x character_y regular_fall_y
0 mario Super Mario 1.50 luigi 1.32
1 donkey kong Donkey Kong 1.63 NaN NaN
2 fox Star Fox 2.10 fox 2.10
3 pikachu Pokemon 1.55 pikachu 1.55
4 NaN Wii NaN mii swordfighter 1.55
5 NaN Wii NaN mii gunner 1.45

3.6 Sampling¶

In [56]:
import numpy as np
In [57]:
smash_df.sample(5)
Out[57]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
2 link Zelda 1.60 3.040 104 1.247 1.534 27.80 13.38 29.00 10 7 Expert
50 mii swordfighter Wii 1.55 2.480 100 1.070 1.580 28.30 12.30 28.30 8 7 Expert
33 snake Metal Gear 1.73 2.768 106 0.882 1.595 21.62 13.69 34.07 4 5 Amateur
7 fox Star Fox 2.10 3.360 77 1.523 2.402 35.00 16.40 37.00 6 8 Expert
35 diddy kong Donkey Kong 1.75 2.800 90 1.313 2.006 41.21 19.90 41.21 10 9 Expert
In [58]:
smash_df.sample(frac = 0.1)
Out[58]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
55 bowser jr Super Mario 1.65 2.640 108 0.924 1.566 34.40 16.59 34.40 7 9 Expert
22 young link Zelda 1.80 2.880 88 1.260 1.749 33.66 16.26 33.66 7 10 Expert
27 mr game and watch Nintendo 1.24 1.984 75 1.180 1.679 27.51 13.26 27.51 3 5 Amateur
50 mii swordfighter Wii 1.55 2.480 100 1.070 1.580 28.30 12.30 28.30 8 7 Expert
35 diddy kong Donkey Kong 1.75 2.800 90 1.313 2.006 41.21 19.90 41.21 10 9 Expert
12 jigglypuff Pokemon 0.98 1.568 68 0.735 1.271 19.79 11.26 19.79 3 7 Amateur
9 luigi Super Mario 1.32 2.112 97 1.134 1.650 44.00 19.98 41.31 7 10 Expert

Oversampling. It's still a random sample 10x more likely if walk_speed >=1

In [59]:
sample_weights = [10 if walk_speed >= 1 else 1 for walk_speed in smash_df.walk_speed]
smash_df.sample(10, weights = sample_weights)
Out[59]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
37 sonic Sonic 1.65 2.640 86 1.444 3.850 35.00 16.89 35.00 10 5 Expert
63 ridley Metroid 1.78 2.848 107 1.100 2.200 34.00 14.20 32.00 2 6 Amateur
38 king dedede Kirby 1.95 3.120 127 1.029 1.496 32.85 16.02 32.85 6 5 Expert
40 lucario Pokemon 1.68 2.688 92 1.103 1.705 37.62 18.19 37.62 5 8 Amateur
29 pit Icarus 1.48 2.368 96 1.259 1.828 31.00 14.96 31.00 8 9 Expert
19 pichu Pokemon 1.90 2.500 62 1.302 1.892 36.75 17.43 36.02 6 9 Expert
1 donkey kong Donkey Kong 1.63 2.608 127 1.365 1.873 34.00 17.30 35.50 8 10 Expert
21 marth Fire Emblem 1.58 2.528 90 1.575 1.964 33.66 16.26 33.66 3 7 Amateur
67 isabelle Animal Crossing 1.30 2.080 88 1.140 1.480 32.50 15.69 32.50 9 7 Expert
43 wolf Star Fox 1.80 2.880 92 1.208 1.540 32.02 15.38 30.71 10 10 Expert

3.7 Missing values¶

In [60]:
smash_df["fast_fall"].isnull().values.any()
Out[60]:
False
In [61]:
smash_df["fast_fall"].isnull()
Out[61]:
0     False
1     False
2     False
3     False
4     False
      ...  
67    False
68    False
69    False
70    False
71    False
Name: fast_fall, Length: 72, dtype: bool
In [62]:
smash_df["short_hop_height"].isnull().values.any()
Out[62]:
True
In [63]:
smash_df["short_hop_height"].isnull()
Out[63]:
0     False
1     False
2     False
3     False
4     False
      ...  
67    False
68    False
69    False
70     True
71     True
Name: short_hop_height, Length: 72, dtype: bool
In [64]:
smash_df.isnull().values.any()
Out[64]:
True
In [65]:
smash_df.isnull()
Out[65]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
0 False False False False False False False False False False False False False
1 False False False False False False False False False False False False False
2 False False False False False False False False False False False False False
3 False False False False False False False False False False False False False
4 False False False False False False False False False False False False False
... ... ... ... ... ... ... ... ... ... ... ... ... ...
67 False False False False False False False False False False False False False
68 False False False False False False False False False False False False False
69 False False False False False False False False False False False False False
70 False False False False False False False True True True False False False
71 False False False False False False False True True True False False False

72 rows × 13 columns

Retrieve the rows with NAs.

In [66]:
smash_df.loc[smash_df.isnull().any(axis = 1)]
Out[66]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
70 hero Dragon Quest 1.57 2.512 101 0.98 1.84 NaN NaN NaN 6 7 Expert
71 banjo and kazooie Banjo Kazooie 1.76 2.816 106 1.06 2.18 NaN NaN NaN 7 8 Expert

Remove rows with missing values.

In [67]:
smash_df_complete = smash_df.dropna()
In [68]:
smash_df_complete.shape
Out[68]:
(70, 13)

4. Recoding¶

4.1 Use a list for categorical variables¶

Use copy() to make a copy of the df

In [69]:
smash_recoding_df = smash_df.copy()
old_values = ["Amateur", "Expert"]
new_values = ["Blah", "Yay"]
smash_recoding_df["expertise"] = smash_recoding_df["expertise"].replace(old_values, new_values)
smash_recoding_df.head()
Out[69]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
0 mario Super Mario 1.50 2.400 98 1.155 1.760 36.33 17.54 36.33 3 8 Blah
1 donkey kong Donkey Kong 1.63 2.608 127 1.365 1.873 34.00 17.30 35.50 8 10 Yay
2 link Zelda 1.60 3.040 104 1.247 1.534 27.80 13.38 29.00 10 7 Yay
3 samus Metroid 1.33 2.168 108 1.115 1.654 37.00 18.00 37.00 6 9 Yay
4 dark samus Metroid 1.33 2.168 108 1.115 1.654 37.00 18.00 37.00 2 5 Blah

4.2 Use a function for numerical variables¶

In [70]:
def weight_groups_fn(weight):
    if weight < 88:
        return "light weight"
    elif 88 <= weight < 104.5:
        return "medium weight"
    elif 104.5 <= weight:
        return "heavy weight"

smash_recoding_df["weight_group"] = smash_recoding_df["weight"].apply(weight_groups_fn)
smash_recoding_df["weight_group"].value_counts(sort = True)
Out[70]:
medium weight    37
heavy weight     18
light weight     17
Name: weight_group, dtype: int64
In [71]:
smash_recoding_df.head()
Out[71]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise weight_group
0 mario Super Mario 1.50 2.400 98 1.155 1.760 36.33 17.54 36.33 3 8 Blah medium weight
1 donkey kong Donkey Kong 1.63 2.608 127 1.365 1.873 34.00 17.30 35.50 8 10 Yay heavy weight
2 link Zelda 1.60 3.040 104 1.247 1.534 27.80 13.38 29.00 10 7 Yay medium weight
3 samus Metroid 1.33 2.168 108 1.115 1.654 37.00 18.00 37.00 6 9 Yay heavy weight
4 dark samus Metroid 1.33 2.168 108 1.115 1.654 37.00 18.00 37.00 2 5 Blah heavy weight

4.3 Binary variables¶

In [72]:
smash_w_dummies_df = pd.get_dummies(smash_recoding_df, prefix_sep = '_dummy_', drop_first = True)
smash_w_dummies_df.head(5)
Out[72]:
regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after ... game_dummy_Stack Up game_dummy_Star Fox game_dummy_Street Fighter game_dummy_Super Mario game_dummy_Wii game_dummy_Xenoblade game_dummy_Zelda expertise_dummy_Yay weight_group_dummy_light weight weight_group_dummy_medium weight
0 1.50 2.400 98 1.155 1.760 36.33 17.54 36.33 3 8 ... 0 0 0 1 0 0 0 0 0 1
1 1.63 2.608 127 1.365 1.873 34.00 17.30 35.50 8 10 ... 0 0 0 0 0 0 0 1 0 0
2 1.60 3.040 104 1.247 1.534 27.80 13.38 29.00 10 7 ... 0 0 0 0 0 0 1 1 0 1
3 1.33 2.168 108 1.115 1.654 37.00 18.00 37.00 6 9 ... 0 0 0 0 0 0 0 1 0 0
4 1.33 2.168 108 1.115 1.654 37.00 18.00 37.00 2 5 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 113 columns

4.4 Missing values¶

Create missing values

In [73]:
import numpy as np
smash_w_missing_df = smash_recoding_df.copy()
missing_rows = smash_w_missing_df.sample(5).index # sample 5 rows to create missing values for illustration
smash_w_missing_df.loc[missing_rows, "regular_fall"] = np.nan # set those 5 samples rows  to have missing values in "regular_fall"
print("Number of rows with valid regular_fall values after setting to NAN: ",
      smash_w_missing_df["regular_fall"].count()) # or just simply smash_w_missing_df["regular_fall"].count()
Number of rows with valid regular_fall values after setting to NAN:  67

Replace the missing values using the median of the remaining values

In [74]:
median_regular_fall = smash_w_missing_df["regular_fall"].median() # compute the median
smash_w_missing_df["regular_fall"] = smash_w_missing_df["regular_fall"].fillna(value = median_regular_fall) # fill in the value
print('Number of rows with valid regular_fall values after filling NA values: ',
      smash_w_missing_df["regular_fall"].count()) # or just simply smash_w_missing_df["regular_fall"].count()
Number of rows with valid regular_fall values after filling NA values:  72

4.5 Normalisation¶

In [75]:
from sklearn import preprocessing
In [76]:
smash_df.shape
Out[76]:
(72, 13)

Get numerical columns for normalisation

In [77]:
smash_norm_df = smash_df.select_dtypes(include = np.number)
smash_norm_df
Out[77]:
regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after
0 1.50 2.400 98 1.155 1.760 36.33 17.54 36.33 3 8
1 1.63 2.608 127 1.365 1.873 34.00 17.30 35.50 8 10
2 1.60 3.040 104 1.247 1.534 27.80 13.38 29.00 10 7
3 1.33 2.168 108 1.115 1.654 37.00 18.00 37.00 6 9
4 1.33 2.168 108 1.115 1.654 37.00 18.00 37.00 2 5
... ... ... ... ... ... ... ... ... ... ...
67 1.30 2.080 88 1.140 1.480 32.50 15.69 32.50 9 7
68 1.76 2.816 116 0.620 1.180 31.60 14.40 32.80 7 6
69 1.95 2.730 112 0.760 1.720 37.40 17.50 38.52 9 7
70 1.57 2.512 101 0.980 1.840 NaN NaN NaN 6 7
71 1.76 2.816 106 1.060 2.180 NaN NaN NaN 7 8

72 rows × 10 columns

If numerical columns were not selected:

smash_norm_df = pd.DataFrame(scaler.fit_transform(smash_norm_df.select_dtypes(include = np.number)), index = smash_norm_df.index, columns = smash_norm_df.columns)

In [78]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
smash_norm_df = pd.DataFrame(scaler.fit_transform(smash_norm_df),
                             index = smash_norm_df.index,
                             columns = smash_norm_df.columns)
smash_norm_df
Out[78]:
regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after
0 -0.452156 -0.416400 0.126490 0.239502 -0.119401 0.614298 0.661683 0.298418 -1.294201 0.177443
1 0.135239 0.175589 2.222609 1.228999 0.171209 0.155269 0.556693 0.167367 0.655223 1.455036
2 -0.000314 1.405105 0.560170 0.672996 -0.700620 -1.066182 -1.158132 -0.858940 1.434992 -0.461353
3 -1.220287 -1.076696 0.849290 0.051026 -0.392008 0.746294 0.862912 0.404207 -0.124547 0.816239
4 -1.220287 -1.076696 0.849290 0.051026 -0.392008 0.746294 0.862912 0.404207 -1.684085 -1.738945
... ... ... ... ... ... ... ... ... ... ...
67 -1.355840 -1.327153 -0.596310 0.168823 -0.839496 -0.140243 -0.147610 -0.306313 1.045108 -0.461353
68 0.722633 0.767578 1.427530 -2.281362 -1.611026 -0.317551 -0.711928 -0.258945 0.265338 -1.100149
69 1.581133 0.522813 1.138410 -1.621696 -0.222272 0.825097 0.644185 0.644205 1.045108 -0.461353
70 -0.135866 -0.097637 0.343330 -0.585080 0.086340 NaN NaN NaN -0.124547 -0.461353
71 0.722633 0.767578 0.704730 -0.208128 0.960741 NaN NaN NaN 0.265338 0.177443

72 rows × 10 columns

In [79]:
smash_cat_df = smash_df.select_dtypes(exclude = np.number)
smash_cat_df
Out[79]:
character game expertise
0 mario Super Mario Amateur
1 donkey kong Donkey Kong Expert
2 link Zelda Expert
3 samus Metroid Expert
4 dark samus Metroid Amateur
... ... ... ...
67 isabelle Animal Crossing Expert
68 incineroar Pokemon Expert
69 piranha plant Super Mario Expert
70 hero Dragon Quest Expert
71 banjo and kazooie Banjo Kazooie Expert

72 rows × 3 columns

In [80]:
smash_norm_combined_df = pd.concat([smash_norm_df, smash_df.select_dtypes(exclude = np.number)], axis = 1)
smash_norm_combined_df
Out[80]:
regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after character game expertise
0 -0.452156 -0.416400 0.126490 0.239502 -0.119401 0.614298 0.661683 0.298418 -1.294201 0.177443 mario Super Mario Amateur
1 0.135239 0.175589 2.222609 1.228999 0.171209 0.155269 0.556693 0.167367 0.655223 1.455036 donkey kong Donkey Kong Expert
2 -0.000314 1.405105 0.560170 0.672996 -0.700620 -1.066182 -1.158132 -0.858940 1.434992 -0.461353 link Zelda Expert
3 -1.220287 -1.076696 0.849290 0.051026 -0.392008 0.746294 0.862912 0.404207 -0.124547 0.816239 samus Metroid Expert
4 -1.220287 -1.076696 0.849290 0.051026 -0.392008 0.746294 0.862912 0.404207 -1.684085 -1.738945 dark samus Metroid Amateur
... ... ... ... ... ... ... ... ... ... ... ... ... ...
67 -1.355840 -1.327153 -0.596310 0.168823 -0.839496 -0.140243 -0.147610 -0.306313 1.045108 -0.461353 isabelle Animal Crossing Expert
68 0.722633 0.767578 1.427530 -2.281362 -1.611026 -0.317551 -0.711928 -0.258945 0.265338 -1.100149 incineroar Pokemon Expert
69 1.581133 0.522813 1.138410 -1.621696 -0.222272 0.825097 0.644185 0.644205 1.045108 -0.461353 piranha plant Super Mario Expert
70 -0.135866 -0.097637 0.343330 -0.585080 0.086340 NaN NaN NaN -0.124547 -0.461353 hero Dragon Quest Expert
71 0.722633 0.767578 0.704730 -0.208128 0.960741 NaN NaN NaN 0.265338 0.177443 banjo and kazooie Banjo Kazooie Expert

72 rows × 13 columns

4.6 Transformation¶

Natural log transformation

In [81]:
smash_df_numeric = smash_df.iloc[:, 2:11]
smash_df_numeric.head()
Out[81]:
regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before
0 1.50 2.400 98 1.155 1.760 36.33 17.54 36.33 3
1 1.63 2.608 127 1.365 1.873 34.00 17.30 35.50 8
2 1.60 3.040 104 1.247 1.534 27.80 13.38 29.00 10
3 1.33 2.168 108 1.115 1.654 37.00 18.00 37.00 6
4 1.33 2.168 108 1.115 1.654 37.00 18.00 37.00 2

Natural log

In [82]:
import numpy as np
smash_df_numeric_log = np.log(smash_df_numeric)
smash_df_numeric_log.head()
Out[82]:
regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before
0 0.405465 0.875469 4.584967 0.144100 0.565314 3.592644 2.864484 3.592644 1.098612
1 0.488580 0.958584 4.844187 0.311154 0.627541 3.526361 2.850707 3.569533 2.079442
2 0.470004 1.111858 4.644391 0.220741 0.427879 3.325036 2.593761 3.367296 2.302585
3 0.285179 0.773805 4.682131 0.108854 0.503197 3.610918 2.890372 3.610918 1.791759
4 0.285179 0.773805 4.682131 0.108854 0.503197 3.610918 2.890372 3.610918 0.693147

Transform back, i.e. exp()

In [83]:
smash_df_numeric_exp = np.exp(smash_df_numeric_log)
smash_df_numeric_exp.head()
Out[83]:
regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before
0 1.50 2.400 98.0 1.155 1.760 36.33 17.54 36.33 3.0
1 1.63 2.608 127.0 1.365 1.873 34.00 17.30 35.50 8.0
2 1.60 3.040 104.0 1.247 1.534 27.80 13.38 29.00 10.0
3 1.33 2.168 108.0 1.115 1.654 37.00 18.00 37.00 6.0
4 1.33 2.168 108.0 1.115 1.654 37.00 18.00 37.00 2.0

For base 10, use log10()

In [84]:
smash_df_numeric_log10 = np.log10(smash_df_numeric)
smash_df_numeric_log10.head()
Out[84]:
regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before
0 0.176091 0.380211 1.991226 0.062582 0.245513 1.560265 1.244030 1.560265 0.477121
1 0.212188 0.416308 2.103804 0.135133 0.272538 1.531479 1.238046 1.550228 0.903090
2 0.204120 0.482874 2.017033 0.095866 0.185825 1.444045 1.126456 1.462398 1.000000
3 0.123852 0.336059 2.033424 0.047275 0.218536 1.568202 1.255273 1.568202 0.778151
4 0.123852 0.336059 2.033424 0.047275 0.218536 1.568202 1.255273 1.568202 0.301030

Transform back using 10^x

In [85]:
def exp10(x):
    return 10**x
 
# execute the function
smash_df_numeric_log10_exp10 = smash_df_numeric_log10.apply(exp10)
smash_df_numeric_log10_exp10.head()
Out[85]:
regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before
0 1.50 2.400 98.0 1.155 1.760 36.33 17.54 36.33 3.0
1 1.63 2.608 127.0 1.365 1.873 34.00 17.30 35.50 8.0
2 1.60 3.040 104.0 1.247 1.534 27.80 13.38 29.00 10.0
3 1.33 2.168 108.0 1.115 1.654 37.00 18.00 37.00 6.0
4 1.33 2.168 108.0 1.115 1.654 37.00 18.00 37.00 2.0

Multiplication and division

In [86]:
smash_df_numeric_2 = smash_df.iloc[:, 2:11]
smash_df_numeric_2.head()
Out[86]:
regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before
0 1.50 2.400 98 1.155 1.760 36.33 17.54 36.33 3
1 1.63 2.608 127 1.365 1.873 34.00 17.30 35.50 8
2 1.60 3.040 104 1.247 1.534 27.80 13.38 29.00 10
3 1.33 2.168 108 1.115 1.654 37.00 18.00 37.00 6
4 1.33 2.168 108 1.115 1.654 37.00 18.00 37.00 2

Multiplication

In [87]:
def multiply_by_5(x):
    return 5 * x
 
# execute the function
smash_df_numeric_2_x5 = smash_df_numeric_2.apply(multiply_by_5)
 
# displaying the DataFrame

smash_df_numeric_2_x5.head()
Out[87]:
regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before
0 7.50 12.00 490 5.775 8.800 181.65 87.7 181.65 15
1 8.15 13.04 635 6.825 9.365 170.00 86.5 177.50 40
2 8.00 15.20 520 6.235 7.670 139.00 66.9 145.00 50
3 6.65 10.84 540 5.575 8.270 185.00 90.0 185.00 30
4 6.65 10.84 540 5.575 8.270 185.00 90.0 185.00 10

Division, Round where applicable (if preferred)

In [88]:
def divide_by_5(x):
    return x / 5
 
# execute the function
smash_df_numeric_2_divide_5 = round(smash_df_numeric_2_x5.apply(divide_by_5),2)
 
# displaying the DataFrame

smash_df_numeric_2_divide_5.head()
Out[88]:
regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before
0 1.50 2.40 98.0 1.16 1.76 36.33 17.54 36.33 3.0
1 1.63 2.61 127.0 1.36 1.87 34.00 17.30 35.50 8.0
2 1.60 3.04 104.0 1.25 1.53 27.80 13.38 29.00 10.0
3 1.33 2.17 108.0 1.12 1.65 37.00 18.00 37.00 6.0
4 1.33 2.17 108.0 1.12 1.65 37.00 18.00 37.00 2.0

5. Pivot Table¶

In [89]:
pivot_1 = smash_df.value_counts("expertise")
pivot_1
Out[89]:
expertise
Expert     45
Amateur    27
dtype: int64
In [90]:
pivot_2 = pd.pivot_table(smash_df, values = "regular_fall", index = "game", aggfunc = "mean")
pivot_2
Out[90]:
regular_fall
game
Animal Crossing 1.310000
Banjo Kazooie 1.760000
Bayonetta 1.770000
Castlevania 1.850000
Donkey Kong 1.693333
Dragon Quest 1.570000
Duck Hunt 1.650000
Earthbound 1.310000
F-Zero 1.832500
Final Fantasy 1.680000
Fire Emblem 1.663333
Icarus 1.515000
Kirby 1.580000
Mega Man 1.800000
Metal Gear 1.730000
Metroid 1.535000
Mother 1.370000
Nintendo 1.240000
Pikmin 1.350000
Pokemon 1.610000
Punch Out 1.950000
Sonic 1.650000
Splatoon 1.580000
Stack Up 1.600000
Star Fox 1.950000
Street Fighter 1.600000
Super Mario 1.497000
Wii 1.552500
Xenoblade 1.580000
Zelda 1.588333
In [91]:
pivot_3 = pd.pivot_table(smash_df, values = "regular_fall", columns = "expertise", index = "game", aggfunc = "mean")
pivot_3
Out[91]:
expertise Amateur Expert
game
Animal Crossing NaN 1.310000
Banjo Kazooie NaN 1.760000
Bayonetta NaN 1.770000
Castlevania 1.8500 1.850000
Donkey Kong 1.7000 1.690000
Dragon Quest NaN 1.570000
Duck Hunt NaN 1.650000
Earthbound 1.3100 NaN
F-Zero 1.8000 1.865000
Final Fantasy NaN 1.680000
Fire Emblem 1.6325 1.725000
Icarus NaN 1.515000
Kirby 1.3550 1.805000
Mega Man NaN 1.800000
Metal Gear 1.7300 NaN
Metroid 1.5550 1.515000
Mother NaN 1.370000
Nintendo 1.2400 NaN
Pikmin 1.3500 NaN
Pokemon 1.3300 1.722000
Punch Out NaN 1.950000
Sonic NaN 1.650000
Splatoon 1.5800 NaN
Stack Up NaN 1.600000
Star Fox NaN 1.950000
Street Fighter 1.6000 1.600000
Super Mario 1.4720 1.522000
Wii 1.9100 1.433333
Xenoblade NaN 1.580000
Zelda 1.3650 1.700000
In [92]:
pivot_4 = pd.pivot_table(smash_df, values = "regular_fall", index = "expertise", aggfunc = "median")
pivot_4
Out[92]:
regular_fall
expertise
Amateur 1.58
Expert 1.65
In [93]:
pivot_5 = pd.pivot_table(smash_df, values = ["regular_fall", "fast_fall"], index = "expertise", aggfunc = "median")
pivot_5
Out[93]:
fast_fall regular_fall
expertise
Amateur 2.40 1.58
Expert 2.64 1.65
In [94]:
pivot_6 = pd.pivot_table(smash_df, values = "regular_fall", index = ["expertise", "game"], aggfunc = "median")
pivot_6
Out[94]:
regular_fall
expertise game
Amateur Castlevania 1.850
Donkey Kong 1.700
Earthbound 1.310
F-Zero 1.800
Fire Emblem 1.615
Kirby 1.355
Metal Gear 1.730
Metroid 1.555
Nintendo 1.240
Pikmin 1.350
Pokemon 1.330
Splatoon 1.580
Street Fighter 1.600
Super Mario 1.500
Wii 1.910
Zelda 1.365
Expert Animal Crossing 1.310
Banjo Kazooie 1.760
Bayonetta 1.770
Castlevania 1.850
Donkey Kong 1.690
Dragon Quest 1.570
Duck Hunt 1.650
F-Zero 1.865
Final Fantasy 1.680
Fire Emblem 1.725
Icarus 1.515
Kirby 1.805
Mega Man 1.800
Metroid 1.515
Mother 1.370
Pokemon 1.760
Punch Out 1.950
Sonic 1.650
Stack Up 1.600
Star Fox 1.950
Street Fighter 1.600
Super Mario 1.500
Wii 1.450
Xenoblade 1.580
Zelda 1.700
In [95]:
pivot_7 = pd.pivot_table(smash_df, values = "regular_fall", columns = ["expertise", "game"], aggfunc = "median")
pivot_7
Out[95]:
expertise Amateur ... Expert
game Castlevania Donkey Kong Earthbound F-Zero Fire Emblem Kirby Metal Gear Metroid Nintendo Pikmin ... Pokemon Punch Out Sonic Stack Up Star Fox Street Fighter Super Mario Wii Xenoblade Zelda
regular_fall 1.85 1.7 1.31 1.8 1.615 1.355 1.73 1.555 1.24 1.35 ... 1.76 1.95 1.65 1.6 1.95 1.6 1.5 1.45 1.58 1.7

1 rows × 41 columns

In [96]:
pivot_8 = pd.pivot_table(smash_df, values = ["regular_fall", "fast_fall"], index = ["expertise", "game"], aggfunc = "median")
pivot_8

# to export
# pivot_8.to_csv("pivot_8.csv")
Out[96]:
fast_fall regular_fall
expertise game
Amateur Castlevania 2.960 1.850
Donkey Kong 2.720 1.700
Earthbound 2.096 1.310
F-Zero 2.880 1.800
Fire Emblem 2.584 1.615
Kirby 2.168 1.355
Metal Gear 2.768 1.730
Metroid 2.508 1.555
Nintendo 1.984 1.240
Pikmin 2.160 1.350
Pokemon 2.128 1.330
Splatoon 2.528 1.580
Street Fighter 2.240 1.600
Super Mario 2.400 1.500
Wii 3.072 1.910
Zelda 2.184 1.365
Expert Animal Crossing 2.096 1.310
Banjo Kazooie 2.816 1.760
Bayonetta 2.832 1.770
Castlevania 2.960 1.850
Donkey Kong 2.704 1.690
Dragon Quest 2.512 1.570
Duck Hunt 2.640 1.650
F-Zero 2.984 1.865
Final Fantasy 2.688 1.680
Fire Emblem 2.760 1.725
Icarus 2.424 1.515
Kirby 2.888 1.805
Mega Man 2.880 1.800
Metroid 2.444 1.515
Mother 2.192 1.370
Pokemon 2.500 1.760
Punch Out 3.120 1.950
Sonic 2.640 1.650
Stack Up 2.560 1.600
Star Fox 3.120 1.950
Street Fighter 2.240 1.600
Super Mario 2.400 1.500
Wii 2.320 1.450
Xenoblade 2.528 1.580
Zelda 2.840 1.700
In [97]:
pivot_9 = pd.pivot_table(smash_df, values = ["regular_fall", "fast_fall"], 
                         index = ["expertise", "game"], aggfunc = ["median", "mean", "max"])
pivot_9
Out[97]:
median mean max
fast_fall regular_fall fast_fall regular_fall fast_fall regular_fall
expertise game
Amateur Castlevania 2.960 1.850 2.960000 1.850000 2.960 1.850
Donkey Kong 2.720 1.700 2.720000 1.700000 2.720 1.700
Earthbound 2.096 1.310 2.096000 1.310000 2.096 1.310
F-Zero 2.880 1.800 2.880000 1.800000 2.880 1.800
Fire Emblem 2.584 1.615 2.612000 1.632500 2.880 1.800
Kirby 2.168 1.355 2.168000 1.355000 2.368 1.480
Metal Gear 2.768 1.730 2.768000 1.730000 2.768 1.730
Metroid 2.508 1.555 2.508000 1.555000 2.848 1.780
Nintendo 1.984 1.240 1.984000 1.240000 1.984 1.240
Pikmin 2.160 1.350 2.160000 1.350000 2.160 1.350
Pokemon 2.128 1.330 2.128000 1.330000 2.688 1.680
Splatoon 2.528 1.580 2.528000 1.580000 2.528 1.580
Street Fighter 2.240 1.600 2.240000 1.600000 2.240 1.600
Super Mario 2.400 1.500 2.355200 1.472000 2.832 1.770
Wii 3.072 1.910 3.072000 1.910000 3.072 1.910
Zelda 2.184 1.365 2.184000 1.365000 2.208 1.380
Expert Animal Crossing 2.096 1.310 2.096000 1.310000 2.112 1.320
Banjo Kazooie 2.816 1.760 2.816000 1.760000 2.816 1.760
Bayonetta 2.832 1.770 2.832000 1.770000 2.832 1.770
Castlevania 2.960 1.850 2.960000 1.850000 2.960 1.850
Donkey Kong 2.704 1.690 2.704000 1.690000 2.800 1.750
Dragon Quest 2.512 1.570 2.512000 1.570000 2.512 1.570
Duck Hunt 2.640 1.650 2.640000 1.650000 2.640 1.650
F-Zero 2.984 1.865 2.984000 1.865000 2.984 1.865
Final Fantasy 2.688 1.680 2.688000 1.680000 2.688 1.680
Fire Emblem 2.760 1.725 2.760000 1.725000 2.880 1.800
Icarus 2.424 1.515 2.424000 1.515000 2.480 1.550
Kirby 2.888 1.805 2.888000 1.805000 3.120 1.950
Mega Man 2.880 1.800 2.880000 1.800000 2.880 1.800
Metroid 2.444 1.515 2.444000 1.515000 2.720 1.700
Mother 2.192 1.370 2.192000 1.370000 2.192 1.370
Pokemon 2.500 1.760 2.647200 1.722000 2.960 1.900
Punch Out 3.120 1.950 3.120000 1.950000 3.120 1.950
Sonic 2.640 1.650 2.640000 1.650000 2.640 1.650
Stack Up 2.560 1.600 2.560000 1.600000 2.560 1.600
Star Fox 3.120 1.950 3.120000 1.950000 3.360 2.100
Street Fighter 2.240 1.600 2.240000 1.600000 2.240 1.600
Super Mario 2.400 1.500 2.357200 1.522000 2.730 1.950
Wii 2.320 1.450 2.293333 1.433333 2.480 1.550
Xenoblade 2.528 1.580 2.528000 1.580000 2.528 1.580
Zelda 2.840 1.700 2.840000 1.700000 3.040 1.800
In [98]:
pivot_4.plot.bar(figsize = (10,5), title = "Median Regular Fall by Expertise")
Out[98]:
<AxesSubplot:title={'center':'Median Regular Fall by Expertise'}, xlabel='expertise'>
In [99]:
pivot_5.plot.barh(figsize = (10,5), title = "Median Regular Fall by Expertise")
Out[99]:
<AxesSubplot:title={'center':'Median Regular Fall by Expertise'}, ylabel='expertise'>

6. Training Validation Split¶

In [100]:
from sklearn.model_selection import train_test_split

Make a copy using copy()

In [101]:
smash_split_df = smash_df.copy()
In [102]:
X = smash_split_df.drop(columns = ["expertise"])
y = smash_split_df["expertise"].astype("category")
In [103]:
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size = 0.3, random_state = 666)
In [104]:
train_X.head()
Out[104]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after
64 simon Castlevania 1.85 2.960 107 0.760 1.520 30.00 18.00 29.00 3 9
22 young link Zelda 1.80 2.880 88 1.260 1.749 33.66 16.26 33.66 7 10
38 king dedede Kirby 1.95 3.120 127 1.029 1.496 32.85 16.02 32.85 6 5
25 roy Fire Emblem 1.80 2.880 95 1.208 2.145 30.97 13.00 28.00 3 6
71 banjo and kazooie Banjo Kazooie 1.76 2.816 106 1.060 2.180 NaN NaN NaN 7 8
In [105]:
valid_X.head()
Out[105]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after
37 sonic Sonic 1.65 2.640 86 1.444 3.850 35.00 16.89 35.00 10 5
68 incineroar Pokemon 1.76 2.816 116 0.620 1.180 31.60 14.40 32.80 7 6
1 donkey kong Donkey Kong 1.63 2.608 127 1.365 1.873 34.00 17.30 35.50 8 10
52 palutena Icarus 1.55 2.480 91 1.271 2.077 35.90 17.30 35.90 10 5
17 zelda Zelda 1.35 2.160 85 0.914 1.430 31.55 15.24 31.55 4 7
In [106]:
train_y.head()
Out[106]:
64    Amateur
22     Expert
38     Expert
25    Amateur
71     Expert
Name: expertise, dtype: category
Categories (2, object): ['Amateur', 'Expert']
In [107]:
valid_y.head()
Out[107]:
37     Expert
68     Expert
1      Expert
52     Expert
17    Amateur
Name: expertise, dtype: category
Categories (2, object): ['Amateur', 'Expert']
In [108]:
len(train_X)
Out[108]:
50
In [109]:
len(train_y)
Out[109]:
50
In [110]:
len(valid_X)
Out[110]:
22
In [111]:
len(valid_y)
Out[111]:
22

7. Normalisation Based on Training Set¶

In [112]:
smash_df.head()
Out[112]:
character game regular_fall fast_fall weight walk_speed run_speed FULL_HOP_HEIGHT_RENAME short_hop_height double_hop_height before after expertise
0 mario Super Mario 1.50 2.400 98 1.155 1.760 36.33 17.54 36.33 3 8 Amateur
1 donkey kong Donkey Kong 1.63 2.608 127 1.365 1.873 34.00 17.30 35.50 8 10 Expert
2 link Zelda 1.60 3.040 104 1.247 1.534 27.80 13.38 29.00 10 7 Expert
3 samus Metroid 1.33 2.168 108 1.115 1.654 37.00 18.00 37.00 6 9 Expert
4 dark samus Metroid 1.33 2.168 108 1.115 1.654 37.00 18.00 37.00 2 5 Amateur
In [113]:
pd.DataFrame(smash_df.columns.values, columns = ["Variables"])
Out[113]:
Variables
0 character
1 game
2 regular_fall
3 fast_fall
4 weight
5 walk_speed
6 run_speed
7 FULL_HOP_HEIGHT_RENAME
8 short_hop_height
9 double_hop_height
10 before
11 after
12 expertise
In [114]:
smash_df.dtypes
Out[114]:
character                   object
game                      category
regular_fall               float64
fast_fall                  float64
weight                       int64
walk_speed                 float64
run_speed                  float64
FULL_HOP_HEIGHT_RENAME     float64
short_hop_height           float64
double_hop_height          float64
before                       int64
after                        int64
expertise                 category
dtype: object
In [115]:
smash_df_7 = smash_df.iloc[:,[1, 2, 3, 4, 12]]
smash_df_7.head()
Out[115]:
game regular_fall fast_fall weight expertise
0 Super Mario 1.50 2.400 98 Amateur
1 Donkey Kong 1.63 2.608 127 Expert
2 Zelda 1.60 3.040 104 Expert
3 Metroid 1.33 2.168 108 Expert
4 Metroid 1.33 2.168 108 Amateur

Get dummies. Exclude outcome variable because it is categorical. But since there are only 2 classes, it can also be converted to a dummy and used as the outcome.

In [116]:
smash_df_7_predictors = pd.get_dummies(smash_df_7.drop(columns = ["expertise"]), drop_first = False)
smash_df_7_predictors.head()
Out[116]:
regular_fall fast_fall weight game_Animal Crossing game_Banjo Kazooie game_Bayonetta game_Castlevania game_Donkey Kong game_Dragon Quest game_Duck Hunt ... game_Punch Out game_Sonic game_Splatoon game_Stack Up game_Star Fox game_Street Fighter game_Super Mario game_Wii game_Xenoblade game_Zelda
0 1.50 2.400 98 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
1 1.63 2.608 127 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 1.60 3.040 104 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
3 1.33 2.168 108 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 1.33 2.168 108 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 33 columns

Combine with outcome variable.

In [117]:
smash_df_7_full = pd.concat([smash_df_7_predictors, smash_df_7["expertise"]], axis = 1)
smash_df_7_full.head()
Out[117]:
regular_fall fast_fall weight game_Animal Crossing game_Banjo Kazooie game_Bayonetta game_Castlevania game_Donkey Kong game_Dragon Quest game_Duck Hunt ... game_Sonic game_Splatoon game_Stack Up game_Star Fox game_Street Fighter game_Super Mario game_Wii game_Xenoblade game_Zelda expertise
0 1.50 2.400 98 0 0 0 0 0 0 0 ... 0 0 0 0 0 1 0 0 0 Amateur
1 1.63 2.608 127 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 Expert
2 1.60 3.040 104 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 Expert
3 1.33 2.168 108 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 Expert
4 1.33 2.168 108 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 Amateur

5 rows × 34 columns

In [118]:
smash_df_7_full.columns.values
Out[118]:
array(['regular_fall', 'fast_fall', 'weight', 'game_Animal Crossing',
       'game_Banjo Kazooie', 'game_Bayonetta', 'game_Castlevania',
       'game_Donkey Kong', 'game_Dragon Quest', 'game_Duck Hunt',
       'game_Earthbound', 'game_F-Zero', 'game_Final Fantasy',
       'game_Fire Emblem', 'game_Icarus', 'game_Kirby', 'game_Mega Man',
       'game_Metal Gear', 'game_Metroid', 'game_Mother', 'game_Nintendo',
       'game_Pikmin', 'game_Pokemon', 'game_Punch Out', 'game_Sonic',
       'game_Splatoon', 'game_Stack Up', 'game_Star Fox',
       'game_Street Fighter', 'game_Super Mario', 'game_Wii',
       'game_Xenoblade', 'game_Zelda', 'expertise'], dtype=object)
In [119]:
predictors = ['regular_fall', 'fast_fall', 'weight', 'game_Animal Crossing',
              'game_Banjo Kazooie', 'game_Bayonetta', 'game_Castlevania',
              'game_Donkey Kong', 'game_Dragon Quest', 'game_Duck Hunt',
              'game_Earthbound', 'game_F-Zero', 'game_Final Fantasy',
              'game_Fire Emblem', 'game_Icarus', 'game_Kirby', 'game_Mega Man',
              'game_Metal Gear', 'game_Metroid', 'game_Mother', 'game_Nintendo',
              'game_Pikmin', 'game_Pokemon', 'game_Punch Out', 'game_Sonic',
              'game_Splatoon', 'game_Stack Up', 'game_Star Fox',
              'game_Street Fighter', 'game_Super Mario', 'game_Wii',
              'game_Xenoblade', 'game_Zelda']
predictors_numerical = ['regular_fall', 'fast_fall', 'weight']
predictors_categorical = ['game_Animal Crossing','game_Banjo Kazooie', 'game_Bayonetta', 'game_Castlevania',
                          'game_Donkey Kong', 'game_Dragon Quest', 'game_Duck Hunt',
                          'game_Earthbound', 'game_F-Zero', 'game_Final Fantasy',
                          'game_Fire Emblem', 'game_Icarus', 'game_Kirby', 'game_Mega Man',
                          'game_Metal Gear', 'game_Metroid', 'game_Mother', 'game_Nintendo',
                          'game_Pikmin', 'game_Pokemon', 'game_Punch Out', 'game_Sonic',
                          'game_Splatoon', 'game_Stack Up', 'game_Star Fox',
                          'game_Street Fighter', 'game_Super Mario', 'game_Wii',
                          'game_Xenoblade', 'game_Zelda',]
outcome = "expertise"
In [120]:
train_data, valid_data = train_test_split(smash_df_7_full, test_size = 0.3, random_state = 666)
In [121]:
train_data.head()
Out[121]:
regular_fall fast_fall weight game_Animal Crossing game_Banjo Kazooie game_Bayonetta game_Castlevania game_Donkey Kong game_Dragon Quest game_Duck Hunt ... game_Sonic game_Splatoon game_Stack Up game_Star Fox game_Street Fighter game_Super Mario game_Wii game_Xenoblade game_Zelda expertise
64 1.85 2.960 107 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 Amateur
22 1.80 2.880 88 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 Expert
38 1.95 3.120 127 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 Expert
25 1.80 2.880 95 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 Amateur
71 1.76 2.816 106 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 Expert

5 rows × 34 columns

In [122]:
valid_data.head()
Out[122]:
regular_fall fast_fall weight game_Animal Crossing game_Banjo Kazooie game_Bayonetta game_Castlevania game_Donkey Kong game_Dragon Quest game_Duck Hunt ... game_Sonic game_Splatoon game_Stack Up game_Star Fox game_Street Fighter game_Super Mario game_Wii game_Xenoblade game_Zelda expertise
37 1.65 2.640 86 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 Expert
68 1.76 2.816 116 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 Expert
1 1.63 2.608 127 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 Expert
52 1.55 2.480 91 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 Expert
17 1.35 2.160 85 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 Amateur

5 rows × 34 columns

In [123]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
scaler.fit(train_data[predictors_numerical])
Out[123]:
StandardScaler()
In [124]:
train_df_norm = pd.DataFrame(scaler.transform(train_data[predictors_numerical]), columns = predictors_numerical)
train_df_norm.head()
Out[124]:
regular_fall fast_fall weight
0 1.060291 1.114411 0.822010
1 0.843285 0.894801 -0.515164
2 1.494303 1.553631 2.229562
3 0.843285 0.894801 -0.022521
4 0.669681 0.719113 0.751633
In [125]:
# Ensure the indices are similar

train_df_norm.index = train_data.index
In [126]:
train_X = pd.concat([train_df_norm, train_data[predictors_categorical]], axis = 1)
train_X.head()
Out[126]:
regular_fall fast_fall weight game_Animal Crossing game_Banjo Kazooie game_Bayonetta game_Castlevania game_Donkey Kong game_Dragon Quest game_Duck Hunt ... game_Punch Out game_Sonic game_Splatoon game_Stack Up game_Star Fox game_Street Fighter game_Super Mario game_Wii game_Xenoblade game_Zelda
64 1.060291 1.114411 0.822010 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
22 0.843285 0.894801 -0.515164 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
38 1.494303 1.553631 2.229562 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
25 0.843285 0.894801 -0.022521 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
71 0.669681 0.719113 0.751633 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 33 columns

In [127]:
valid_df_norm = pd.DataFrame(scaler.transform(valid_data[predictors_numerical]), columns = predictors_numerical)
valid_df_norm.head()
Out[127]:
regular_fall fast_fall weight
0 0.192267 0.235971 -0.655919
1 0.669681 0.719113 1.455409
2 0.105465 0.148127 2.229562
3 -0.241745 -0.203249 -0.304031
4 -1.109769 -1.081689 -0.726297
In [128]:
# Ensure the indices are similar

valid_df_norm.index = valid_data.index
In [129]:
valid_X = pd.concat([valid_df_norm, valid_data[predictors_categorical]], axis = 1)
valid_X.head()
Out[129]:
regular_fall fast_fall weight game_Animal Crossing game_Banjo Kazooie game_Bayonetta game_Castlevania game_Donkey Kong game_Dragon Quest game_Duck Hunt ... game_Punch Out game_Sonic game_Splatoon game_Stack Up game_Star Fox game_Street Fighter game_Super Mario game_Wii game_Xenoblade game_Zelda
37 0.192267 0.235971 -0.655919 0 0 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
68 0.669681 0.719113 1.455409 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0.105465 0.148127 2.229562 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
52 -0.241745 -0.203249 -0.304031 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
17 -1.109769 -1.081689 -0.726297 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1

5 rows × 33 columns

In [130]:
train_y = train_data[outcome].astype("category")
train_y.head()
Out[130]:
64    Amateur
22     Expert
38     Expert
25    Amateur
71     Expert
Name: expertise, dtype: category
Categories (2, object): ['Amateur', 'Expert']
In [131]:
valid_y = valid_data[outcome].astype("category")
valid_y.head()
Out[131]:
37     Expert
68     Expert
1      Expert
52     Expert
17    Amateur
Name: expertise, dtype: category
Categories (2, object): ['Amateur', 'Expert']

Creating dummies in new data with less levels.

In [132]:
data = {"regular_fall": [1.8, 2.3],
        "fast_fall": [1.7, 2.0],
        "weight" : [88, 75],
        "game": ["Super Mario", "Zelda"]}
 
# Create DataFrame
new_df = pd.DataFrame(data)
new_df
Out[132]:
regular_fall fast_fall weight game
0 1.8 1.7 88 Super Mario
1 2.3 2.0 75 Zelda

Get dummies

In [133]:
new_df_dummies = pd.get_dummies(new_df)
new_df_dummies
Out[133]:
regular_fall fast_fall weight game_Super Mario game_Zelda
0 1.8 1.7 88 1 0
1 2.3 2.0 75 0 1

Get additional dummy columns

In [134]:
new_df_dummies_full = new_df_dummies.reindex(columns = train_X.columns, fill_value = 0)
new_df_dummies_full
Out[134]:
regular_fall fast_fall weight game_Animal Crossing game_Banjo Kazooie game_Bayonetta game_Castlevania game_Donkey Kong game_Dragon Quest game_Duck Hunt ... game_Punch Out game_Sonic game_Splatoon game_Stack Up game_Star Fox game_Street Fighter game_Super Mario game_Wii game_Xenoblade game_Zelda
0 1.8 1.7 88 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
1 2.3 2.0 75 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1

2 rows × 33 columns

In [135]:
pd.DataFrame(new_df_dummies_full.columns.values, columns = ["Variables"])
Out[135]:
Variables
0 regular_fall
1 fast_fall
2 weight
3 game_Animal Crossing
4 game_Banjo Kazooie
5 game_Bayonetta
6 game_Castlevania
7 game_Donkey Kong
8 game_Dragon Quest
9 game_Duck Hunt
10 game_Earthbound
11 game_F-Zero
12 game_Final Fantasy
13 game_Fire Emblem
14 game_Icarus
15 game_Kirby
16 game_Mega Man
17 game_Metal Gear
18 game_Metroid
19 game_Mother
20 game_Nintendo
21 game_Pikmin
22 game_Pokemon
23 game_Punch Out
24 game_Sonic
25 game_Splatoon
26 game_Stack Up
27 game_Star Fox
28 game_Street Fighter
29 game_Super Mario
30 game_Wii
31 game_Xenoblade
32 game_Zelda

Normalise

In [136]:
new_df_dummies_full_norm = pd.DataFrame(scaler.transform(new_df_dummies_full[predictors_numerical]), 
                                        columns = predictors_numerical)
new_df_dummies_full_norm.head()
Out[136]:
regular_fall fast_fall weight
0 0.843285 -2.344446 -0.515164
1 3.013346 -1.520909 -1.430073

Assembling the final nornalised new data with the full range of dummy columns

In [137]:
new_df_dummies_full_norm_2 = pd.concat([new_df_dummies_full_norm, new_df_dummies_full[predictors_categorical]], 
                                       axis = 1)
new_df_dummies_full_norm_2.head()
Out[137]:
regular_fall fast_fall weight game_Animal Crossing game_Banjo Kazooie game_Bayonetta game_Castlevania game_Donkey Kong game_Dragon Quest game_Duck Hunt ... game_Punch Out game_Sonic game_Splatoon game_Stack Up game_Star Fox game_Street Fighter game_Super Mario game_Wii game_Xenoblade game_Zelda
0 0.843285 -2.344446 -0.515164 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
1 3.013346 -1.520909 -1.430073 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1

2 rows × 33 columns

In [138]:
pd.DataFrame(new_df_dummies_full_norm_2.columns.values, columns = ["Variables"])
Out[138]:
Variables
0 regular_fall
1 fast_fall
2 weight
3 game_Animal Crossing
4 game_Banjo Kazooie
5 game_Bayonetta
6 game_Castlevania
7 game_Donkey Kong
8 game_Dragon Quest
9 game_Duck Hunt
10 game_Earthbound
11 game_F-Zero
12 game_Final Fantasy
13 game_Fire Emblem
14 game_Icarus
15 game_Kirby
16 game_Mega Man
17 game_Metal Gear
18 game_Metroid
19 game_Mother
20 game_Nintendo
21 game_Pikmin
22 game_Pokemon
23 game_Punch Out
24 game_Sonic
25 game_Splatoon
26 game_Stack Up
27 game_Star Fox
28 game_Street Fighter
29 game_Super Mario
30 game_Wii
31 game_Xenoblade
32 game_Zelda

characters.jpg