2 - 5) EDA - Player information

60 minute read

import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt 
import seaborn as sns 
import missingno as msno 
from utils import my_histogram, pca_results

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
---------------------------------------------------------------------------

ImportError                               Traceback (most recent call last)

Cell In[50], line 7
      5 import seaborn as sns 
      6 import missingno as msno 
----> 7 from utils import my_histogram, pca_results
      9 from sklearn.experimental import enable_iterative_imputer
     10 from sklearn.impute import IterativeImputer


ImportError: cannot import name 'pca_results' from 'utils' (/Users/youngjun/Jun/projects/european_soccer_match_prediction/src/utils.py)
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
df_player = pd.read_csv("../data/df_player.csv")
df_player
player_api_id player_name birthday height weight player_measured_date overall_rating potential preferred_foot attacking_work_rate defensive_work_rate crossing finishing heading_accuracy short_passing volleys dribbling curve free_kick_accuracy long_passing ball_control acceleration sprint_speed agility reactions balance shot_power jumping stamina strength long_shots aggression interceptions positioning vision penalties marking standing_tackle sliding_tackle gk_diving gk_handling gk_kicking gk_positioning gk_reflexes position_X_mode position_Y_mode position_role
0 505942 Aaron Appindangoye 1992-02-29 182.88 187 2016-02-18 67.0 71.0 right medium medium 49.0 44.0 71.0 61.0 44.0 51.0 45.0 39.0 64.0 49.0 60.0 64.0 59.0 47.0 65.0 55.0 58.0 54.0 76.0 35.0 71.0 70.0 45.0 54.0 48.0 65.0 69.0 69.0 6.0 11.0 10.0 8.0 8.0 6.0 3.0 defender
1 505942 Aaron Appindangoye 1992-02-29 182.88 187 2015-11-19 67.0 71.0 right medium medium 49.0 44.0 71.0 61.0 44.0 51.0 45.0 39.0 64.0 49.0 60.0 64.0 59.0 47.0 65.0 55.0 58.0 54.0 76.0 35.0 71.0 70.0 45.0 54.0 48.0 65.0 69.0 69.0 6.0 11.0 10.0 8.0 8.0 6.0 3.0 defender
2 505942 Aaron Appindangoye 1992-02-29 182.88 187 2015-09-21 62.0 66.0 right medium medium 49.0 44.0 71.0 61.0 44.0 51.0 45.0 39.0 64.0 49.0 60.0 64.0 59.0 47.0 65.0 55.0 58.0 54.0 76.0 35.0 63.0 41.0 45.0 54.0 48.0 65.0 66.0 69.0 6.0 11.0 10.0 8.0 8.0 6.0 3.0 defender
3 505942 Aaron Appindangoye 1992-02-29 182.88 187 2015-03-20 61.0 65.0 right medium medium 48.0 43.0 70.0 60.0 43.0 50.0 44.0 38.0 63.0 48.0 60.0 64.0 59.0 46.0 65.0 54.0 58.0 54.0 76.0 34.0 62.0 40.0 44.0 53.0 47.0 62.0 63.0 66.0 5.0 10.0 9.0 7.0 7.0 6.0 3.0 defender
4 505942 Aaron Appindangoye 1992-02-29 182.88 187 2007-02-22 61.0 65.0 right medium medium 48.0 43.0 70.0 60.0 43.0 50.0 44.0 38.0 63.0 48.0 60.0 64.0 59.0 46.0 65.0 54.0 58.0 54.0 76.0 34.0 62.0 40.0 44.0 53.0 47.0 62.0 63.0 66.0 5.0 10.0 9.0 7.0 7.0 6.0 3.0 defender
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
183973 39902 Zvjezdan Misimovic 1982-06-05 180.34 176 2009-08-30 83.0 85.0 right medium low 84.0 77.0 59.0 89.0 77.0 84.0 86.0 78.0 84.0 85.0 66.0 72.0 77.0 86.0 73.0 76.0 58.0 72.0 67.0 81.0 56.0 78.0 86.0 88.0 83.0 22.0 31.0 30.0 9.0 20.0 84.0 20.0 20.0 5.0 9.0 forward
183974 39902 Zvjezdan Misimovic 1982-06-05 180.34 176 2009-02-22 78.0 80.0 right medium low 74.0 76.0 53.0 84.0 77.0 85.0 86.0 74.0 73.0 86.0 66.0 67.0 77.0 74.0 73.0 75.0 58.0 66.0 65.0 73.0 61.0 64.0 72.0 88.0 70.0 32.0 31.0 30.0 9.0 20.0 73.0 20.0 20.0 5.0 9.0 forward
183975 39902 Zvjezdan Misimovic 1982-06-05 180.34 176 2008-08-30 77.0 80.0 right medium low 74.0 71.0 53.0 84.0 77.0 85.0 86.0 74.0 73.0 86.0 66.0 67.0 77.0 74.0 73.0 75.0 58.0 66.0 65.0 73.0 67.0 64.0 72.0 88.0 70.0 32.0 31.0 30.0 9.0 20.0 73.0 20.0 20.0 5.0 9.0 forward
183976 39902 Zvjezdan Misimovic 1982-06-05 180.34 176 2007-08-30 78.0 81.0 right medium low 74.0 64.0 57.0 86.0 77.0 87.0 86.0 73.0 73.0 91.0 61.0 60.0 77.0 69.0 73.0 72.0 58.0 67.0 59.0 78.0 63.0 63.0 68.0 88.0 53.0 28.0 32.0 30.0 9.0 20.0 73.0 20.0 20.0 5.0 9.0 forward
183977 39902 Zvjezdan Misimovic 1982-06-05 180.34 176 2007-02-22 80.0 81.0 right medium low 74.0 68.0 57.0 88.0 77.0 87.0 86.0 53.0 78.0 91.0 58.0 64.0 77.0 66.0 73.0 72.0 58.0 67.0 59.0 78.0 63.0 63.0 68.0 88.0 53.0 38.0 32.0 30.0 9.0 9.0 78.0 7.0 15.0 5.0 9.0 forward

183978 rows × 47 columns

player_num_attr = ["player_api_id", "player_measured_date", "height", "weight",
                   'overall_rating', 'potential', 'crossing', 'finishing', 'heading_accuracy',
                   'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy',
                   'long_passing', 'ball_control', 'acceleration', 'sprint_speed', 'agility', 
                   'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
                   'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
                   'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
                   'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning','gk_reflexes']
df_player_num_attr = df_player[player_num_attr]

1. Missing values

msno.matrix(df_player_num_attr)
<AxesSubplot:>

png

  • If overall_rating is missing, all other attributes are also msissing.
df_player_num_attr.overall_rating.isna().sum()
836
  • There are only 836 cases that overall_rating and all other attributes are missing.
len(df_player_num_attr[~df_player_num_attr.overall_rating.isna()].player_api_id.unique())
11060
  • Also, even if we drop the cases that overall_rating is missing, we can check that each player’s attributes have been measured at least once.
  • So let’s just drop the cases that overall_rating is missing.
df_player_num_attr = df_player_num_attr[~df_player_num_attr.overall_rating.isna()]
msno.matrix(df_player_num_attr.sort_values(["player_api_id", "player_measured_date"]))
<AxesSubplot:>

png

df_player_num_attr.isna().sum()[df_player_num_attr.isna().sum() > 0]
volleys           1877
curve             1877
agility           1877
balance           1877
jumping           1877
vision            1877
sliding_tackle    1877
dtype: int64
  • There are some missing values in voleys, curve, agility, balance, jumping, vision, sliding_tackle variables.
  • Let’s check the correlation between attributes to check whether missing values ​​can be imputed by other variables.
plt.figure(figsize = (20, 5))

corr = round(df_player_num_attr[player_num_attr].corr(), 1)
corr = corr.loc[["volleys", "curve", "agility", "balance", "jumping", "vision", "sliding_tackle"], :]

sns.heatmap(corr, annot = True, cmap = "BrBG", vmin = -1, vmax = 1, 
            linewidths = 0.5, cbar_kws = {"shrink" : 0.5})

<AxesSubplot:>

png

  • We can see that there are absolute correlations over 0.7 between:
    • volleys: finishing, dribbling, curve, positioning, long_shots, dribbling, positioning, curve, ball_control, shot_power, penalties
    • curve: dribbling, long_shots, crossing, ball_control, free_kick_accuracy, volleys, short_passing, vision, positioning
    • agility: acceleration, sprint_speed, dribbling
    • balance: agility, height
    • jumping: none
    • vision: ball_control, short_passing, positioning, dribbling, long_shots, curve, free_kick_accuracy, crossing, volleys, long_passing, penalties, finishing
    • sliding_tackle: standing_tackle, marking, interceptions, aggression
  • There are plenty of other variables that have high correlation with the variables that have the missing values, except for the balance and jumping.
  • So, it seems to be possible to use the iterative imputation for the missing values from the player attributes, except for the balance and jumping.

  • However, since only 1%(1877 / 183142) of the data have missing values in banace and jumping columns, let’s also use the iterative imputation even for the balance and jumping.
target = df_player_num_attr.set_index(["player_api_id", "player_measured_date"])

index = target.index
col_names = target.columns

imp_player_attr = IterativeImputer(max_iter = 10, random_state = 42)
imp_player_attr.fit(target)
df_player_num_attr_imputed = pd.DataFrame(imp_player_attr.transform(target), 
                                          columns = col_names, index = index)

msno.matrix(df_player_num_attr_imputed)
<AxesSubplot:>

png

df_player_num_attr_imputed = df_player_num_attr_imputed.reset_index()
df_player_num_attr_imputed
player_api_id player_measured_date height weight overall_rating potential crossing finishing heading_accuracy short_passing volleys dribbling curve free_kick_accuracy long_passing ball_control acceleration sprint_speed agility reactions balance shot_power jumping stamina strength long_shots aggression interceptions positioning vision penalties marking standing_tackle sliding_tackle gk_diving gk_handling gk_kicking gk_positioning gk_reflexes
0 505942 2016-02-18 182.88 187.0 67.0 71.0 49.0 44.0 71.0 61.0 44.0 51.0 45.0 39.0 64.0 49.0 60.0 64.0 59.0 47.0 65.0 55.0 58.0 54.0 76.0 35.0 71.0 70.0 45.0 54.0 48.0 65.0 69.0 69.0 6.0 11.0 10.0 8.0 8.0
1 505942 2015-11-19 182.88 187.0 67.0 71.0 49.0 44.0 71.0 61.0 44.0 51.0 45.0 39.0 64.0 49.0 60.0 64.0 59.0 47.0 65.0 55.0 58.0 54.0 76.0 35.0 71.0 70.0 45.0 54.0 48.0 65.0 69.0 69.0 6.0 11.0 10.0 8.0 8.0
2 505942 2015-09-21 182.88 187.0 62.0 66.0 49.0 44.0 71.0 61.0 44.0 51.0 45.0 39.0 64.0 49.0 60.0 64.0 59.0 47.0 65.0 55.0 58.0 54.0 76.0 35.0 63.0 41.0 45.0 54.0 48.0 65.0 66.0 69.0 6.0 11.0 10.0 8.0 8.0
3 505942 2015-03-20 182.88 187.0 61.0 65.0 48.0 43.0 70.0 60.0 43.0 50.0 44.0 38.0 63.0 48.0 60.0 64.0 59.0 46.0 65.0 54.0 58.0 54.0 76.0 34.0 62.0 40.0 44.0 53.0 47.0 62.0 63.0 66.0 5.0 10.0 9.0 7.0 7.0
4 505942 2007-02-22 182.88 187.0 61.0 65.0 48.0 43.0 70.0 60.0 43.0 50.0 44.0 38.0 63.0 48.0 60.0 64.0 59.0 46.0 65.0 54.0 58.0 54.0 76.0 34.0 62.0 40.0 44.0 53.0 47.0 62.0 63.0 66.0 5.0 10.0 9.0 7.0 7.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
183137 39902 2009-08-30 180.34 176.0 83.0 85.0 84.0 77.0 59.0 89.0 77.0 84.0 86.0 78.0 84.0 85.0 66.0 72.0 77.0 86.0 73.0 76.0 58.0 72.0 67.0 81.0 56.0 78.0 86.0 88.0 83.0 22.0 31.0 30.0 9.0 20.0 84.0 20.0 20.0
183138 39902 2009-02-22 180.34 176.0 78.0 80.0 74.0 76.0 53.0 84.0 77.0 85.0 86.0 74.0 73.0 86.0 66.0 67.0 77.0 74.0 73.0 75.0 58.0 66.0 65.0 73.0 61.0 64.0 72.0 88.0 70.0 32.0 31.0 30.0 9.0 20.0 73.0 20.0 20.0
183139 39902 2008-08-30 180.34 176.0 77.0 80.0 74.0 71.0 53.0 84.0 77.0 85.0 86.0 74.0 73.0 86.0 66.0 67.0 77.0 74.0 73.0 75.0 58.0 66.0 65.0 73.0 67.0 64.0 72.0 88.0 70.0 32.0 31.0 30.0 9.0 20.0 73.0 20.0 20.0
183140 39902 2007-08-30 180.34 176.0 78.0 81.0 74.0 64.0 57.0 86.0 77.0 87.0 86.0 73.0 73.0 91.0 61.0 60.0 77.0 69.0 73.0 72.0 58.0 67.0 59.0 78.0 63.0 63.0 68.0 88.0 53.0 28.0 32.0 30.0 9.0 20.0 73.0 20.0 20.0
183141 39902 2007-02-22 180.34 176.0 80.0 81.0 74.0 68.0 57.0 88.0 77.0 87.0 86.0 53.0 78.0 91.0 58.0 64.0 77.0 66.0 73.0 72.0 58.0 67.0 59.0 78.0 63.0 63.0 68.0 88.0 53.0 38.0 32.0 30.0 9.0 9.0 78.0 7.0 15.0

183142 rows × 39 columns

2. Dimension reduction by PCA

  • In the match data, we have the information of 22 players id from the each team.
  • Since we will merge the player attributes to the match data, there are too many columns: 22 players * 35 attributes = 770 columns.
df_player_grouped_num_attr = df_player_num_attr_imputed.drop(["player_measured_date", "height", "weight"], axis = 1).groupby("player_api_id").mean()
df_player_grouped_num_attr
overall_rating potential crossing finishing heading_accuracy short_passing volleys dribbling curve free_kick_accuracy long_passing ball_control acceleration sprint_speed agility reactions balance shot_power jumping stamina strength long_shots aggression interceptions positioning vision penalties marking standing_tackle sliding_tackle gk_diving gk_handling gk_kicking gk_positioning gk_reflexes
player_api_id
2625 60.142857 61.142857 50.142857 47.285714 46.285714 56.500000 38.000000 54.142857 50.000000 49.928571 65.571429 59.142857 66.857143 63.285714 66.428571 49.714286 66.214286 66.714286 58.714286 77.285714 55.785714 56.857143 71.285714 62.071429 50.357143 55.285714 61.928571 62.928571 63.857143 56.571429 12.428571 12.857143 19.071429 10.357143 10.428571
2752 69.380952 70.380952 36.428571 37.428571 75.333333 54.809524 20.428571 44.857143 30.428571 18.857143 59.571429 55.571429 41.619048 42.190476 43.333333 60.857143 44.571429 57.428571 56.523810 60.904762 82.238095 19.571429 80.285714 71.428571 31.428571 52.428571 35.428571 71.095238 70.666667 65.571429 11.095238 6.095238 7.095238 9.095238 15.095238
2768 69.285714 70.571429 42.238095 42.761905 66.666667 64.714286 28.761905 40.238095 51.761905 20.047619 57.000000 56.380952 56.952381 58.904762 58.047619 69.809524 60.190476 38.761905 76.904762 59.285714 67.428571 19.428571 62.761905 74.714286 44.190476 57.761905 31.904762 71.666667 70.619048 79.714286 10.761905 15.904762 22.714286 15.142857 12.095238
2770 71.133333 73.533333 61.866667 67.666667 67.800000 72.533333 67.133333 75.066667 79.800000 68.666667 70.266667 76.733333 62.200000 59.800000 62.600000 65.933333 61.266667 68.333333 57.133333 51.000000 62.600000 73.000000 56.800000 44.933333 64.533333 71.600000 63.933333 30.200000 37.533333 25.000000 8.666667 15.666667 27.200000 16.333333 17.000000
2790 70.200000 75.800000 70.000000 37.400000 55.200000 68.000000 43.000000 60.800000 67.000000 55.400000 69.800000 66.800000 70.800000 71.200000 65.000000 69.200000 67.000000 46.400000 61.000000 70.400000 71.000000 42.400000 68.400000 63.400000 60.600000 60.000000 55.400000 74.600000 74.600000 69.000000 8.000000 17.000000 59.000000 16.600000 17.400000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
744907 51.909091 65.909091 42.454545 44.454545 47.454545 59.454545 42.454545 56.454545 46.454545 43.454545 58.454545 55.454545 66.000000 64.000000 61.000000 53.454545 74.000000 50.454545 63.000000 54.000000 49.909091 38.454545 36.454545 22.454545 48.454545 51.454545 52.454545 39.454545 48.454545 49.454545 8.454545 13.454545 12.454545 11.454545 8.454545
746419 59.000000 66.000000 55.000000 28.000000 50.000000 35.000000 29.000000 67.000000 37.000000 39.000000 35.000000 55.000000 77.000000 76.000000 70.000000 60.000000 74.000000 29.000000 55.000000 63.000000 46.000000 27.000000 59.000000 53.000000 55.000000 40.000000 43.000000 55.000000 69.000000 62.000000 7.000000 10.000000 7.000000 9.000000 9.000000
748432 58.000000 68.000000 48.000000 26.000000 57.000000 49.000000 24.000000 41.000000 39.000000 20.000000 41.000000 42.000000 64.000000 68.000000 50.000000 48.000000 52.000000 46.000000 68.000000 50.000000 67.000000 25.000000 52.000000 55.000000 38.000000 36.000000 45.000000 63.000000 69.000000 68.000000 8.000000 8.000000 12.000000 12.000000 6.000000
750435 56.444444 70.444444 35.000000 57.000000 55.000000 60.000000 52.000000 58.555556 56.000000 37.000000 54.000000 56.333333 65.666667 63.555556 60.222222 61.000000 78.000000 57.000000 58.000000 57.000000 46.000000 50.000000 34.000000 13.000000 56.333333 58.000000 61.000000 18.000000 19.000000 21.000000 9.000000 10.000000 8.000000 10.000000 11.000000
750584 58.000000 66.000000 47.000000 39.000000 53.000000 60.000000 37.000000 54.000000 40.000000 45.000000 55.000000 56.000000 63.000000 60.000000 51.000000 54.000000 76.000000 59.000000 58.000000 55.000000 57.000000 38.000000 62.000000 56.000000 41.000000 47.000000 52.000000 56.000000 66.000000 62.000000 8.000000 9.000000 13.000000 11.000000 13.000000

11060 rows × 35 columns

plt.figure(figsize = (20, 20))

corr = round(df_player_grouped_num_attr.corr(), 1)

sns.heatmap(corr, annot = True, cmap = "BrBG", vmin = -1, vmax = 1, 
            linewidths = 0.5, cbar_kws = {"shrink" : 0.5})
<AxesSubplot:>

png

  • Aslo, correlations between player numerical attributes are very high.
  • So, I will use the PCA to reduce the dimension of the numerical player attributes.

  • Note that we set the the last season (2015/2016 seasons) as the test set.
  • So we will fit the PCA to the only train data and transform all data by fitted PC loadings.

  • Check the season start month to exclude the player attributes information measured before the 2015 season.
df_match_basic = pd.read_csv("../data/df_match_basic.csv")
for season in ["2015/2016"]:
    print(f"== Season: {season}")
    print(df_match_basic[df_match_basic.season == season].sort_values("match_date").groupby(["league_id", "season"]).min().match_date)
== Season: 2015/2016
league_id  season   
1          2015/2016    2015-07-24
1729       2015/2016    2015-08-08
4769       2015/2016    2015-08-07
7809       2015/2016    2015-08-14
10257      2015/2016    2015-08-22
13274      2015/2016    2015-08-08
15722      2015/2016    2015-07-17
17642      2015/2016    2015-08-14
19694      2015/2016    2015-08-01
21518      2015/2016    2015-08-21
24558      2015/2016    2015-07-18
Name: match_date, dtype: object
  • We can use the player attributes information measured before the 2015-07-17
player_cutoff_date = pd.to_datetime("2015-07-17")
player_cutoff_date
Timestamp('2015-07-17 00:00:00')
df_player_num_attr_imputed
player_api_id player_measured_date height weight overall_rating potential crossing finishing heading_accuracy short_passing volleys dribbling curve free_kick_accuracy long_passing ball_control acceleration sprint_speed agility reactions balance shot_power jumping stamina strength long_shots aggression interceptions positioning vision penalties marking standing_tackle sliding_tackle gk_diving gk_handling gk_kicking gk_positioning gk_reflexes
0 505942 2016-02-18 182.88 187.0 67.0 71.0 49.0 44.0 71.0 61.0 44.0 51.0 45.0 39.0 64.0 49.0 60.0 64.0 59.0 47.0 65.0 55.0 58.0 54.0 76.0 35.0 71.0 70.0 45.0 54.0 48.0 65.0 69.0 69.0 6.0 11.0 10.0 8.0 8.0
1 505942 2015-11-19 182.88 187.0 67.0 71.0 49.0 44.0 71.0 61.0 44.0 51.0 45.0 39.0 64.0 49.0 60.0 64.0 59.0 47.0 65.0 55.0 58.0 54.0 76.0 35.0 71.0 70.0 45.0 54.0 48.0 65.0 69.0 69.0 6.0 11.0 10.0 8.0 8.0
2 505942 2015-09-21 182.88 187.0 62.0 66.0 49.0 44.0 71.0 61.0 44.0 51.0 45.0 39.0 64.0 49.0 60.0 64.0 59.0 47.0 65.0 55.0 58.0 54.0 76.0 35.0 63.0 41.0 45.0 54.0 48.0 65.0 66.0 69.0 6.0 11.0 10.0 8.0 8.0
3 505942 2015-03-20 182.88 187.0 61.0 65.0 48.0 43.0 70.0 60.0 43.0 50.0 44.0 38.0 63.0 48.0 60.0 64.0 59.0 46.0 65.0 54.0 58.0 54.0 76.0 34.0 62.0 40.0 44.0 53.0 47.0 62.0 63.0 66.0 5.0 10.0 9.0 7.0 7.0
4 505942 2007-02-22 182.88 187.0 61.0 65.0 48.0 43.0 70.0 60.0 43.0 50.0 44.0 38.0 63.0 48.0 60.0 64.0 59.0 46.0 65.0 54.0 58.0 54.0 76.0 34.0 62.0 40.0 44.0 53.0 47.0 62.0 63.0 66.0 5.0 10.0 9.0 7.0 7.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
183137 39902 2009-08-30 180.34 176.0 83.0 85.0 84.0 77.0 59.0 89.0 77.0 84.0 86.0 78.0 84.0 85.0 66.0 72.0 77.0 86.0 73.0 76.0 58.0 72.0 67.0 81.0 56.0 78.0 86.0 88.0 83.0 22.0 31.0 30.0 9.0 20.0 84.0 20.0 20.0
183138 39902 2009-02-22 180.34 176.0 78.0 80.0 74.0 76.0 53.0 84.0 77.0 85.0 86.0 74.0 73.0 86.0 66.0 67.0 77.0 74.0 73.0 75.0 58.0 66.0 65.0 73.0 61.0 64.0 72.0 88.0 70.0 32.0 31.0 30.0 9.0 20.0 73.0 20.0 20.0
183139 39902 2008-08-30 180.34 176.0 77.0 80.0 74.0 71.0 53.0 84.0 77.0 85.0 86.0 74.0 73.0 86.0 66.0 67.0 77.0 74.0 73.0 75.0 58.0 66.0 65.0 73.0 67.0 64.0 72.0 88.0 70.0 32.0 31.0 30.0 9.0 20.0 73.0 20.0 20.0
183140 39902 2007-08-30 180.34 176.0 78.0 81.0 74.0 64.0 57.0 86.0 77.0 87.0 86.0 73.0 73.0 91.0 61.0 60.0 77.0 69.0 73.0 72.0 58.0 67.0 59.0 78.0 63.0 63.0 68.0 88.0 53.0 28.0 32.0 30.0 9.0 20.0 73.0 20.0 20.0
183141 39902 2007-02-22 180.34 176.0 80.0 81.0 74.0 68.0 57.0 88.0 77.0 87.0 86.0 53.0 78.0 91.0 58.0 64.0 77.0 66.0 73.0 72.0 58.0 67.0 59.0 78.0 63.0 63.0 68.0 88.0 53.0 38.0 32.0 30.0 9.0 9.0 78.0 7.0 15.0

183142 rows × 39 columns

player_cutoff_bool = pd.to_datetime(df_player_num_attr_imputed.player_measured_date) < player_cutoff_date
df_player_grouped_num_attr = df_player_num_attr_imputed.drop(["player_measured_date", "height", "weight"], axis = 1)[player_cutoff_bool].groupby("player_api_id").mean()
df_player_grouped_num_attr
overall_rating potential crossing finishing heading_accuracy short_passing volleys dribbling curve free_kick_accuracy long_passing ball_control acceleration sprint_speed agility reactions balance shot_power jumping stamina strength long_shots aggression interceptions positioning vision penalties marking standing_tackle sliding_tackle gk_diving gk_handling gk_kicking gk_positioning gk_reflexes
player_api_id
2625 60.142857 61.142857 50.142857 47.285714 46.285714 56.500000 38.000000 54.142857 50.000000 49.928571 65.571429 59.142857 66.857143 63.285714 66.428571 49.714286 66.214286 66.714286 58.714286 77.285714 55.785714 56.857143 71.285714 62.071429 50.357143 55.285714 61.928571 62.928571 63.857143 56.571429 12.428571 12.857143 19.071429 10.357143 10.428571
2752 69.105263 70.210526 36.157895 37.157895 74.631579 54.368421 20.157895 45.263158 30.157895 18.842105 59.421053 55.421053 42.578947 42.526316 44.315789 60.526316 45.157895 57.157895 56.684211 60.789474 81.947368 19.526316 79.894737 71.473684 31.157895 52.157895 35.157895 71.000000 70.315789 65.421053 11.000000 6.000000 7.000000 9.000000 15.000000
2768 67.562500 69.250000 41.687500 42.062500 65.000000 61.562500 28.062500 39.062500 52.312500 19.437500 56.062500 54.312500 58.687500 60.687500 58.062500 67.562500 59.937500 38.062500 76.250000 64.000000 66.000000 19.250000 62.062500 72.812500 45.500000 57.062500 36.875000 69.812500 68.687500 80.062500 10.375000 16.187500 25.750000 15.500000 12.750000
2770 71.133333 73.533333 61.866667 67.666667 67.800000 72.533333 67.133333 75.066667 79.800000 68.666667 70.266667 76.733333 62.200000 59.800000 62.600000 65.933333 61.266667 68.333333 57.133333 51.000000 62.600000 73.000000 56.800000 44.933333 64.533333 71.600000 63.933333 30.200000 37.533333 25.000000 8.666667 15.666667 27.200000 16.333333 17.000000
2790 70.200000 75.800000 70.000000 37.400000 55.200000 68.000000 43.000000 60.800000 67.000000 55.400000 69.800000 66.800000 70.800000 71.200000 65.000000 69.200000 67.000000 46.400000 61.000000 70.400000 71.000000 42.400000 68.400000 63.400000 60.600000 60.000000 55.400000 74.600000 74.600000 69.000000 8.000000 17.000000 59.000000 16.600000 17.400000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
744907 51.000000 65.000000 42.000000 44.000000 47.000000 59.000000 42.000000 56.000000 46.000000 43.000000 58.000000 55.000000 66.000000 64.000000 61.000000 53.000000 74.000000 50.000000 63.000000 54.000000 49.000000 38.000000 36.000000 22.000000 48.000000 51.000000 52.000000 39.000000 48.000000 49.000000 8.000000 13.000000 12.000000 11.000000 8.000000
746419 59.000000 66.000000 55.000000 28.000000 50.000000 35.000000 29.000000 67.000000 37.000000 39.000000 35.000000 55.000000 77.000000 76.000000 70.000000 60.000000 74.000000 29.000000 55.000000 63.000000 46.000000 27.000000 59.000000 53.000000 55.000000 40.000000 43.000000 55.000000 69.000000 62.000000 7.000000 10.000000 7.000000 9.000000 9.000000
748432 58.000000 68.000000 48.000000 26.000000 57.000000 49.000000 24.000000 41.000000 39.000000 20.000000 41.000000 42.000000 64.000000 68.000000 50.000000 48.000000 52.000000 46.000000 68.000000 50.000000 67.000000 25.000000 52.000000 55.000000 38.000000 36.000000 45.000000 63.000000 69.000000 68.000000 8.000000 8.000000 12.000000 12.000000 6.000000
750435 56.000000 70.000000 35.000000 57.000000 55.000000 60.000000 52.000000 58.000000 56.000000 37.000000 54.000000 56.000000 64.000000 62.000000 55.600000 61.000000 78.000000 57.000000 58.000000 57.000000 46.000000 50.000000 34.000000 13.000000 56.000000 58.000000 61.000000 18.000000 19.000000 21.000000 9.000000 10.000000 8.000000 10.000000 11.000000
750584 58.000000 66.000000 47.000000 39.000000 53.000000 60.000000 37.000000 54.000000 40.000000 45.000000 55.000000 56.000000 63.000000 60.000000 51.000000 54.000000 76.000000 59.000000 58.000000 55.000000 57.000000 38.000000 62.000000 56.000000 41.000000 47.000000 52.000000 56.000000 66.000000 62.000000 8.000000 9.000000 13.000000 11.000000 13.000000

11060 rows × 35 columns

pipe = Pipeline([
    ('scale',StandardScaler()),
    ('pca', PCA(n_components = 10, random_state = 42)),
])
res = pipe.fit_transform(df_player_grouped_num_attr)
plt.figure(figsize = (14, 8))

plt.bar(range(1,len(pipe.named_steps.pca.explained_variance_ratio_) + 1), pipe.named_steps.pca.explained_variance_ratio_, alpha=0.5, align='center', label='Individual explained variance')
plt.step(range(1,len(np.cumsum(pipe.named_steps.pca.explained_variance_ratio_)) + 1), np.cumsum(pipe.named_steps.pca.explained_variance_ratio_), where='mid',label='Cumulative explained variance')
plt.ylabel('Explained variance ratio', fontsize = 14)
plt.xlabel('Principal component index', fontsize = 14)
plt.legend(loc = 'best')
plt.tight_layout()
plt.show()

png

  • 5 variables can explain about 80% of the total variance.
  • That is, PCA variables can reduce the number of variables without significant loss of information.

  • Let’s check 5 PC variables.
pipe = Pipeline([
    ('scale',StandardScaler()),
    ('pca', PCA(n_components = 5, random_state = 42)),
])
res = pipe.fit_transform(df_player_grouped_num_attr)
_ = pca_results(df_player_grouped_num_attr, pipe.named_steps.pca)

png

  • Since our goal is to reduce the numerical attributes variables without significant loss of information, let’s just use 5 PC variables for each player.
  • So for each match, we reduced from 22 players * 35 attributes = 770 columns to 22 players * 5 PC = 110 columns.
cols = df_player_grouped_num_attr.columns.tolist()
cols.append("player_api_id")
cols.append("player_measured_date")
df_player_num_attr_pc  = pipe.fit_transform(df_player_num_attr_imputed[cols].set_index(["player_api_id", "player_measured_date"]))
df_player_num_attr_pc = pd.DataFrame(df_player_num_attr_pc, columns = ["PC1", "PC2", "PC3", "PC4", "PC5"], 
                                     index = df_player_num_attr_imputed[cols].set_index(["player_api_id", "player_measured_date"]).index).reset_index()
df_player_num_attr_pc
player_api_id player_measured_date PC1 PC2 PC3 PC4 PC5
0 505942 2016-02-18 1.046944 2.688259 -1.529901 0.360865 0.548021
1 505942 2015-11-19 1.046944 2.688259 -1.529901 0.360865 0.548021
2 505942 2015-09-21 1.400003 2.036745 -2.359132 0.536742 0.502532
3 505942 2015-03-20 1.615408 1.927571 -2.657865 0.548879 0.399693
4 505942 2007-02-22 1.615408 1.927571 -2.657865 0.548879 0.399693
... ... ... ... ... ... ... ...
183137 39902 2009-08-30 -4.991884 -2.921361 4.090605 0.593824 2.434563
183138 39902 2009-02-22 -3.766452 -2.749777 2.543465 0.492997 2.349990
183139 39902 2008-08-30 -3.745176 -2.589656 2.519182 0.452451 2.349129
183140 39902 2007-08-30 -3.382604 -2.497503 2.251858 0.317332 2.726097
183141 39902 2007-02-22 -3.659867 -2.058531 1.834322 0.277823 2.463788

183142 rows × 7 columns

player_num_collected = pd.DataFrame(df_player_num_attr_pc.groupby("player_api_id").count().player_measured_date)
player_num_collected.columns = ["num_collected"]
player_num_collected
num_collected
player_api_id
2625 14
2752 21
2768 21
2770 15
2790 5
... ...
744907 11
746419 2
748432 2
750435 9
750584 2

11060 rows × 1 columns

df_player_num_attr_pc.groupby("player_api_id").count().player_measured_date.describe()
count    11060.000000
mean        16.558951
std          9.361642
min          2.000000
25%          8.000000
50%         16.000000
75%         23.000000
max         56.000000
Name: player_measured_date, dtype: float64
my_histogram(player_num_collected, "num_collected")

png

  • Players’ attributes are collected in different days.
  • The player’s characteristic data was collected 2 times for the fewest players and 56 times for the most players.
  • So, we will use player attributes collected closest to match dates.

  • Let’s merge all starting players’ PC variables collected closest to match dates.
df_match_player = pd.read_csv("../data/df_match_player.csv")
df_match_basic[["match_api_id", "match_date"]].merge(df_match_player, how = "left", on = "match_api_id")

match_api_id match_date home_player_X1 home_player_X2 home_player_X3 home_player_X4 home_player_X5 home_player_X6 home_player_X7 home_player_X8 home_player_X9 home_player_X10 home_player_X11 away_player_X1 away_player_X2 away_player_X3 away_player_X4 away_player_X5 away_player_X6 away_player_X7 away_player_X8 away_player_X9 away_player_X10 away_player_X11 home_player_Y1 home_player_Y2 home_player_Y3 home_player_Y4 home_player_Y5 home_player_Y6 home_player_Y7 home_player_Y8 home_player_Y9 home_player_Y10 home_player_Y11 away_player_Y1 away_player_Y2 away_player_Y3 away_player_Y4 away_player_Y5 away_player_Y6 away_player_Y7 away_player_Y8 away_player_Y9 away_player_Y10 away_player_Y11 home_player_1 home_player_2 home_player_3 home_player_4 home_player_5 home_player_6 home_player_7 home_player_8 home_player_9 home_player_10 home_player_11 away_player_1 away_player_2 away_player_3 away_player_4 away_player_5 away_player_6 away_player_7 away_player_8 away_player_9 away_player_10 away_player_11
0 492473 2008-08-17 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 492474 2008-08-16 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 492475 2008-08-16 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 492476 2008-08-17 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 492477 2008-08-16 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
25974 1992091 2015-09-22 1.0 2.0 4.0 6.0 8.0 4.0 6.0 3.0 5.0 7.0 5.0 1.0 2.0 4.0 6.0 8.0 4.0 6.0 3.0 5.0 7.0 5.0 1.0 3.0 3.0 3.0 3.0 6.0 6.0 8.0 8.0 8.0 11.0 1.0 3.0 3.0 3.0 3.0 6.0 6.0 8.0 8.0 8.0 11.0 42231.0 678384.0 95220.0 638592.0 413155.0 45780.0 171229.0 67333.0 119839.0 143790.0 195215.0 462944.0 563066.0 8800.0 67304.0 158253.0 133126.0 186524.0 93223.0 121115.0 232110.0 289732.0
25975 1992092 2015-09-23 1.0 3.0 5.0 7.0 2.0 4.0 6.0 8.0 5.0 4.0 6.0 1.0 2.0 4.0 6.0 8.0 3.0 5.0 7.0 3.0 5.0 7.0 1.0 3.0 3.0 3.0 7.0 7.0 7.0 7.0 9.0 11.0 11.0 1.0 3.0 3.0 3.0 3.0 7.0 7.0 7.0 10.0 10.0 10.0 33272.0 41621.0 25813.0 257845.0 114735.0 42237.0 113227.0 358156.0 32343.0 531309.0 37257.0 42276.0 114792.0 150007.0 178119.0 27232.0 570830.0 260708.0 201704.0 36382.0 34082.0 95257.0
25976 1992093 2015-09-23 1.0 2.0 4.0 6.0 8.0 4.0 6.0 3.0 5.0 7.0 5.0 1.0 2.0 4.0 6.0 8.0 4.0 6.0 3.0 5.0 7.0 5.0 1.0 3.0 3.0 3.0 3.0 6.0 6.0 8.0 8.0 8.0 11.0 1.0 3.0 3.0 3.0 3.0 6.0 6.0 8.0 8.0 8.0 11.0 157856.0 274779.0 177689.0 294256.0 42258.0 39979.0 173936.0 147959.0 451983.0 80564.0 289472.0 10637.0 67349.0 202663.0 32597.0 114794.0 188114.0 25840.0 482200.0 95230.0 451335.0 275122.0
25977 1992094 2015-09-22 1.0 2.0 4.0 6.0 8.0 3.0 7.0 5.0 3.0 5.0 7.0 1.0 2.0 4.0 6.0 8.0 4.0 6.0 3.0 5.0 7.0 5.0 1.0 3.0 3.0 3.0 3.0 7.0 7.0 7.0 10.0 10.0 10.0 1.0 3.0 3.0 3.0 3.0 6.0 6.0 8.0 8.0 8.0 11.0 NaN 8881.0 173534.0 39646.0 282287.0 340790.0 393337.0 8893.0 614454.0 93229.0 178142.0 274776.0 121080.0 197757.0 260964.0 231614.0 113235.0 41116.0 462608.0 42262.0 92252.0 194532.0
25978 1992095 2015-09-23 1.0 2.0 4.0 6.0 8.0 2.0 4.0 6.0 8.0 4.0 6.0 1.0 2.0 4.0 6.0 8.0 4.0 6.0 3.0 5.0 7.0 5.0 1.0 3.0 3.0 3.0 3.0 7.0 7.0 7.0 7.0 10.0 10.0 1.0 3.0 3.0 3.0 3.0 6.0 6.0 8.0 8.0 8.0 11.0 274787.0 492132.0 108451.0 25815.0 94553.0 384376.0 598355.0 36785.0 45174.0 302079.0 71764.0 156175.0 95216.0 172768.0 22834.0 458806.0 207234.0 25772.0 40274.0 34035.0 41726.0 527103.0

25979 rows × 68 columns

df_match_lineup = df_match_basic[["match_api_id", "match_date"]].merge(df_match_player, how = "left", on = "match_api_id") \
                                [["match_date", "match_api_id",
                                  "home_player_1", "home_player_2", "home_player_3", "home_player_4", 
                                  "home_player_5", "home_player_6", "home_player_7", "home_player_8",
                                  "home_player_9", "home_player_10", "home_player_11", 
                                  "away_player_1", "away_player_2", "away_player_3", "away_player_4", 
                                  "away_player_5", "away_player_6", "away_player_7", "away_player_8",
                                  "away_player_9", "away_player_10", "away_player_11"]]
msno.matrix(df_match_lineup.sort_values("match_date"))
<AxesSubplot:>

png

  • If there is at least one missing value in the starting lineup, let’s exclude the match from the analysis.
df_match_lineup = df_match_lineup.dropna()
df_match_lineup.shape
(21374, 24)
  • Use the pc variables measured closest to the match date for each player.
df_player_num_attr_pc
player_api_id player_measured_date PC1 PC2 PC3 PC4 PC5
0 505942 2016-02-18 1.046944 2.688259 -1.529901 0.360865 0.548021
1 505942 2015-11-19 1.046944 2.688259 -1.529901 0.360865 0.548021
2 505942 2015-09-21 1.400003 2.036745 -2.359132 0.536742 0.502532
3 505942 2015-03-20 1.615408 1.927571 -2.657865 0.548879 0.399693
4 505942 2007-02-22 1.615408 1.927571 -2.657865 0.548879 0.399693
... ... ... ... ... ... ... ...
183137 39902 2009-08-30 -4.991884 -2.921361 4.090605 0.593824 2.434563
183138 39902 2009-02-22 -3.766452 -2.749777 2.543465 0.492997 2.349990
183139 39902 2008-08-30 -3.745176 -2.589656 2.519182 0.452451 2.349129
183140 39902 2007-08-30 -3.382604 -2.497503 2.251858 0.317332 2.726097
183141 39902 2007-02-22 -3.659867 -2.058531 1.834322 0.277823 2.463788

183142 rows × 7 columns

df_match_player_attr_pcs = df_match_lineup[["match_api_id"]]
for i, col in enumerate(["home_player_1", "home_player_2", "home_player_3", "home_player_4", 
                         "home_player_5", "home_player_6", "home_player_7", "home_player_8",
                         "home_player_9", "home_player_10", "home_player_11"]):
    
        merged = df_match_lineup[["match_date", "match_api_id", col]].merge(df_player_num_attr_pc,
                                                                            how = "left", left_on = col,  right_on = "player_api_id") \
                                                                            .sort_values(["player_api_id", "match_date", "player_measured_date"])      
                                                  
        merged["date_diff"] = (pd.to_datetime(merged.player_measured_date) - pd.to_datetime(merged.match_date)).dt.days
        merged = merged[merged.date_diff < 0]
        merged_get_first = merged.sort_values(["player_api_id", "match_date", "date_diff"], ascending = [True, True, False]).groupby(["player_api_id", "match_date"]).first()
        merged_get_first = merged_get_first[["match_api_id", "PC1", "PC2", "PC3", "PC4", "PC5"]].rename(columns = {"PC1": f"home_player_{i + 1}_pc_1", 
                                                                                                                   "PC2": f"home_player_{i + 1}_pc_2",
                                                                                                                   "PC3": f"home_player_{i + 1}_pc_3",
                                                                                                                   "PC4": f"home_player_{i + 1}_pc_4",
                                                                                                                   "PC5": f"home_player_{i + 1}_pc_5"})
        
        
        df_match_player_attr_pcs = df_match_player_attr_pcs.merge(merged_get_first, how = "left", on = "match_api_id")
for i, col in enumerate(["away_player_1", "away_player_2", "away_player_3", "away_player_4", 
                         "away_player_5", "away_player_6", "away_player_7", "away_player_8",
                         "away_player_9", "away_player_10", "away_player_11"]):
    
        merged = df_match_lineup[["match_date", "match_api_id", col]].merge(df_player_num_attr_pc,
                                                                            how = "left", left_on = col,  right_on = "player_api_id") \
                                                                            .sort_values(["player_api_id", "match_date", "player_measured_date"])      
                                                  
        merged["date_diff"] = (pd.to_datetime(merged.player_measured_date) - pd.to_datetime(merged.match_date)).dt.days
        merged = merged[merged.date_diff < 0]
        merged_get_first = merged.sort_values(["player_api_id", "match_date", "date_diff"], ascending = [True, True, False]).groupby(["player_api_id", "match_date"]).first()
        merged_get_first = merged_get_first[["match_api_id", "PC1", "PC2", "PC3", "PC4", "PC5"]].rename(columns = {"PC1": f"away_player_{i + 1}_pc_1", 
                                                                                                                   "PC2": f"away_player_{i + 1}_pc_2",
                                                                                                                   "PC3": f"away_player_{i + 1}_pc_3",
                                                                                                                   "PC4": f"away_player_{i + 1}_pc_4",
                                                                                                                   "PC5": f"away_player_{i + 1}_pc_5"})
        
        
        df_match_player_attr_pcs = df_match_player_attr_pcs.merge(merged_get_first, how = "left", on = "match_api_id")
df_match_player_attr_pcs
match_api_id home_player_1_pc_1 home_player_1_pc_2 home_player_1_pc_3 home_player_1_pc_4 home_player_1_pc_5 home_player_2_pc_1 home_player_2_pc_2 home_player_2_pc_3 home_player_2_pc_4 home_player_2_pc_5 home_player_3_pc_1 home_player_3_pc_2 home_player_3_pc_3 home_player_3_pc_4 home_player_3_pc_5 home_player_4_pc_1 home_player_4_pc_2 home_player_4_pc_3 home_player_4_pc_4 home_player_4_pc_5 home_player_5_pc_1 home_player_5_pc_2 home_player_5_pc_3 home_player_5_pc_4 home_player_5_pc_5 home_player_6_pc_1 home_player_6_pc_2 home_player_6_pc_3 home_player_6_pc_4 home_player_6_pc_5 home_player_7_pc_1 home_player_7_pc_2 home_player_7_pc_3 home_player_7_pc_4 home_player_7_pc_5 home_player_8_pc_1 home_player_8_pc_2 home_player_8_pc_3 home_player_8_pc_4 home_player_8_pc_5 home_player_9_pc_1 home_player_9_pc_2 home_player_9_pc_3 home_player_9_pc_4 home_player_9_pc_5 home_player_10_pc_1 home_player_10_pc_2 home_player_10_pc_3 home_player_10_pc_4 home_player_10_pc_5 home_player_11_pc_1 home_player_11_pc_2 home_player_11_pc_3 home_player_11_pc_4 home_player_11_pc_5 away_player_1_pc_1 away_player_1_pc_2 away_player_1_pc_3 away_player_1_pc_4 away_player_1_pc_5 away_player_2_pc_1 away_player_2_pc_2 away_player_2_pc_3 away_player_2_pc_4 away_player_2_pc_5 away_player_3_pc_1 away_player_3_pc_2 away_player_3_pc_3 away_player_3_pc_4 away_player_3_pc_5 away_player_4_pc_1 away_player_4_pc_2 away_player_4_pc_3 away_player_4_pc_4 away_player_4_pc_5 away_player_5_pc_1 away_player_5_pc_2 away_player_5_pc_3 away_player_5_pc_4 away_player_5_pc_5 away_player_6_pc_1 away_player_6_pc_2 away_player_6_pc_3 away_player_6_pc_4 away_player_6_pc_5 away_player_7_pc_1 away_player_7_pc_2 away_player_7_pc_3 away_player_7_pc_4 away_player_7_pc_5 away_player_8_pc_1 away_player_8_pc_2 away_player_8_pc_3 away_player_8_pc_4 away_player_8_pc_5 away_player_9_pc_1 away_player_9_pc_2 away_player_9_pc_3 away_player_9_pc_4 away_player_9_pc_5 away_player_10_pc_1 away_player_10_pc_2 away_player_10_pc_3 away_player_10_pc_4 away_player_10_pc_5 away_player_11_pc_1 away_player_11_pc_2 away_player_11_pc_3 away_player_11_pc_4 away_player_11_pc_5
0 493017 9.172915 -0.705596 1.028500 -0.044401 1.246272 3.957784 1.650964 -2.348632 -0.837480 0.223750 -0.817702 -0.589548 0.195433 -2.144582 1.524434 3.108730 0.633708 -1.772338 0.807727 1.684577 0.615229 -0.611994 -0.845425 0.395221 3.107710 -0.038702 -0.551509 -0.280096 -0.068082 2.645878 1.086047 -2.583583 -0.596348 -1.915882 -0.987251 -0.845848 -0.053597 0.746034 -0.515174 1.763346 4.244623 1.088030 -2.054898 -0.414998 0.270225 -0.559441 1.233335 0.426201 0.609910 1.364324 1.472274 -0.298277 -1.530989 0.743621 -0.875547 9.794795 -0.549117 1.941560 0.281992 0.521328 -1.886782 1.005850 1.291888 0.172012 1.212622 1.320201 1.065244 -0.406996 -1.803778 0.573483 2.628391 0.940615 -0.609534 -1.586848 0.652795 3.207876 0.685128 -1.480756 -1.499414 0.134771 -1.360311 -2.979972 1.181522 -0.726996 1.161218 -1.735000 -0.323721 0.977427 -1.047350 1.307837 -0.004187 1.646099 0.543917 0.101722 0.374692 -1.836650 -2.551881 1.212316 0.139711 1.155972 -0.916797 -1.202214 0.022668 -0.118531 -0.332637 0.628660 -1.751083 -0.030693 0.418133 -0.206630
1 493025 6.467731 -2.125163 3.092089 -0.930974 1.172527 0.390653 1.341612 0.109198 -0.418330 0.921304 2.673401 1.688787 -0.346764 0.032250 -0.148238 0.544724 0.856618 -0.234054 -0.185530 1.496783 0.349558 -0.322471 0.311433 -0.720916 0.765089 -2.997770 -1.279240 1.492689 -0.174247 1.728035 -1.262416 0.290702 1.097804 0.377514 1.681666 -1.035827 -0.034337 1.133088 0.899750 1.451945 -1.970162 -1.958911 1.072887 0.806278 2.735079 -1.206227 -2.567254 0.401355 -0.240084 1.552955 -0.417405 -0.845298 0.077456 -1.026087 -0.249720 6.746068 -1.225452 5.441467 -1.741143 0.153029 -0.720625 1.362837 0.950063 -0.430052 1.920381 1.321520 2.234319 0.546943 0.392576 1.081937 -1.157135 1.181325 0.901247 0.722505 0.967595 -1.573349 0.366277 1.085719 -0.362088 0.794429 -2.362155 0.960996 2.341796 0.480765 1.235871 -2.036662 1.423269 1.975662 0.456836 0.564672 -1.002934 -2.335553 0.470491 -0.111478 0.585989 -1.407944 -2.205414 1.069296 0.053596 0.809813 -2.259696 -2.778135 1.816538 1.407815 0.022775 0.169010 -2.319129 0.361658 0.935315 -1.877390
2 493027 7.587977 -0.669761 3.351410 -1.725204 0.971832 -0.659142 2.447278 1.853202 -0.962117 0.277773 0.782575 2.059493 1.311728 -0.983126 0.652947 0.750721 3.078327 1.228950 0.540478 0.034199 0.361849 2.339354 1.689313 0.713260 0.455514 -3.759299 1.204148 3.191002 -0.784951 0.913294 -3.018176 0.545416 2.535436 -0.218828 2.658318 -2.589758 0.570906 1.654008 0.345163 0.580780 -3.859427 -1.870653 2.659663 -0.725774 2.171793 -0.199689 -2.415793 -0.317491 0.067180 0.943378 -1.056848 -1.205950 1.067483 1.246078 -0.070047 8.942737 -2.146976 1.394474 -1.573403 -0.305385 0.326027 1.666843 0.694338 0.105792 1.385551 1.716745 1.133649 -0.670094 -1.374712 0.468374 1.822806 1.126330 -0.789073 -1.478142 0.878564 5.021780 0.645976 -1.954515 -1.642732 0.230055 0.882744 1.703690 0.053693 0.563981 0.978446 -0.780143 -0.700812 0.612057 1.416142 2.059364 -0.691788 -2.016431 0.356132 -1.164456 2.251789 1.464444 0.379332 -0.902796 -0.837762 0.211743 1.923084 -2.893726 -2.042800 -1.018973 -1.292926 0.416905 -2.210474 -0.336256 0.054563 2.588626
3 493034 9.172915 -0.705596 1.028500 -0.044401 1.246272 3.957784 1.650964 -2.348632 -0.837480 0.223750 -0.817702 -0.589548 0.195433 -2.144582 1.524434 -0.559441 1.233335 0.426201 0.609910 1.364324 -0.845848 -0.053597 0.746034 -0.515174 1.763346 0.615229 -0.611994 -0.845425 0.395221 3.107710 3.108730 0.633708 -1.772338 0.807727 1.684577 1.086047 -2.583583 -0.596348 -1.915882 -0.987251 4.244623 1.088030 -2.054898 -0.414998 0.270225 -0.105750 0.211866 0.619067 0.766076 -0.979514 1.472274 -0.298277 -1.530989 0.743621 -0.875547 7.587977 -0.669761 3.351410 -1.725204 0.971832 -0.659142 2.447278 1.853202 -0.962117 0.277773 0.361849 2.339354 1.689313 0.713260 0.455514 1.052194 2.080899 0.370792 -0.782729 0.394531 2.077104 1.903504 0.684877 1.471026 -0.492695 -3.759299 1.204148 3.191002 -0.784951 0.913294 -0.631336 -1.917288 0.404582 -1.604514 -0.334234 -3.859427 -1.870653 2.659663 -0.725774 2.171793 -3.018176 0.545416 2.535436 -0.218828 2.658318 -2.589758 0.570906 1.654008 0.345163 0.580780 -1.056848 -1.205950 1.067483 1.246078 -0.070047
4 493040 8.942737 -2.146976 1.394474 -1.573403 -0.305385 0.326027 1.666843 0.694338 0.105792 1.385551 1.831429 1.436093 -0.753915 -1.703966 1.116009 1.716745 1.133649 -0.670094 -1.374712 0.468374 1.822806 1.126330 -0.789073 -1.478142 0.878564 0.882744 1.703690 0.053693 0.563981 0.978446 -0.780143 -0.700812 0.612057 1.416142 2.059364 -0.691788 -2.016431 0.356132 -1.164456 2.251789 0.416905 -2.210474 -0.336256 0.054563 2.588626 1.923084 -2.893726 -2.042800 -1.018973 -1.292926 1.464444 0.379332 -0.902796 -0.837762 0.211743 9.554376 -0.788436 2.277353 0.709577 0.731361 1.970444 1.864556 -0.597320 0.168243 0.246152 4.470705 2.449109 -1.396639 -0.315235 0.008914 1.829497 1.079852 -2.363541 -0.545278 0.775451 4.019384 0.341251 -2.730811 -1.093104 0.247451 0.682845 0.218515 -0.581395 0.951872 2.794600 -0.793495 -0.740710 0.492791 0.151706 2.489086 1.933672 0.891587 -1.109525 -0.792216 -0.402166 -0.203211 -1.625096 -0.009942 0.220891 1.091029 0.344323 -3.281820 -1.047589 -0.870665 0.278254 0.817798 -2.420838 -0.361229 0.689072 -0.850103
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
21369 1992089 10.658392 -1.977648 0.867377 0.480266 1.074043 2.309500 0.931667 -3.860209 0.540601 -0.235396 4.829365 3.813902 -2.190143 2.896975 0.303158 4.363347 3.346856 -2.511394 1.256390 -1.264653 0.693648 2.017825 -1.753765 -2.152817 -0.662817 2.162406 -2.749882 -4.593596 0.434768 0.841622 -0.817444 1.999270 -0.449227 -0.166485 -0.336412 1.345606 0.069802 -3.170566 -0.790618 1.344050 -1.052756 0.131148 -1.406108 -0.920376 1.149814 -0.756354 -0.555587 -0.498005 0.934758 -2.606863 -2.571317 -2.788252 -0.783057 -0.228549 0.825315 11.297455 -1.943311 -0.622608 1.229927 2.987196 0.715893 1.452546 -2.111059 -0.524585 0.158115 5.742015 3.143138 -3.280495 0.901933 -1.055971 3.559549 2.023672 -2.954817 1.214646 0.395441 -0.652392 0.617508 -2.119288 0.420918 -0.312378 -1.179215 0.990786 -0.758331 0.280326 -1.019536 -0.983237 1.033960 -0.714609 -0.027433 0.645499 -0.468185 -2.492838 -2.135542 0.672037 1.175961 0.089223 -1.409898 -2.184536 0.860498 2.273182 -1.604118 -1.071515 -1.719817 -0.597994 0.268359 -2.027482 -0.929774 -0.429955 -0.408179 -0.159691
21370 1992091 12.241704 -2.064268 0.575912 0.027214 0.566339 2.941632 1.263739 -2.564271 -0.317355 0.251567 -0.338978 1.001774 -1.054876 0.023669 0.503527 3.322061 2.191727 -1.431455 -0.935647 -2.106780 0.805775 2.313243 -1.001739 -0.734564 -1.002397 0.167600 0.748910 -2.441020 -0.128875 1.354357 -0.992210 1.486864 -0.837892 -0.283414 0.885329 -0.144093 -1.113188 -1.698684 0.131062 0.388273 -1.074581 -2.141164 -1.372831 0.593159 -0.060973 -1.374814 -2.597658 -1.353337 -0.079045 0.449154 1.613120 -2.543332 -2.596794 1.483082 0.105411 12.290704 -2.430347 -1.144252 -0.197618 0.806754 4.561174 1.116819 -3.853639 -1.449020 -0.008049 0.859240 1.593152 -1.523836 0.182434 0.923206 3.366277 2.096648 -2.560049 1.302320 0.621418 -0.179065 1.542439 -0.883813 -1.534520 -0.929437 -1.274091 1.044816 -0.632610 0.214867 -1.155989 -1.165921 1.062657 -0.462190 -0.128853 0.815702 -0.551649 -3.139936 -1.871850 0.919162 1.101761 -0.895229 -1.357762 -1.203311 0.502165 1.217481 0.817539 -1.879420 -1.866788 1.257348 -2.040793 -2.027482 -0.929774 -0.429955 -0.408179 -0.159691
21371 1992092 12.064639 -2.316174 0.433132 0.848287 2.231699 0.041330 1.067682 -1.528655 1.481193 0.463667 0.484107 2.132478 -2.712916 0.415998 -0.482643 2.368090 1.818320 -1.835697 2.213761 1.021893 2.542915 2.152431 -2.292237 -0.564741 0.029556 -0.661431 1.259634 -1.538893 0.327645 1.036215 -2.276135 -1.075133 -1.057017 0.562307 1.955697 1.429669 1.815169 -2.713671 -1.202667 -0.206876 -2.066585 -1.486135 -2.249610 0.528683 1.248011 2.305110 0.196755 -3.015813 0.884716 1.782990 0.202502 0.363303 -2.203897 0.890204 1.972601 11.605200 -2.180668 0.633318 0.505193 2.121379 0.936419 1.221556 -2.522024 -0.246410 0.123592 0.472920 1.911681 -1.404507 -1.552859 0.102178 2.614211 3.470441 -0.910743 4.107990 0.561911 -0.386817 2.148362 -0.789333 0.490890 0.348217 1.412310 -0.776693 -3.695553 -2.537028 0.654647 0.038726 0.194863 -1.450742 -1.480179 -0.329070 -1.248945 0.135936 -0.373578 -0.008843 1.145546 -1.268382 -2.736194 -2.134065 -2.336878 0.312531 -1.267814 -0.952598 -0.567959 1.417181 -1.031334 -2.777047 -2.958610 -1.015094 -0.426489 1.111898
21372 1992093 12.055222 -2.425069 1.339212 0.274265 0.840987 0.807082 0.384775 -3.089207 -1.255036 0.706752 3.316945 2.584242 -1.555439 -0.624218 -1.494029 0.752059 2.634155 0.029098 -2.048228 -2.345172 -0.477442 0.588966 -1.810286 -0.671132 0.670969 -4.479330 0.957108 1.567778 1.821481 1.887503 -0.304292 2.229022 -0.374285 -0.629550 -0.550464 -2.488337 -2.492236 0.004280 -0.328167 -0.137174 -1.303862 -3.461748 -0.804760 -0.413108 -1.193188 -3.587590 -2.419330 0.831607 2.357133 0.281695 -2.476574 -2.795499 -0.080093 0.062647 -2.011499 12.360092 -1.868190 1.990823 0.546306 0.372803 -1.329084 1.395726 -0.722913 -0.893829 0.215497 1.165437 3.021942 -0.188859 2.311939 0.403286 -4.008986 1.662399 0.332054 0.291516 1.172819 -1.397060 1.400592 -0.140128 -0.156016 -0.683065 -1.079926 0.426619 -0.576034 -2.142184 -0.515036 -0.909491 1.934916 0.160027 0.916388 1.269362 -1.061561 -2.717686 -0.665092 -0.671323 -2.761000 -1.823349 -1.744156 -0.979761 0.332470 0.702492 -1.032879 -1.066695 -0.612696 -0.450766 0.290344 -1.642076 -3.316316 0.011961 0.375835 -3.465042
21373 1992095 11.109830 -2.853322 3.703596 -0.584121 -0.402950 -0.371390 1.595755 -1.118035 -1.088917 -0.072962 0.782687 3.074394 -0.351447 0.339711 -0.876789 0.954087 3.884860 0.323121 -0.869362 -1.514651 -1.424525 1.026468 -0.666682 -0.521699 1.419345 -2.778054 -3.370352 -0.446712 -0.978029 -0.758080 2.408722 1.658858 -2.251337 -0.820091 -0.630519 -2.159654 1.814418 0.256719 0.567236 -0.314540 -2.998560 -3.922779 0.105072 0.225998 1.222075 -1.617306 -2.339904 -0.992463 -1.548240 0.519653 -2.910805 -2.167197 0.101099 1.144445 -0.165881 12.307758 -1.719545 4.261962 0.728169 1.500838 -1.795498 2.597198 0.399761 -0.004946 -0.912142 1.097353 2.822507 -0.720652 -1.295103 -1.490739 -1.267830 3.175089 1.368422 -0.396539 -2.110602 0.679151 1.514007 -0.744474 -2.598616 -1.680172 -2.472498 1.701903 0.506631 -1.382100 -0.412576 -3.875220 1.593144 1.643325 2.150146 2.170000 -1.395402 0.464636 -0.435683 -0.975001 -1.386601 -3.586664 -2.865959 0.395006 1.476579 2.034092 -4.892234 -0.601987 1.354383 0.830095 -0.426868 -3.654878 -2.094599 1.339425 0.285083 -2.245146

21374 rows × 111 columns

  • Save the table.
df_match_player_attr_pcs.to_csv("../data/df_match_player_attr_pcs.csv", index = False)