2 - 6) EDA - Team information

12 minute read

import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt 
import seaborn as sns 
import missingno as msno 
from utils import my_histogram

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

df_team = pd.read_csv("../data/df_team.csv")

df_team

	team_api_id	team_long_name	league_name	team_measured_date	buildUpPlaySpeed	buildUpPlaySpeedClass	buildUpPlayDribbling	buildUpPlayDribblingClass	buildUpPlayPassing	buildUpPlayPassingClass	buildUpPlayPositioningClass	chanceCreationPassing	chanceCreationPassingClass	chanceCreationCrossing	chanceCreationCrossingClass	chanceCreationShooting	chanceCreationShootingClass	chanceCreationPositioningClass	defencePressure	defencePressureClass	defenceAggression	defenceAggressionClass	defenceTeamWidth	defenceTeamWidthClass	defenceDefenderLineClass
0	9930	FC Aarau	Switzerland Super League	2010-02-22	60	Balanced	NaN	Little	50	Mixed	Organised	60	Normal	65	Normal	55	Normal	Organised	50	Medium	55	Press	45	Normal	Cover
1	9930	FC Aarau	Switzerland Super League	2014-09-19	52	Balanced	48.0	Normal	56	Mixed	Organised	54	Normal	63	Normal	64	Normal	Organised	47	Medium	44	Press	54	Normal	Cover
2	9930	FC Aarau	Switzerland Super League	2015-09-10	47	Balanced	41.0	Normal	54	Mixed	Organised	54	Normal	63	Normal	64	Normal	Organised	47	Medium	44	Press	54	Normal	Cover
3	8485	Aberdeen	Scotland Premier League	2010-02-22	70	Fast	NaN	Little	70	Long	Organised	70	Risky	70	Lots	70	Lots	Organised	60	Medium	70	Double	70	Wide	Cover
4	8485	Aberdeen	Scotland Premier League	2011-02-22	47	Balanced	NaN	Little	52	Mixed	Organised	53	Normal	48	Normal	52	Normal	Organised	47	Medium	47	Press	52	Normal	Cover
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1453	10000	SV Zulte-Waregem	Belgium Jupiler League	2011-02-22	52	Balanced	NaN	Little	52	Mixed	Organised	52	Normal	48	Normal	53	Normal	Organised	46	Medium	48	Press	53	Normal	Cover
1454	10000	SV Zulte-Waregem	Belgium Jupiler League	2012-02-22	54	Balanced	NaN	Little	51	Mixed	Organised	47	Normal	52	Normal	50	Normal	Organised	44	Medium	55	Press	53	Normal	Cover
1455	10000	SV Zulte-Waregem	Belgium Jupiler League	2013-09-20	54	Balanced	NaN	Little	51	Mixed	Organised	47	Normal	52	Normal	32	Little	Organised	44	Medium	58	Press	37	Normal	Cover
1456	10000	SV Zulte-Waregem	Belgium Jupiler League	2014-09-19	54	Balanced	42.0	Normal	51	Mixed	Organised	47	Normal	52	Normal	32	Little	Organised	44	Medium	58	Press	37	Normal	Cover
1457	10000	SV Zulte-Waregem	Belgium Jupiler League	2015-09-10	54	Balanced	42.0	Normal	51	Mixed	Organised	47	Normal	52	Normal	32	Little	Organised	44	Medium	58	Press	37	Normal	Cover

1458 rows × 25 columns

team_num_attr = ["team_api_id", "team_measured_date", "buildUpPlaySpeed", "buildUpPlayDribbling", "buildUpPlayPassing", 
                 "chanceCreationPassing", "chanceCreationCrossing", "chanceCreationShooting", "defencePressure", "defenceAggression",
                 "defenceTeamWidth"]

df_team_num_attr = df_team[team_num_attr]

df_team_num_attr

	team_api_id	team_measured_date	buildUpPlaySpeed	buildUpPlayDribbling	buildUpPlayPassing	chanceCreationPassing	chanceCreationCrossing	chanceCreationShooting	defencePressure	defenceAggression	defenceTeamWidth
0	9930	2010-02-22	60	NaN	50	60	65	55	50	55	45
1	9930	2014-09-19	52	48.0	56	54	63	64	47	44	54
2	9930	2015-09-10	47	41.0	54	54	63	64	47	44	54
3	8485	2010-02-22	70	NaN	70	70	70	70	60	70	70
4	8485	2011-02-22	47	NaN	52	53	48	52	47	47	52
...	...	...	...	...	...	...	...	...	...	...	...
1453	10000	2011-02-22	52	NaN	52	52	48	53	46	48	53
1454	10000	2012-02-22	54	NaN	51	47	52	50	44	55	53
1455	10000	2013-09-20	54	NaN	51	47	52	32	44	58	37
1456	10000	2014-09-19	54	42.0	51	47	52	32	44	58	37
1457	10000	2015-09-10	54	42.0	51	47	52	32	44	58	37

1458 rows × 11 columns

df_team_num_attr.isna().sum()

team_api_id                 0
team_measured_date          0
buildUpPlaySpeed            0
buildUpPlayDribbling      969
buildUpPlayPassing          0
chanceCreationPassing       0
chanceCreationCrossing      0
chanceCreationShooting      0
defencePressure             0
defenceAggression           0
defenceTeamWidth            0
dtype: int64

Since there are missing values in only the buildUpPlayDribbling column, let’s just drop the column.

df_team_num_attr.drop("buildUpPlayDribbling", axis = 1, inplace = True)

Since there are only 2 teams in each match, there will be total 2 teams x 10 team attributes = 20 columns for each match.
So the original team attributes columns will be used for modeling unlike the player attribute columns that will be used after dimensionality reduction by PCA.

team_num_collected = pd.DataFrame(df_team_num_attr.groupby("team_api_id").count().team_measured_date)
team_num_collected.columns = ["num_collected"]
team_num_collected

	num_collected
team_api_id
1601	6
1773	3
1957	6
2033	5
2182	6
...	...
10281	6
108893	6
158085	2
208931	2
274581	1

288 rows × 1 columns

team_num_collected.describe()

	num_collected
count	288.000000
mean	5.062500
std	1.540255
min	1.000000
25%	5.000000
50%	6.000000
75%	6.000000
max	6.000000

my_histogram(team_num_collected, "num_collected")

png

For most teams, team attributes were collected 5 to 6 times.
So, we will use team attributes collected closest to match dates.

df_match_basic = pd.read_csv("../data/df_match_basic.csv")

df_match_team_num_attr = df_match_basic[["match_api_id", "match_date", "home_team_api_id", "away_team_api_id"]]

home_team_info = df_match_team_num_attr.merge(df_team_num_attr, how = "left", left_on = "home_team_api_id", right_on = "team_api_id") \
                                       .sort_values(["team_api_id", "match_date", "team_measured_date"])

home_team_info["date_diff"] = (pd.to_datetime(home_team_info.team_measured_date) - pd.to_datetime(home_team_info.match_date)).dt.days
home_team_info = home_team_info[home_team_info.date_diff < 0]

home_team_info = home_team_info.sort_values(["team_api_id", "match_date", "date_diff"], ascending = [True, True, False]).groupby(["team_api_id", "match_date"]).first()

home_team_info = home_team_info.drop(["home_team_api_id", "away_team_api_id", "team_measured_date", "date_diff"], axis = 1) \
                    .rename(columns = {"buildUpPlaySpeed": "home_team_buildUpPlaySpeed", "buildUpPlayPassing": "home_team_buildUpPlayPassing",
                                       "chanceCreationPassing": "home_team_chanceCreationPassing", "chanceCreationCrossing": "home_team_chanceCreationCrossing",
                                       "chanceCreationShooting": "home_team_chanceCreationShooting", "defencePressure": "home_team_defencePressure", 
                                       "defenceAggression": "home_team_defenceAggression", "defenceTeamWidth": "home_team_defenceTeamWidth"})

away_team_info = df_match_team_num_attr.merge(df_team_num_attr, how = "left", left_on = "away_team_api_id", right_on = "team_api_id") \
                                       .sort_values(["team_api_id", "match_date", "team_measured_date"])

away_team_info["date_diff"] = (pd.to_datetime(away_team_info.team_measured_date) - pd.to_datetime(away_team_info.match_date)).dt.days
away_team_info = away_team_info[away_team_info.date_diff < 0]

away_team_info = away_team_info.sort_values(["team_api_id", "match_date", "date_diff"], ascending = [True, True, False]).groupby(["team_api_id", "match_date"]).first()

away_team_info = away_team_info.drop(["home_team_api_id", "away_team_api_id", "team_measured_date", "date_diff"], axis = 1) \
                    .rename(columns = {"buildUpPlaySpeed": "away_team_buildUpPlaySpeed", "buildUpPlayPassing": "away_team_buildUpPlayPassing",
                                       "chanceCreationPassing": "away_team_chanceCreationPassing", "chanceCreationCrossing": "away_team_chanceCreationCrossing",
                                       "chanceCreationShooting": "away_team_chanceCreationShooting", "defencePressure": "away_team_defencePressure", 
                                       "defenceAggression": "away_team_defenceAggression", "defenceTeamWidth": "away_team_defenceTeamWidth"})

home_team_info

		match_api_id	home_team_buildUpPlaySpeed	home_team_buildUpPlayPassing	home_team_chanceCreationPassing	home_team_chanceCreationCrossing	home_team_chanceCreationShooting	home_team_defencePressure	home_team_defenceAggression	home_team_defenceTeamWidth
team_api_id	match_date
1601.0	2010-03-06	674613	30.0	40.0	50.0	35.0	70.0	65.0	60.0	50.0
	2010-03-19	674631	30.0	40.0	50.0	35.0	70.0	65.0	60.0	50.0
	2010-04-03	674641	30.0	40.0	50.0	35.0	70.0	65.0	60.0	50.0
	2010-04-20	674839	30.0	40.0	50.0	35.0	70.0	65.0	60.0	50.0
	2010-04-28	674854	30.0	40.0	50.0	35.0	70.0	65.0	60.0	50.0
...	...	...	...	...	...	...	...	...	...	...
274581.0	2015-12-22	1979948	50.0	50.0	50.0	50.0	50.0	45.0	45.0	50.0
	2016-01-23	1980004	50.0	50.0	50.0	50.0	50.0	45.0	45.0	50.0
	2016-02-06	1980020	50.0	50.0	50.0	50.0	50.0	45.0	45.0	50.0
	2016-02-20	1980036	50.0	50.0	50.0	50.0	50.0	45.0	45.0	50.0
	2016-03-05	1980052	50.0	50.0	50.0	50.0	50.0	45.0	45.0	50.0

19913 rows × 9 columns

df_match_team_num_attr = df_match_team_num_attr.merge(home_team_info, how = "left", on = "match_api_id") \
                                               .merge(away_team_info, how = "left", on = "match_api_id") \
                                               .drop(["match_date", "home_team_api_id", "away_team_api_id"], axis = 1)

df_match_team_num_attr

	match_api_id	home_team_buildUpPlaySpeed	home_team_buildUpPlayPassing	home_team_chanceCreationPassing	home_team_chanceCreationCrossing	home_team_chanceCreationShooting	home_team_defencePressure	home_team_defenceAggression	home_team_defenceTeamWidth	away_team_buildUpPlaySpeed	away_team_buildUpPlayPassing	away_team_chanceCreationPassing	away_team_chanceCreationCrossing	away_team_chanceCreationShooting	away_team_defencePressure	away_team_defenceAggression	away_team_defenceTeamWidth
0	492473	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	492474	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	492475	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	492476	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	492477	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
25974	1992091	52.0	48.0	39.0	39.0	37.0	40.0	36.0	52.0	58.0	35.0	64.0	36.0	66.0	47.0	45.0	53.0
25975	1992092	53.0	56.0	38.0	53.0	46.0	42.0	33.0	58.0	63.0	51.0	54.0	40.0	66.0	48.0	53.0	50.0
25976	1992093	50.0	54.0	52.0	54.0	46.0	36.0	43.0	47.0	61.0	37.0	62.0	50.0	53.0	45.0	47.0	54.0
25977	1992094	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	62.0	46.0	47.0	50.0	54.0	47.0	43.0	56.0
25978	1992095	52.0	64.0	39.0	66.0	46.0	44.0	34.0	50.0	61.0	46.0	66.0	66.0	54.0	47.0	58.0	53.0

25979 rows × 17 columns

Save the table.

df_match_team_num_attr.to_csv("../data/df_match_team_num_attr.csv", index = False)

Youngjun Woo