[attach]145643[/attach]
保存清洗过后的数据
# Write both cleaned tables to CSV, leaving the index column out of the files.
for frame, csv_path in (
    (df_historical_data, 'clean_fifa_worldcup_matches.csv'),
    (df_fixture, 'clean_fifa_worldcup_fixture.csv'),
):
    frame.to_csv(csv_path, index=False)
数据可视化
# Position-related columns mentioned by the author:
# nation_position, club_position, player_positions
df = pd.read_csv('players_22.csv', low_memory=False)
# Keep only the columns used below. The explicit .copy() makes this an
# independent frame, so the column assignment that follows does not raise
# pandas' SettingWithCopyWarning.
df = df[['short_name', 'age', 'nationality_name', 'overall', 'potential',
         'club_name', 'value_eur', 'wage_eur', 'player_positions']].copy()
# Keep only the first listed position of each player
df['player_positions'] = df['player_positions'].str.split(',', expand=True)[0]
# Drop rows that have any missing value
df = df.dropna()
players_missing_worldcup = ['K. Benzema', 'S. Mané', 'S. Agüero', 'Sergio Ramos',
                            'P. Pogba', 'M. Reus', 'Diogo Jota', 'A. Harit',
                            'N. Kanté', 'G. Lo Celso', 'Piqué']
# Remove the players listed above (the original note calls them injured players)
drop_index = df[df['short_name'].isin(players_missing_worldcup)].index
df = df.drop(index=drop_index)
teams_worldcup = [
    'Qatar', 'Brazil', 'Belgium', 'France', 'Argentina', 'England', 'Spain', 'Portugal',
    'Mexico', 'Netherlands', 'Denmark', 'Germany', 'Uruguay', 'Switzerland', 'United States', 'Croatia',
    'Senegal', 'Iran', 'Japan', 'Morocco', 'Serbia', 'Poland', 'South Korea', 'Tunisia',
    'Cameroon', 'Canada', 'Ecuador', 'Saudi Arabia', 'Ghana', 'Wales', 'Costa Rica', 'Australia'
]
# Keep only players whose nationality is one of the listed teams
df = df[df['nationality_name'].isin(teams_worldcup)]
# Best players first: reassign instead of sorting in place, because df is now a
# boolean-mask selection and in-place mutation of it would warn as well.
df = df.sort_values(by=['overall', 'potential', 'value_eur'], ascending=False)
球员分布
import numpy as np
# Histogram of the 'overall' rating using one-unit-wide bins.
fig, ax = plt.subplots(figsize=(12, 5), tight_layout=True)
sns.histplot(df, x='overall', binwidth=1)
# np.arange excludes its stop value, so go one past the maximum to give the
# highest rating its own tick as well.
bins = np.arange(df['overall'].min(), df['overall'].max() + 1, 1)
plt.xticks(bins)
plt.show()
[attach]145644[/attach]
世界杯梦之队球员
# One row per position: only the first occurrence of each position is kept,
# which is the top-ranked player when df is sorted best-first.
df.drop_duplicates(subset='player_positions', keep='first')
[attach]145645[/attach]
每个国家队中最有技能的球员
# Keep the first row of every nationality; drop_duplicates already returns a
# new frame, so no separate copy of df is needed.
df_best_players = df.drop_duplicates('nationality_name').reset_index(drop=True)
# Three-letter upper-case tag built from the start of the country name.
# The pattern is a raw string so that \w is not read as an (invalid) string escape.
country_short = df_best_players['nationality_name'].str.extract(r'(^\w{3})', expand=False).str.upper()
# Axis label in the form "Player (TAG)"
df_best_players['name_nationality'] = df_best_players['short_name'] +' (' + country_short + ')'
fig, ax = plt.subplots(figsize=(10, 6), tight_layout=True)
sns.barplot(df_best_players, x='overall', y='name_nationality',
            palette=sns.color_palette('pastel'), width=0.5)
plt.show()
[attach]145646[/attach]
每支球队的最佳阵容
def best_squad(nationality):
    """Return the top two players per position for one national team.

    Reads the module-level ``df`` and relies on its existing row order: the
    first two rows of every position are kept, so ``df`` should already be
    sorted best-first.

    Args:
        nationality: Value matched exactly against ``nationality_name``.

    Returns:
        A new DataFrame with at most two rows per ``player_positions`` value,
        ordered by position, overall and potential, all descending. It is
        empty when no row matches ``nationality``.
    """
    # Narrow to the requested team first so that only its rows are grouped,
    # instead of copying and grouping the whole table on every call.
    df_team = df[df['nationality_name'] == nationality]
    df_best_squad = df_team.groupby('player_positions').head(2)
    return df_best_squad.sort_values(
        ['player_positions', 'overall', 'potential'], ascending=False)
# Example: show the selected squad for Brazil.
best_squad(nationality='Brazil')
[attach]145647[/attach]
# Mean 'overall' rating of every listed team's best squad, in list order.
average_overall = [
    best_squad(team)['overall'].mean()
    for team in teams_worldcup
]
# Build the table, drop teams without a computed average, and rank the rest
# from highest to lowest.
df_average_overall = (
    pd.DataFrame({'Teams': teams_worldcup, 'AVG_Overall': average_overall})
    .dropna()
    .sort_values('AVG_Overall', ascending=False)
)
df_average_overall
[attach]145648[/attach]
# Bar chart of the first ten rows of the average-rating table.
top_ten = df_average_overall.head(10)
fig, ax = plt.subplots(figsize=(12, 5), tight_layout=True)
sns.barplot(
    top_ten,
    x='Teams',
    y='AVG_Overall',
    palette=sns.color_palette('pastel'),
)
plt.show()
[attach]145649[/attach]