Spaces:
Running
Running
from nfl_data_py import nfl_data_py as nfl | |
from tqdm import tqdm | |
import numpy as np | |
import pandas as pd | |
pd.set_option('chained_assignment',None) | |
pd.set_option('display.max_columns',None) | |
import os | |
import datetime as dt | |
# Locate the project's Data folder: it sits next to the folder containing this file.
current_directory = os.path.dirname(os.path.abspath(__file__))
parent_directory = os.path.dirname(current_directory)
data_directory = os.path.join(parent_directory, 'Data')

# Read the clock once so `year` and `month` always describe the same instant
# (two separate now() calls could straddle a month/year rollover).
_now = dt.datetime.now()
year = _now.year
month = _now.month

# August through December count toward the season labelled with the current
# calendar year; January through July are attributed to the previous year.
current_season = year if month >= 8 else year - 1
def get_pbp_data(get_seasons=None):
    """
    Pull play-by-play data from nflFastR's Github repo (via nfl_data_py).

    Parameters
    ----------
    get_seasons : list, optional
        Seasons forwarded to ``nfl.import_pbp_data``. When omitted, an empty
        list is forwarded.

    Returns
    -------
    pandas.DataFrame
        The play-by-play frame with one added column, 'TOP_seconds': the
        'drive_time_of_possession' value ("minutes:seconds") converted to
        whole seconds, or 0 where that value is null.
    """
    # A None sentinel replaces the shared mutable `[]` default; calling with
    # no argument still forwards an empty list.
    seasons = [] if get_seasons is None else get_seasons
    pbp = nfl.import_pbp_data(seasons)

    def _clock_to_seconds(clock):
        # Null drive times contribute zero seconds.
        if not pd.notnull(clock):
            return 0
        parts = clock.split(':')
        return int(parts[0]) * 60 + int(parts[1])

    pbp['TOP_seconds'] = pbp['drive_time_of_possession'].apply(_clock_to_seconds)
    return pbp
# How each per-play feature column is collapsed to one row per game.
# Key order matters: after reset_index() the frame's columns are
# game_id, GP, W, L, W_PCT, ... and later code slices columns by position.
_GAME_AGGREGATIONS = {
    'GP': 'mean',
    'W': 'mean',
    'L': 'mean',
    'W_PCT': 'mean',
    'TOP': 'sum',
    'FGA': 'sum',
    'FGM': 'sum',
    'FG_PCT': 'mean',
    'PassTD': 'sum',
    'RushTD': 'sum',
    'PassTD_Allowed': 'sum',
    'RushTD_Allowed': 'sum',
    'PassYds': 'sum',
    'RushYds': 'sum',
    'PassYds_Allowed': 'sum',
    'RushYds_Allowed': 'sum',
    'Fum': 'sum',
    'Fum_Allowed': 'sum',
    'INT': 'sum',
    'INT_Allowed': 'sum',
    'Sacks': 'sum',
    'Sacks_Allowed': 'sum',
    'Penalties': 'sum',
    'FirstDowns': 'sum',
    '3rdDownConverted': 'sum',
    '3rdDownFailed': 'sum',
    '3rdDownAllowed': 'sum',
    '3rdDownDefended': 'sum',
    'PTS': 'mean',
    'PointDiff': 'mean',
}


def _team_game_log(pbp, team_name, season):
    """
    Build one team's game-by-game feature rows for a single season.

    Per-play indicator/yardage columns are derived for `team_name`, collapsed
    to one row per 'game_id' using `_GAME_AGGREGATIONS`, ordered by 'GP', and
    then turned into running figures: 'W' and 'L' become expanding sums, and
    every column from 'W_PCT' onward becomes an expanding mean.

    For any season other than `current_season`, all columns except 'game_id'
    are shifted down one row, so each row carries the figures accumulated
    through the team's previous game (the first row becomes NaN). Rows for
    `current_season` are left unshifted.

    Parameters
    ----------
    pbp : pandas.DataFrame
        Play-by-play data containing the columns referenced below
        (including 'TOP_seconds').
    team_name : str
        Team abbreviation as it appears in 'home_team' / 'away_team'.
    season : int
        Value of the 'season' column to select.

    Returns
    -------
    pandas.DataFrame
        One row per game with the aggregated features plus 'TEAM' and 'Season'.
    """
    plays_in_team_games = (pbp['home_team'] == team_name) | (pbp['away_team'] == team_name)
    # Explicit copy: the feature columns are added to `team`, never to `pbp`.
    team = pbp.loc[plays_in_team_games & (pbp['season'] == season)].copy()

    # create features
    # 'GP' is taken directly from the 'week' column.
    # NOTE(review): week number and games played can differ (e.g. after a week off) -- confirm intent.
    team['GP'] = team['week']
    # 'result' is treated as a home-side margin: positive credits the home team, negative the away team.
    team['W'] = [1 if r>0 and team_name==h else 1 if r<0 and team_name==a else 0 for r,a,h in team[['result','away_team','home_team']].values]
    # 'L' is 1 whenever the team did not win, so a result of 0 also counts here.
    team['L'] = [0 if r>0 and team_name==h else 0 if r<0 and team_name==a else 1 for r,a,h in team[['result','away_team','home_team']].values]
    team['W_PCT'] = team['W']/team['GP']
    team['TOP'] = [t if team_name==p else 0 for t,p in team[['TOP_seconds','posteam']].values]
    team['FGA'] = [1 if team_name==p and f==1 else 0 for p,f in team[['posteam','field_goal_attempt']].values]
    team['FGM'] = [1 if team_name==p and f=='made' else 0 for p,f in team[['posteam','field_goal_result']].values]
    # 0/0 on plays without an attempt yields NaN, which the later 'mean' aggregation skips.
    team['FG_PCT'] = team['FGM']/team['FGA']
    team['PassTD'] = np.where((team['posteam'] == team_name) & (team['pass_touchdown'] == 1), 1, 0)
    team['RushTD'] = np.where((team['posteam'] == team_name) & (team['rush_touchdown'] == 1), 1, 0)
    team['PassTD_Allowed'] = np.where((team['defteam'] == team_name) & (team['pass_touchdown'] == 1), 1, 0)
    team['RushTD_Allowed'] = np.where((team['defteam'] == team_name) & (team['rush_touchdown'] == 1), 1, 0)
    team['PassYds'] = [y if p==team_name else 0 for p,y in team[['posteam','passing_yards']].values]
    team['RushYds'] = [y if p==team_name else 0 for p,y in team[['posteam','rushing_yards']].values]
    team['PassYds_Allowed'] = [y if d==team_name else 0 for d,y in team[['defteam','passing_yards']].values]
    team['RushYds_Allowed'] = [y if d==team_name else 0 for d,y in team[['defteam','rushing_yards']].values]
    team['Fum'] = np.where((team['defteam'] == team_name) & (team['fumble_lost'] == 1), 1, 0)
    team['Fum_Allowed'] = np.where((team['posteam'] == team_name) & (team['fumble_lost'] == 1), 1, 0)
    team['INT'] = np.where((team['defteam'] == team_name) & (team['interception'] == 1), 1, 0)
    team['INT_Allowed'] = np.where((team['posteam'] == team_name) & (team['interception'] == 1), 1, 0)
    team['Sacks'] = np.where((team['defteam'] == team_name) & (team['sack'] == 1), 1, 0)
    team['Sacks_Allowed'] = np.where((team['posteam'] == team_name) & (team['sack'] == 1), 1, 0)
    team['Penalties'] = np.where((team['penalty_team'] == team_name), 1, 0)
    team['FirstDowns'] = [1 if team_name==p and f==1 else 0 for p,f in team[['posteam','first_down']].values]
    team['3rdDownConverted'] = [1 if p==team_name and t==1 else 0 for p,t in team[['posteam','third_down_converted']].values]
    team['3rdDownFailed'] = [1 if p==team_name and t==1 else 0 for p,t in team[['posteam','third_down_failed']].values]
    team['3rdDownAllowed'] = [1 if d==team_name and t==1 else 0 for d,t in team[['defteam','third_down_converted']].values]
    team['3rdDownDefended'] = [1 if d==team_name and t==1 else 0 for d,t in team[['defteam','third_down_failed']].values]
    team['PTS'] = [ap if at==team_name else hp if ht==team_name else None for ht,at,hp,ap in team[['home_team','away_team','home_score','away_score']].values]
    team['PointDiff'] = [r if team_name==h else -r if team_name==a else 0 for r,a,h in team[['result','away_team','home_team']].values]

    # aggregate from play-by-play to game-by-game
    game = team.groupby('game_id').agg(_GAME_AGGREGATIONS).reset_index().sort_values('GP')
    game[['W','L']] = game[['W','L']].expanding().sum()
    # columns[4:] is everything after game_id, GP, W, L.
    game[game.columns[4:]] = game[game.columns[4:]].expanding().mean()
    if season != current_season:
        # Push every column except game_id down one row.
        game[game.columns[1:]] = game[game.columns[1:]].shift()
    game['TEAM'] = team_name
    game['Season'] = season
    return game


def build_gbg_data(get_seasons=None):
    """
    Build a game-by-game dataset to use for prediction models.

    Loads play-by-play data for the requested seasons, builds running
    per-team features for every (season, team) pair, pairs each game's
    home-team row with its away-team row (away columns get a '.Away'
    suffix), attaches 'game_date', and writes CSV files into
    `data_directory`:

    * 'gbg_this_year.csv' -- rows for `current_season`, written only when
      `current_season` is among the requested seasons.
    * 'gbg.csv' -- rows for all other seasons, written unless the request is
      exactly ``[current_season]``.

    Parameters
    ----------
    get_seasons : list, optional
        Seasons to load. When omitted, an empty list is used.

    Returns
    -------
    None
    """
    # A None sentinel replaces the shared mutable `[]` default.
    requested = [] if get_seasons is None else get_seasons

    print('Loading play-by-play data.')
    pbp = get_pbp_data(requested)
    game_date_dict = dict(pbp[['game_id','game_date']].values)
    teams = list(set(pbp['home_team'].unique()) | set(pbp['away_team'].unique()))
    seasons = pbp['season'].unique()

    print('Building game-by-game data.')
    # Collect the per-team frames and concatenate once, instead of growing a
    # DataFrame inside the loop (which re-copies all earlier rows every time).
    frames = []
    for season in seasons:
        print(season)
        for team_name in tqdm(teams):
            frames.append(_team_game_log(pbp, team_name, season))
    data = pd.concat(frames) if frames else pd.DataFrame()

    # separate home and away data and merge
    matchups = pbp[['game_id','home_team','away_team']].drop_duplicates()
    data = data.merge(matchups)
    home = data.loc[data['home_team']==data['TEAM']]
    away = data.loc[data['away_team']==data['TEAM']].add_suffix('.Away')
    gbg = home.merge(away, left_on='game_id', right_on='game_id.Away')
    gbg.drop(columns=['TEAM','TEAM.Away','home_team.Away','away_team.Away','Season.Away','game_id.Away'], inplace=True)
    gbg['game_date'] = gbg['game_id'].map(game_date_dict)

    # Make sure the output folder exists before any CSV is written.
    os.makedirs(data_directory, exist_ok=True)

    # save current data
    if current_season in requested:
        gbg_this_year = gbg.loc[gbg['Season']==current_season]
        gbg_this_year.to_csv(os.path.join(data_directory, 'gbg_this_year.csv'), index=False)

    # save historical data
    if requested != [current_season]:
        historical = gbg.loc[gbg['Season']!=current_season]
        historical.to_csv(os.path.join(data_directory, 'gbg.csv'), index=False)
def _decimal_to_american(decimal_odds):
    """
    Convert decimal odds to rounded American (moneyline) odds.

    Values of 2 or more map to a positive number, smaller values to a
    negative one. A value of exactly 1 divides by zero, as in the original
    inline expression.
    """
    if decimal_odds >= 2:
        return round((decimal_odds-1)*100)
    return round(-100/(decimal_odds-1))


def add_odds_data():
    """
    Get odds from Australian Sports Betting's free online dataset and merge it with game-by-game data.

    Reads 'gbg.csv' and 'gbg_this_year.csv' from `data_directory`, joins each
    to the odds table on a key of game date + home abbreviation + away
    abbreviation, adds outcome labels, and writes 'gbg_and_odds.csv' and
    'gbg_and_odds_this_year.csv' (one row per 'game_id') to the same folder.

    Columns added to the merged output:

    * 'Home Odds' / 'Away Odds' -- closing decimal odds converted to American odds.
    * 'Home Winnings' / 'Away Winnings' -- decimal odds minus 1 if that side
      scored more, -1 if it scored less, 0 if scores are equal.
    * 'Home-Team-Cover' -- 1 if (home - away) > -line, 0 if less, 2 if equal.
    * 'Home-Team-Win' -- 1 if the home score is greater, else 0.
    * 'Over' -- 1 if the combined score exceeds 'Total Score Close', else 0.

    Returns
    -------
    None
    """
    # get team abbreviations
    team_descriptions = nfl.import_team_desc()
    team_abbreviation_dict = dict(team_descriptions[['team_name','team_abbr']].values)

    # get odds
    odds = pd.read_excel('https://www.aussportsbetting.com/historical_data/nfl.xlsx')
    for side in ('Home', 'Away'):
        # Rewrite the two older Washington names to 'Washington Commanders' before mapping to abbreviations.
        names = odds[f'{side} Team'].str.replace('Washington Redskins','Washington Commanders').str.replace('Washington Football Team','Washington Commanders')
        odds[f'{side} Team'] = names
        # The odds key uses 'LA' where the team table has 'LAR'.
        odds[f'{side} Team Abbrev'] = names.map(team_abbreviation_dict).str.replace('LAR','LA')

    # Explicit copy so the columns added below land on an independent frame
    # rather than on a slice of the raw spreadsheet data.
    odds = odds[['Date','Home Score','Away Score','Home Team Abbrev','Away Team Abbrev','Home Odds Close','Away Odds Close','Total Score Close','Home Line Close']].copy()
    odds['Key'] = odds['Date'].astype(str) + odds['Home Team Abbrev'] + odds['Away Team Abbrev']
    # Rows with any missing value (including an unmapped team, which leaves Key null) are dropped.
    odds = odds.drop(columns=['Date','Home Team Abbrev','Away Team Abbrev']).dropna()
    odds['Home Odds'] = [_decimal_to_american(i) for i in odds['Home Odds Close']]
    odds['Away Odds'] = [_decimal_to_american(i) for i in odds['Away Odds Close']]
    odds['Home Winnings'] = [ho-1 if h>a else -1 if a>h else 0 for ho,h,a in odds[['Home Odds Close','Home Score','Away Score']].values]
    odds['Away Winnings'] = [ao-1 if a>h else -1 if h>a else 0 for ao,h,a in odds[['Away Odds Close','Home Score','Away Score']].values]

    # load gbg data (both files are read before anything is written)
    gbg = pd.read_csv(os.path.join(data_directory, 'gbg.csv'))
    gbg_this_year = pd.read_csv(os.path.join(data_directory, 'gbg_this_year.csv'))

    # merge and save
    targets = (
        (gbg, 'gbg_and_odds.csv'),
        (gbg_this_year, 'gbg_and_odds_this_year.csv'),
    )
    for frame, out_name in targets:
        frame['Key'] = frame['game_date'].astype(str) + frame['home_team'] + frame['away_team']
        gbg_and_odds = frame.merge(odds, on='Key')
        gbg_and_odds['Home-Team-Cover'] = [1 if (h-a)>-l else 0 if (h-a)<-l else 2 for h,a,l in gbg_and_odds[['Home Score','Away Score','Home Line Close']].values]
        gbg_and_odds['Home-Team-Win'] = (gbg_and_odds['Home Score']>gbg_and_odds['Away Score']).astype(int)
        gbg_and_odds['Over'] = ((gbg_and_odds['Home Score'] + gbg_and_odds['Away Score'])>gbg_and_odds['Total Score Close']).astype(int)
        file_path = os.path.join(data_directory, out_name)
        gbg_and_odds.drop_duplicates(subset='game_id').to_csv(file_path, index=False)