marci / Source /Build /build.py
BraydenMoore's picture
uhhh
4774d56
raw
history blame
10.9 kB
from nfl_data_py import nfl_data_py as nfl
from tqdm import tqdm
import numpy as np
import pandas as pd
pd.set_option('chained_assignment',None)
pd.set_option('display.max_columns',None)
import os
import datetime as dt
# Locate the Data folder that sits next to this file's own folder:
# <parent of this file's directory>/Data
current_directory = os.path.dirname(os.path.abspath(__file__))
parent_directory = os.path.dirname(current_directory)
data_directory = os.path.join(parent_directory, 'Data')

# Read the clock once so that year and month always describe the same instant
# (two separate now() calls could straddle a month/year boundary).
_now = dt.datetime.now()
year = _now.year
month = _now.month

# August through December count towards the season named after the current
# calendar year; January through July still belong to the previous year's season.
current_season = year if month >= 8 else year - 1
def _possession_to_seconds(clock):
    """
    Convert a 'minutes:seconds' possession string to whole seconds.

    Missing values (anything pandas treats as null) count as 0 seconds.
    """
    if pd.isnull(clock):
        return 0
    parts = clock.split(':')
    return int(parts[0]) * 60 + int(parts[1])


def get_pbp_data(get_seasons=None):
    """
    Pull data from nflFastR's Github repo.

    Parameters
    ----------
    get_seasons : list, optional
        Seasons to load; passed straight to ``nfl.import_pbp_data``.
        ``None`` is treated as an empty list.

    Returns
    -------
    pandas.DataFrame
        The play-by-play frame returned by ``nfl.import_pbp_data`` with one
        extra column, ``TOP_seconds``: ``drive_time_of_possession`` converted
        from 'minutes:seconds' to seconds (0 where the value is missing).
    """
    # Avoid a shared mutable default; an omitted argument still means "no seasons".
    if get_seasons is None:
        get_seasons = []

    pbp = nfl.import_pbp_data(get_seasons)
    pbp['TOP_seconds'] = pbp['drive_time_of_possession'].apply(_possession_to_seconds)
    return pbp
def build_gbg_data(get_seasons=None):
    """
    Build a game-by-game dataset to use for prediction models.

    For every (season, team) pair the play-by-play rows are turned into
    per-play indicator columns, aggregated to one row per game, and then
    converted into running (expanding) season-to-date statistics. Home and
    away rows of the same game are finally joined side by side, with the
    away team's columns suffixed ``.Away``.

    Parameters
    ----------
    get_seasons : list, optional
        Seasons to load via ``get_pbp_data``. ``None`` is treated as an
        empty list.

    Returns
    -------
    None
        Results are written to CSV files inside ``data_directory``:
        ``gbg_this_year.csv`` (rows of ``current_season``, only when that
        season was requested) and ``gbg.csv`` (all other seasons, skipped
        when only ``current_season`` was requested).
    """
    # Avoid a shared mutable default; an omitted argument still means "no seasons".
    if get_seasons is None:
        get_seasons = []

    print('Loading play-by-play data.')
    pbp = get_pbp_data(get_seasons)
    game_date_dict = dict(pbp[['game_id','game_date']].values)
    teams = list(set(list(pbp['home_team'].unique()) + list(pbp['away_team'].unique())))
    seasons = pbp['season'].unique()

    # How each per-play column collapses to a single per-game value.
    # The key order matters: it fixes the column order of the aggregated
    # frame, which the positional slices further down rely on.
    features = {
        'GP':'mean',
        'W':'mean',
        'L':'mean',
        'W_PCT':'mean',
        'TOP':'sum',
        'FGA':'sum',
        'FGM':'sum',
        'FG_PCT':'mean',
        'PassTD':'sum',
        'RushTD':'sum',
        'PassTD_Allowed':'sum',
        'RushTD_Allowed':'sum',
        'PassYds':'sum',
        'RushYds':'sum',
        'PassYds_Allowed':'sum',
        'RushYds_Allowed':'sum',
        'Fum':'sum',
        'Fum_Allowed':'sum',
        'INT':'sum',
        'INT_Allowed':'sum',
        'Sacks':'sum',
        'Sacks_Allowed':'sum',
        'Penalties':'sum',
        'FirstDowns':'sum',
        '3rdDownConverted':'sum',
        '3rdDownFailed':'sum',
        '3rdDownAllowed':'sum',
        '3rdDownDefended':'sum',
        'PTS':'mean',
        'PointDiff':'mean'
    }

    print('Building game-by-game data.')
    # Collect the per-team frames and concatenate once at the end instead of
    # re-copying the growing result on every iteration.
    team_frames = []
    for season in seasons:
        print(season)
        for team_name in tqdm(teams):
            team = pbp.loc[((pbp['home_team']==team_name) | (pbp['away_team']==team_name)) & (pbp['season']==season)]
            # `teams` spans every loaded season, so a team may have no plays
            # in this one; it would contribute zero rows, so skip the work.
            if team.empty:
                continue

            # create features (one value per play; aggregated per game below)
            team['GP'] = team['week']
            # `result` is compared from the home side: > 0 is credited to the home team,
            # < 0 to the away team. Everything that is not a win (ties included) counts as L.
            team['W'] = [1 if r>0 and team_name==h else 1 if r<0 and team_name==a else 0 for r,a,h in team[['result','away_team','home_team']].values]
            team['L'] = [0 if r>0 and team_name==h else 0 if r<0 and team_name==a else 1 for r,a,h in team[['result','away_team','home_team']].values]
            team['W_PCT'] = team['W']/team['GP']
            team['TOP'] = [t if team_name==p else 0 for t,p in team[['TOP_seconds','posteam']].values]
            team['FGA'] = [1 if team_name==p and f==1 else 0 for p,f in team[['posteam','field_goal_attempt']].values]
            team['FGM'] = [1 if team_name==p and f=='made' else 0 for p,f in team[['posteam','field_goal_result']].values]
            # 0/0 on plays without an attempt yields NaN, which the per-game mean ignores.
            team['FG_PCT'] = team['FGM']/team['FGA']
            team['PassTD'] = np.where((team['posteam'] == team_name) & (team['pass_touchdown'] == 1), 1, 0)
            team['RushTD'] = np.where((team['posteam'] == team_name) & (team['rush_touchdown'] == 1), 1, 0)
            team['PassTD_Allowed'] = np.where((team['defteam'] == team_name) & (team['pass_touchdown'] == 1), 1, 0)
            team['RushTD_Allowed'] = np.where((team['defteam'] == team_name) & (team['rush_touchdown'] == 1), 1, 0)
            team['PassYds'] = [y if p==team_name else 0 for p,y in team[['posteam','passing_yards']].values]
            team['RushYds'] = [y if p==team_name else 0 for p,y in team[['posteam','rushing_yards']].values]
            team['PassYds_Allowed'] = [y if d==team_name else 0 for d,y in team[['defteam','passing_yards']].values]
            team['RushYds_Allowed'] = [y if d==team_name else 0 for d,y in team[['defteam','rushing_yards']].values]
            # Turnovers and sacks without a suffix are the ones this team forced
            # (team on defense); the *_Allowed variants are the ones it gave up.
            team['Fum'] = np.where((team['defteam'] == team_name) & (team['fumble_lost'] == 1), 1, 0)
            team['Fum_Allowed'] = np.where((team['posteam'] == team_name) & (team['fumble_lost'] == 1), 1, 0)
            team['INT'] = np.where((team['defteam'] == team_name) & (team['interception'] == 1), 1, 0)
            team['INT_Allowed'] = np.where((team['posteam'] == team_name) & (team['interception'] == 1), 1, 0)
            team['Sacks'] = np.where((team['defteam'] == team_name) & (team['sack'] == 1), 1, 0)
            team['Sacks_Allowed'] = np.where((team['posteam'] == team_name) & (team['sack'] == 1), 1, 0)
            team['Penalties'] = np.where((team['penalty_team'] == team_name), 1, 0)
            team['FirstDowns'] = [1 if team_name==p and f==1 else 0 for p,f in team[['posteam','first_down']].values]
            team['3rdDownConverted'] = [1 if p==team_name and t==1 else 0 for p,t in team[['posteam','third_down_converted']].values]
            team['3rdDownFailed'] = [1 if p==team_name and t==1 else 0 for p,t in team[['posteam','third_down_failed']].values]
            team['3rdDownAllowed'] = [1 if d==team_name and t==1 else 0 for d,t in team[['defteam','third_down_converted']].values]
            team['3rdDownDefended'] = [1 if d==team_name and t==1 else 0 for d,t in team[['defteam','third_down_failed']].values]
            team['PTS'] = [ap if at==team_name else hp if ht==team_name else None for ht,at,hp,ap in team[['home_team','away_team','home_score','away_score']].values]
            team['PointDiff'] = [r if team_name==h else -r if team_name==a else 0 for r,a,h in team[['result','away_team','home_team']].values]

            # aggregate from play-by-play to game-by-game, ordered by week
            game = team.groupby('game_id').agg(features).reset_index().sort_values('GP')

            # Columns are now [game_id, GP, W, L, W_PCT, ...]:
            # W and L become running totals, everything from W_PCT onward a running average.
            game[['W','L']] = game[['W','L']].expanding().sum()
            game[game.columns[4:]] = game[game.columns[4:]].expanding().mean()

            # For past seasons every column except game_id is moved down one row, so a
            # game's row holds the values accumulated through the team's previous game.
            # The current season is left unshifted.
            if season != current_season:
                game[game.columns[1:]] = game[game.columns[1:]].shift()

            game['TEAM'] = team_name
            game['Season'] = season
            team_frames.append(game)

    data = pd.concat(team_frames) if team_frames else pd.DataFrame()

    # separate home and away data and merge
    data = data.merge(pbp[['game_id','home_team','away_team']].drop_duplicates())
    home = data.loc[data['home_team']==data['TEAM']]
    away = data.loc[data['away_team']==data['TEAM']].add_suffix('.Away')
    gbg = home.merge(away,left_on='game_id',right_on='game_id.Away')
    gbg.drop(columns=['TEAM','TEAM.Away','home_team.Away','away_team.Away','Season.Away','game_id.Away'], inplace=True)
    gbg['game_date'] = gbg['game_id'].map(game_date_dict)

    # Make sure the output folder exists before writing into it.
    os.makedirs(data_directory, exist_ok=True)

    # save current data
    if current_season in get_seasons:
        gbg_this_year = gbg.loc[gbg['Season']==current_season]
        file_path = os.path.join(data_directory, 'gbg_this_year.csv')
        gbg_this_year.to_csv(file_path, index=False)

    # save historical data (skipped when only the current season was requested,
    # so an existing gbg.csv is not overwritten with an empty frame)
    if get_seasons != [current_season]:
        gbg = gbg.loc[gbg['Season']!=current_season]
        file_path = os.path.join(data_directory, 'gbg.csv')
        gbg.to_csv(file_path, index=False)
def _decimal_to_american(decimal_odds):
    """
    Convert decimal odds to rounded American (moneyline) odds.

    Decimal odds of 2 or more map to a positive line, anything below 2 to a
    negative line.
    """
    # NOTE(review): decimal odds of exactly 1 would divide by zero here, as in
    # the formula this helper replaces - confirm the feed never contains them.
    if decimal_odds >= 2:
        return round((decimal_odds - 1) * 100)
    return round(-100 / (decimal_odds - 1))


def add_odds_data():
    """
    Get odds from Australian Sports Betting's free online dataset and merge it with game-by-game data.

    Reads ``gbg.csv`` and ``gbg_this_year.csv`` from ``data_directory``, joins
    each to the odds sheet on a key made of game date + home abbreviation +
    away abbreviation, adds the outcome columns ``Home-Team-Cover``,
    ``Home-Team-Win`` and ``Over``, and writes ``gbg_and_odds.csv`` and
    ``gbg_and_odds_this_year.csv`` back to ``data_directory``.

    Returns
    -------
    None
    """
    # get team abbreviations
    team_descriptions = nfl.import_team_desc()
    team_abbreviation_dict = dict(team_descriptions[['team_name','team_abbr']].values)

    # get odds
    odds = pd.read_excel('https://www.aussportsbetting.com/historical_data/nfl.xlsx')
    for side in ('Home', 'Away'):
        # Fold Washington's earlier names into the current one before the
        # name -> abbreviation lookup.
        names = odds[f'{side} Team']
        for former_name in ('Washington Redskins', 'Washington Football Team'):
            names = names.str.replace(former_name, 'Washington Commanders', regex=False)
        odds[f'{side} Team'] = names
        # The game-by-game keys are matched with 'LA' rather than 'LAR'.
        odds[f'{side} Team Abbrev'] = names.map(team_abbreviation_dict).str.replace('LAR', 'LA', regex=False)

    # .copy() so the column assignments below act on an independent frame
    odds = odds[['Date','Home Score','Away Score','Home Team Abbrev','Away Team Abbrev','Home Odds Close','Away Odds Close','Total Score Close','Home Line Close']].copy()
    odds['Key'] = odds['Date'].astype(str) + odds['Home Team Abbrev'] + odds['Away Team Abbrev']
    # Rows missing any remaining value (including an unmapped team, which
    # leaves Key null) are dropped.
    odds = odds.drop(columns=['Date','Home Team Abbrev','Away Team Abbrev']).dropna()
    odds['Home Odds'] = [_decimal_to_american(i) for i in odds['Home Odds Close']]
    odds['Away Odds'] = [_decimal_to_american(i) for i in odds['Away Odds Close']]
    # Profit on a one-unit bet: decimal odds minus the stake on a win,
    # -1 on a loss, 0 when the scores are level.
    odds['Home Winnings'] = [ho-1 if h>a else -1 if a>h else 0 for ho,h,a in odds[['Home Odds Close','Home Score','Away Score']].values]
    odds['Away Winnings'] = [ao-1 if a>h else -1 if h>a else 0 for ao,h,a in odds[['Away Odds Close','Home Score','Away Score']].values]

    # load gbg data (both files are read up front, before anything is written)
    gbg = pd.read_csv(os.path.join(data_directory, 'gbg.csv'))
    gbg_this_year = pd.read_csv(os.path.join(data_directory, 'gbg_this_year.csv'))

    # merge and save
    targets = ((gbg, 'gbg_and_odds.csv'), (gbg_this_year, 'gbg_and_odds_this_year.csv'))
    for games, output_name in targets:
        games['Key'] = games['game_date'].astype(str) + games['home_team'] + games['away_team']
        gbg_and_odds = games.merge(odds, on='Key')
        # 1 = home margin beats the closing line, 0 = it falls short, 2 = exactly on the line
        gbg_and_odds['Home-Team-Cover'] = [1 if (h-a)>-l else 0 if (h-a)<-l else 2 for h,a,l in gbg_and_odds[['Home Score','Away Score','Home Line Close']].values]
        gbg_and_odds['Home-Team-Win'] = (gbg_and_odds['Home Score']>gbg_and_odds['Away Score']).astype(int)
        # Strictly over the closing total; landing exactly on it counts as 0.
        gbg_and_odds['Over'] = ((gbg_and_odds['Home Score'] + gbg_and_odds['Away Score'])>gbg_and_odds['Total Score Close']).astype(int)
        file_path = os.path.join(data_directory, output_name)
        gbg_and_odds.drop_duplicates(subset='game_id').to_csv(file_path, index=False)