Spaces:
Running
Running
| from bs4 import BeautifulSoup | |
| import datetime | |
| from multiprocessing import Pool | |
| import numpy as np | |
| import pandas as pd | |
| from pydantic import BaseModel, Field | |
| import requests | |
| from typing import Optional | |
| from urllib.parse import urljoin | |
| from domain.teams import ALL_TEAMS, NFLTeam | |
# Toggle for scraping all teams in parallel via multiprocessing.Pool.
MULTIPROCESSING_ENABLED = False

# Day-of-week name (abbreviated and full forms) -> 0-based index within
# the practice week (Monday = 0).
PRACTICE_WEEK = {
    "Mon": 0,
    "Tue": 1,
    "Wed": 2,
    "Thu": 3,
    "Fri": 4,
    "Sat": 5,
    "Sun": 6,
    "Monday": 0,
    "Tuesday": 1,
    "Wednesday": 2,
    "Thursday": 3,
    "Friday": 4,
    "Saturday": 5,
    "Sunday": 6,
}

# Full day name -> three-letter abbreviation matching the
# PracticeReportRawRow field names.
DAY_OF_WEEK_STRING_MAPPING = {
    "Monday": "Mon",
    "Tuesday": "Tue",
    "Wednesday": "Wed",
    "Thursday": "Thu",
    "Friday": "Fri",
    "Saturday": "Sat",
    "Sunday": "Sun",
}

# Start of NFL 2025 week 1; used to derive the current regular-season week.
WEEK_1_BEGIN_DATE = datetime.datetime(2025, 9, 1)
# NOTE(review): naive local time compared against a naive date — assumes the
# scraper runs in a timezone where this comparison is valid; confirm if the
# job is deployed elsewhere.
CURRENT_DATE = datetime.datetime.now()
# 1-based week number, clamped to at least 1 before the season starts.
CURRENT_WEEK = max(1, int(1 + (CURRENT_DATE - WEEK_1_BEGIN_DATE).days / 7))
CURRENT_SEASON = 2025
class PracticeReportRawRow(BaseModel):
    """One validated row of a team's NFL practice/injury report table.

    Day-of-week columns are optional because a report only contains the
    days published so far in the practice week.
    """

    Team: str
    Player: str
    Position: str
    Injury: str
    Sun: Optional[str] = None
    Mon: Optional[str] = None
    Tue: Optional[str] = None
    Wed: Optional[str] = None
    Thu: Optional[str] = None
    Fri: Optional[str] = None
    Sat: Optional[str] = None
    # Source table header is "Game Status"; aliased to a valid identifier.
    game_status: str = Field(alias="Game Status")

    # BUGFIX: was an instance method (self, value) but is invoked as
    # cls.replace_nan(v) with a single argument, which bound v to `self`
    # and raised TypeError. A staticmethod works from both class and
    # instance call sites.
    @staticmethod
    def replace_nan(value) -> str:
        """Convert a pandas NaN cell to an empty string; pass other values through."""
        # pd.read_html represents missing cells as float('nan').
        if isinstance(value, float):
            if np.isnan(value):
                return ""
        return value

    # BUGFIX: took `cls` but was not decorated @classmethod, so calling
    # PracticeReportRawRow.from_raw(d) bound d to `cls`.
    @classmethod
    def from_raw(cls, input_dict) -> "PracticeReportRawRow":
        """Build a validated row from a raw DataFrame record.

        Normalizes full day names (e.g. "Wednesday") to their abbreviated
        field names and blanks out NaN values before validation.
        """
        return cls(
            **{
                DAY_OF_WEEK_STRING_MAPPING.get(k, k): cls.replace_nan(v)
                for k, v in input_dict.items()
            }
        )
def get_injury_report_dataframe(team: NFLTeam) -> pd.DataFrame:
    """Fetch and parse the official injury-report table for one team.

    Raises on HTTP failures, parse failures, or when the fetched page's
    first club-name header does not match the requested team.
    """
    injury_report_url = urljoin(team.injury_report_url, f"week/REG-{CURRENT_WEEK}")
    # Bound the request so one hung server cannot stall the whole scrape.
    report_request = requests.get(injury_report_url, timeout=30)
    # Fail fast on 4xx/5xx instead of parsing an error page.
    report_request.raise_for_status()
    # Name the parser explicitly: bare BeautifulSoup(...) emits a warning
    # and may select different parsers on different machines.
    report_soup = BeautifulSoup(report_request.content, "html.parser")
    team_names_spans = report_soup.find_all(
        "span", {"class": "nfl-o-injury-report__club-name"}
    )
    assert team_names_spans
    team_names_str = [x.get_text() for x in team_names_spans]
    # The first club-name on the page must be the team we asked for.
    assert team_names_str[0] == team.team_full_name
    tables = report_soup.find_all("table")
    df_report = pd.read_html(str(tables))[0]
    return df_report
def scrape_team_injury_report(team: NFLTeam) -> pd.DataFrame:
    """Scrape, validate, and normalize one team's injury report.

    Returns an empty DataFrame on any scrape/parse failure so that a
    single broken team page does not abort a full-league scrape.
    """
    print(f"Scraping Injury Report for: {team.team_full_name}")
    try:
        team_report = get_injury_report_dataframe(team)
    except Exception:
        # Deliberate best-effort: report the failure and skip this team.
        print(f"Failed to scrape practice report for: {team.team_full_name}")
        return pd.DataFrame()
    validated_row_list = []
    for df_row_dict in team_report.to_dict("records"):
        row_to_add = df_row_dict
        row_to_add["Team"] = team.team_full_name
        validated_row_list.append(PracticeReportRawRow.from_raw(row_to_add))
    validated_df = pd.DataFrame([x.dict() for x in validated_row_list])
    # Drop day columns that never appeared for this report.
    validated_df.dropna(axis=1, how="all", inplace=True)
    # Rename surviving day-of-week columns to practice-day ordinals "1".."3".
    # FIX: build the mapping first and rename once, instead of renaming
    # inplace inside a loop over validated_df.columns (mutating the frame
    # while iterating its own columns index).
    rename_map = {}
    last_practice_day = None
    day_idx = 1
    for col in list(validated_df.columns):
        if col in PRACTICE_WEEK:
            rename_map[col] = str(day_idx)
            day_idx += 1
            last_practice_day = col
    validated_df = validated_df.rename(columns=rename_map)
    # Remember which weekday the final practice column corresponded to.
    validated_df["Last Practice Day"] = last_practice_day
    return validated_df
def scrape_all_team_injury_report() -> pd.DataFrame:
    """Scrape every team's injury report and stack them into one DataFrame.

    Fans out over a process pool when MULTIPROCESSING_ENABLED is set;
    otherwise scrapes each team sequentially.
    """
    if not MULTIPROCESSING_ENABLED:
        per_team_reports = list(map(scrape_team_injury_report, ALL_TEAMS))
    else:
        with Pool() as worker_pool:
            per_team_reports = worker_pool.map(scrape_team_injury_report, ALL_TEAMS)
    return pd.concat(per_team_reports)