Skip to content

Player contracts parsing #315

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: player-contracts
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions basketball_reference_web_scraper/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
import requests

from basketball_reference_web_scraper.contracts.data.models import PlayerContract
from basketball_reference_web_scraper.contracts.data.parsers import create_player_contract
from basketball_reference_web_scraper.contracts.data.parsers import PlayerContractParser, \
SalariesBySeasonParser, deserialize_season_start_year, deserialize_guaranteed_salary, \
deserialize_optional_salary, deserialize_team
from basketball_reference_web_scraper.errors import InvalidSeason, InvalidDate, InvalidPlayerAndSeason
from basketball_reference_web_scraper.http_service import HTTPService
from basketball_reference_web_scraper.output.columns import BOX_SCORE_COLUMN_NAMES, SCHEDULE_COLUMN_NAMES, \
Expand All @@ -16,6 +18,14 @@
SearchCSVWriter
from basketball_reference_web_scraper.parser_service import ParserService

# Module-level parser used by player_contracts(); each collaborator converts one
# slice of a scraped contract row into its model object.
# NOTE(review): `Player` is not among the imports visible in this hunk — confirm it
# is imported before the lambda below is ever called.
player_contract_parser = PlayerContractParser(
    player_generator=lambda row: Player(identifier=row.id, name=row.name),
    team_generator=deserialize_team,
    guaranteed_salary_generator=deserialize_guaranteed_salary,
    salary_generator=SalariesBySeasonParser(
        salary_deserializer=deserialize_optional_salary,
        season_start_year_deserializer=deserialize_season_start_year,
    ),
)


def standings(season_end_year, output_type=None, output_file_path=None, output_write_option=None,
json_options=None):
Expand Down Expand Up @@ -257,6 +267,17 @@ def search(term, output_type=None, output_file_path=None, output_write_option=No


def player_contracts(player_contract_processor: Callable[[PlayerContract], Any]):
    """
    Parses player contract data found on this page: https://www.basketball-reference.com/contracts/players.html
    PlayerContract results are processed by the client caller via a callback.

    For example:
    client.player_contracts(player_contract_processor=lambda player_contract_data: print(player_contract_data))

    :param player_contract_processor: callback invoked with each PlayerContract parsed from the page
    :return: None
    :raises: CouldNotGetPlayerContractData when the request for the contracts page is unsuccessful
    """

    def process_row(player_row_contract_data):
        # Convert the raw table data into a PlayerContract before handing it to the caller's callback.
        return player_contract_processor(
            player_contract_parser.parse_table_data(data=player_row_contract_data))

    HTTPService(parser=ParserService()).player_contracts(player_contract_processor=process_row)
123 changes: 79 additions & 44 deletions basketball_reference_web_scraper/contracts/data/parsers.py
Original file line number Diff line number Diff line change
@@ -1,59 +1,94 @@
from datetime import datetime
from typing import Dict
from typing import Dict, Optional

from coverage.types import Protocol
from price_parser import Price

from basketball_reference_web_scraper.contracts.data.models import Salary, PlayerContract, Player
from basketball_reference_web_scraper.contracts.page.parsers import PlayerRowData, PlayerContractData
from basketball_reference_web_scraper.data import TEAM_ABBREVIATIONS_TO_TEAM
from basketball_reference_web_scraper.contracts.page.parsers import PlayerContractData as PlayerContractTableData, \
PlayerRowData
from basketball_reference_web_scraper.data import TEAM_ABBREVIATIONS_TO_TEAM, Team

GUARANTEED_SALARY_COLUMN_DATA_STAT_VALUE = "remain_gtd"

from typing import Callable

class PlayerContractRowDataProcessor(Protocol):
    """Structural interface for objects that turn one contract table row into a PlayerContract."""

    # NOTE(review): `Protocol` is imported from coverage.types in this module; typing.Protocol
    # is the standard-library home — confirm which one is intended.
    def process_row(self, headers: Dict[str, str], row_data: PlayerRowData) -> PlayerContract:
        """
        :param headers: table headers as a str-to-str mapping
        :param row_data: the row to convert
        :return: the PlayerContract built from the row
        :raises: NotImplementedError always; implementers supply the real behaviour
        """
        raise NotImplementedError

class SalariesBySeasonParser:
def __init__(self, season_start_year_deserializer: Callable[[str], int],
             salary_deserializer: Callable[[Optional[str]], Optional[Salary]]):
    """
    :param season_start_year_deserializer: callable turning a season column name into an int year
    :param salary_deserializer: callable turning an optional raw salary string into an optional Salary
    """
    self._salary_deserializer = salary_deserializer
    self._season_start_year_deserializer = season_start_year_deserializer

def parse_season_start_date(serialized_season: str) -> int:
    """
    Parses a season string of the form "2024-25" and returns the year the season starts in.

    :param serialized_season: season label made of a four-digit start year, a dash and a two-digit end year
    :return: the starting year, e.g. 2024 for "2024-25"
    :raises: ValueError when the value does not match the "%Y-%y" layout
    """
    # Validate the complete layout first so malformed input is still rejected with ValueError.
    datetime.strptime(serialized_season, "%Y-%y")
    # Read the year from the leading component only: when "%Y" and "%y" share one format the
    # two-digit value is applied last, which made "2024-25" come back as 2025.
    start_year, _, _ = serialized_season.partition("-")
    return datetime.strptime(start_year, "%Y").year
def parse(self, contract_values_by_column_identifier: Dict[str, str],
          column_names_by_identifier: Dict[str, str]) -> Dict[int, Optional[Salary]]:
    """
    Builds the salary-per-season mapping for one set of contract values.

    Every entry except the guaranteed-salary column is handled the same way: its identifier is
    resolved to a column name, the name is deserialized into a season start year and the value
    is deserialized into a salary.

    :param contract_values_by_column_identifier: values keyed by column identifier
    :param column_names_by_identifier: column names keyed by column identifier
    :return: deserialized salaries keyed by season start year
    :raises: KeyError when an identifier has no entry in column_names_by_identifier
    """
    salaries_by_season_start_year = {}
    for column_identifier, raw_salary in contract_values_by_column_identifier.items():
        if column_identifier == GUARANTEED_SALARY_COLUMN_DATA_STAT_VALUE:
            continue
        column_name = column_names_by_identifier[column_identifier]
        season_start_year = self._season_start_year_deserializer(column_name)
        salaries_by_season_start_year[season_start_year] = self._salary_deserializer(raw_salary)
    return salaries_by_season_start_year


def parse_player_contract_values(contract_values_by_column_identifier: Dict[str, str],
                                 column_names_by_identifier: Dict[str, str]):
    """
    Splits one set of contract values into per-season salaries and the guaranteed salary.

    :param contract_values_by_column_identifier: values keyed by column identifier
    :param column_names_by_identifier: column names keyed by column identifier
    :return: tuple of (salaries keyed by parsed season year, guaranteed Salary)
    :raises: ValueError when the guaranteed-salary value is missing or empty
    """
    raw_guaranteed_salary = contract_values_by_column_identifier.get(GUARANTEED_SALARY_COLUMN_DATA_STAT_VALUE)
    if not raw_guaranteed_salary:
        raise ValueError("Unparseable player contract values")

    guaranteed_price = Price.fromstring(price=raw_guaranteed_salary)

    salaries_by_season = {}
    for column_identifier, raw_salary in contract_values_by_column_identifier.items():
        if column_identifier == GUARANTEED_SALARY_COLUMN_DATA_STAT_VALUE:
            continue
        season_year = parse_season_start_date(column_names_by_identifier.get(column_identifier))
        if raw_salary is None:
            # A missing value is kept as an explicit None entry for that season.
            salaries_by_season[season_year] = None
        else:
            season_price = Price.fromstring(raw_salary)
            salaries_by_season[season_year] = Salary(amount=season_price.amount, currency=season_price.currency)

    return salaries_by_season, Salary(amount=guaranteed_price.amount, currency=guaranteed_price.currency)


def create_player_contract(player_contract_data: PlayerContractData):
salaries_by_season, guaranteed_salary = parse_player_contract_values(
contract_values_by_column_identifier=player_contract_data.row.values_by_header,
column_names_by_identifier=player_contract_data.headers
)
def deserialize_season_start_year(serialized_season: str) -> int:
    """
    Extracts the starting year from a contract-year column name such as "2024-25".

    :param serialized_season: str naming a contract season; the start year is the text before the first dash
    :return: int representing the starting year of the season
    :raises: ValueError when there is no dash or the leading component is not a valid "%Y" year
    """
    start_year, dash, _ = serialized_season.partition("-")
    if not dash:
        raise ValueError(f"Unexpected contract season name: {serialized_season}")
    return datetime.strptime(start_year, "%Y").year


def deserialize_optional_salary(salary: Optional[str]) -> Optional[Salary]:
    """
    Converts a raw salary string into a Salary, passing missing values through as None.

    :param salary: raw salary text; None or an empty string yields None
    :return: Salary carrying the parsed amount and currency, or None when the input is falsy
    """
    if not salary:
        return None

    price = Price.fromstring(salary)
    return Salary(amount=price.amount, currency=price.currency)


def deserialize_team(abbreviation: str) -> Team:
    """
    Looks up the Team registered for an abbreviation.

    :param abbreviation: key into TEAM_ABBREVIATIONS_TO_TEAM
    :return: the matching Team
    :raises: ValueError when the lookup yields nothing (or a falsy value)
    """
    team = TEAM_ABBREVIATIONS_TO_TEAM.get(abbreviation)
    if not team:
        raise ValueError(f"Unable to deserialize team abbreviation: {abbreviation}")
    return team

return PlayerContract(
player=Player(
identifier=player_contract_data.row.id,
name=player_contract_data.row.name
),
team=TEAM_ABBREVIATIONS_TO_TEAM.get(player_contract_data.row.team_abbreviation, None),
salaries_by_season_start_year=salaries_by_season,
guaranteed_salary=guaranteed_salary

def deserialize_guaranteed_salary(contract_values_by_column_identifier: Dict[str, str]) -> Salary:
    """
    Reads the guaranteed-salary value from the given mapping and deserializes it.

    :param contract_values_by_column_identifier: values keyed by column identifier
    :return: the deserialized guaranteed Salary
    :raises: ValueError when the guaranteed-salary entry is absent or deserializes to nothing
    """
    raw_guaranteed_salary = contract_values_by_column_identifier.get(GUARANTEED_SALARY_COLUMN_DATA_STAT_VALUE)
    guaranteed_salary = deserialize_optional_salary(raw_guaranteed_salary)
    if not guaranteed_salary:
        raise ValueError(
            f"Could not identify guaranteed salary value in header values: {contract_values_by_column_identifier}")
    return guaranteed_salary


class PlayerContractParser:
    """Assembles PlayerContract objects from table data using injected collaborators."""

    def __init__(self,
                 salary_generator: SalariesBySeasonParser,
                 guaranteed_salary_generator: Callable[[Dict[str, Optional[str]]], Salary],
                 player_generator: Callable[[PlayerRowData], Player],
                 team_generator: Callable[[str], Team]):
        """
        :param salary_generator: parser producing the salaries-by-season mapping
        :param guaranteed_salary_generator: callable producing the guaranteed Salary from the row values
        :param player_generator: callable producing the Player from the row
        :param team_generator: callable producing the Team from the row's team abbreviation
        """
        self.team_generator = team_generator
        self.player_generator = player_generator
        self.guaranteed_salary_generator = guaranteed_salary_generator
        self.salary_generator = salary_generator

    def parse_table_data(self, data: PlayerContractTableData) -> PlayerContract:
        """
        :param data: table data exposing `row` and `headers`
        :return: PlayerContract assembled from the collaborators' results
        """
        # Collaborators run in the same order as the PlayerContract fields: player, team,
        # per-season salaries, guaranteed salary.
        player = self.player_generator(data.row)
        team = self.team_generator(data.row.team_abbreviation)
        salaries_by_season_start_year = self.salary_generator.parse(data.row.values_by_header, data.headers)
        guaranteed_salary = self.guaranteed_salary_generator(data.row.values_by_header)
        return PlayerContract(
            player=player,
            team=team,
            salaries_by_season_start_year=salaries_by_season_start_year,
            guaranteed_salary=guaranteed_salary,
        )
20 changes: 15 additions & 5 deletions basketball_reference_web_scraper/errors.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import requests


class InvalidDate(Exception):
    """Signals that the supplied day, month and year were rejected as an invalid date."""

    def __init__(self, day, month, year):
        """
        :param day: day component of the rejected date
        :param month: month component of the rejected date
        :param year: year component of the rejected date
        """
        # The message reports year, month, then day even though the parameters arrive as day, month, year.
        message = f"Date with year set to {year}, month set to {month}, and day set to {day} is invalid"
        super().__init__(message)


Expand All @@ -20,3 +23,10 @@ def __init__(self, player_identifier, season_end_year):
message = "Player with identifier \"{player_identifier}\" in season ending in {season_end_year} is invalid" \
.format(player_identifier=player_identifier, season_end_year=season_end_year)
super().__init__(message)


class CouldNotGetPlayerContractData(Exception):
    """Error carrying an unsuccessful HTTP response; the message reports its URL and status code."""

    def __init__(self, response: requests.Response):
        """
        :param response: the response whose `url` and `status_code` are reported in the message
        """
        # Keep the response on the instance before building the message from it.
        self._response = response
        message = f"HTTP request to {self._response.url} was unsuccessful: {self._response.status_code}"
        super().__init__(message)
17 changes: 10 additions & 7 deletions basketball_reference_web_scraper/http_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,7 @@
from basketball_reference_web_scraper.contracts.page.parsers import PlayerContractsPageParser, NothingMoreToParse, \
PlayerContractData
from basketball_reference_web_scraper.data import TEAM_TO_TEAM_ABBREVIATION, TeamTotal, PlayerData
from basketball_reference_web_scraper.errors import InvalidDate, InvalidPlayerAndSeason


# NOTE(review): this module also imports a class of the same name from
# basketball_reference_web_scraper.errors — confirm this local definition is still wanted.
class CouldNotGetPlayerContractData(Exception):
    """Signals that player contract data could not be fetched."""
from basketball_reference_web_scraper.errors import InvalidDate, InvalidPlayerAndSeason, CouldNotGetPlayerContractData


class HTTPService:
Expand Down Expand Up @@ -254,19 +250,26 @@ def search(self, term):
}

def player_contracts(self, player_contract_processor: Callable[[PlayerContractData], PlayerContract]) -> None:
"""
Makes an HTTP request to fetch player contract content.
Streams through the HTML page content in chunks, passing parsed data to the specified callback.
This approach attempts to keep memory allocation to a minimum.
:param player_contract_processor:
:return:
"""
with requests.get(
url=f"{HTTPService.BASE_URL}/contracts/players.html",
stream=True,

) as response:
if not response.ok:
raise CouldNotGetPlayerContractData()
raise CouldNotGetPlayerContractData(response)

if response.encoding is None:
response.encoding = 'utf-8'

with PlayerContractsPageParser(player_contract_data_processor=player_contract_processor) as p:
for chunk in response.iter_content(chunk_size=500, decode_unicode=True):
for chunk in response.iter_content(chunk_size=1024, decode_unicode=True):
try:
p.parse(chunk=chunk)
except NothingMoreToParse:
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/client/test_player_contracts.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,5 @@ def test_player_contracts(self, m):
text=self._html,
status_code=200)
data = []
player_contracts(player_contract_processor=lambda player_contract: data.append(player_contract))
player_contracts(player_contract_processor=data.append)
assert 496 == len(data)
Empty file.
19 changes: 19 additions & 0 deletions tests/integration/parsers/contracts/test_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from unittest import TestCase

from basketball_reference_web_scraper.contracts.data.parsers import SalariesBySeasonParser


class TestSalariesBySeasonParser(TestCase):
    """Checks that SalariesBySeasonParser.parse surfaces failures instead of swallowing them."""

    def setUp(self):
        # Stub deserializers keep these tests independent of the real salary/season parsing:
        # int() fails with ValueError on non-numeric names and the salary value passes through.
        self.parser = SalariesBySeasonParser(
            season_start_year_deserializer=int,
            salary_deserializer=lambda value: value,
        )

    def test_unknown_column_name_raises_error(self):
        # parse() resolves identifiers by subscripting the names mapping, so an unknown one is a KeyError.
        with self.assertRaises(KeyError):
            self.parser.parse(
                contract_values_by_column_identifier={"y1": "$1"},
                column_names_by_identifier={},
            )

    def test_season_start_year_deserialization_error_raises_error(self):
        with self.assertRaises(ValueError):
            self.parser.parse(
                contract_values_by_column_identifier={"y1": "$1"},
                column_names_by_identifier={"y1": "not-a-year"},
            )

    def test_salary_deserialization_error_raises_error(self):
        def failing_salary_deserializer(value):
            raise ValueError(f"cannot deserialize salary: {value}")

        parser = SalariesBySeasonParser(
            season_start_year_deserializer=int,
            salary_deserializer=failing_salary_deserializer,
        )
        with self.assertRaises(ValueError):
            parser.parse(
                contract_values_by_column_identifier={"y1": "$1"},
                column_names_by_identifier={"y1": "2024"},
            )
1 change: 1 addition & 0 deletions tests/integration/parsers/contracts/test_page.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

Empty file.
Empty file.
73 changes: 73 additions & 0 deletions tests/unit/contracts/data/test_parsers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import hashlib
import itertools
from decimal import Decimal
from unittest import TestCase

from basketball_reference_web_scraper.contracts.data.models import Salary
from basketball_reference_web_scraper.contracts.data.parsers import deserialize_season_start_year, deserialize_team, \
deserialize_guaranteed_salary, SalariesBySeasonParser
from basketball_reference_web_scraper.data import Team


class TestDeserializingSeasonStartYear(TestCase):
    """Covers deserialize_season_start_year for malformed and well-formed season names."""

    def test_non_numeric_value_raises_error(self):
        self.assertRaises(ValueError, deserialize_season_start_year, serialized_season="foobar")

    def test_invalidly_formatted_numeric_value_raises_error(self):
        # A bare year is rejected as well.
        self.assertRaises(ValueError, deserialize_season_start_year, serialized_season="2024")

    def test_validly_formatted_value_returns_value(self):
        self.assertEqual(2024, deserialize_season_start_year(serialized_season="2024-25"))


class TestDeserializingTeamAbbreviation(TestCase):
    """Covers deserialize_team for unknown and known abbreviations."""

    def test_invalid_abbreviation_raises_error(self):
        # assertRaisesRegexp was a deprecated alias that Python 3.12 removed;
        # assertRaisesRegex is the supported spelling.
        with self.assertRaisesRegex(ValueError, "Unable to deserialize team abbreviation: jaebaebae"):
            deserialize_team("jaebaebae")

    def test_valid_abbreviation_returns_team(self):
        assert Team.BOSTON_CELTICS == deserialize_team("BOS")


class TestDeserializingGuaranteedSalary(TestCase):
    """Covers deserialize_guaranteed_salary for missing, empty and populated values."""

    def test_raise_error_guaranteed_salary_column_does_not_exist(self):
        self.assertRaises(ValueError, deserialize_guaranteed_salary, contract_values_by_column_identifier={})

    def test_raises_when_column_exists_but_value_is_an_empty_string(self):
        self.assertRaises(
            ValueError,
            deserialize_guaranteed_salary,
            contract_values_by_column_identifier={"remain_gtd": ""},
        )

    def test_returns_salary_when_column_exists_and_value_is_not_an_empty_string(self):
        expected = Salary(amount=Decimal(1_234_567), currency="$")
        actual = deserialize_guaranteed_salary(
            contract_values_by_column_identifier={"remain_gtd": "$1,234,567"}
        )
        self.assertEqual(expected, actual)


# Exercises SalariesBySeasonParser.parse with stub deserializers so the result depends only on
# the parser's own filtering and mapping logic.
class TestSalariesBySeasonParser(TestCase):
def test_valid_salaries_by_season(self):
# Yields Salary(0), Salary(1), ... so each deserialized value is distinguishable by call order.
def generate_salary():
for value in itertools.count(start=0, step=1):
yield Salary(amount=Decimal(value), currency="$")

salary_generator = generate_salary()
parser = SalariesBySeasonParser(
# Stand-in year deserializer: the first 8 bytes of the md5 digest of the column name,
# read as a signed little-endian integer.
season_start_year_deserializer=lambda v: int.from_bytes(hashlib.md5(v.encode("utf-8")).digest()[:8],
'little', signed=True),
# Stand-in salary deserializer: None passes through, anything else takes the next generated Salary.
salary_deserializer=lambda v: None if v is None else next(salary_generator)
)
# Three columns go in but only two entries are expected: the "remain_gtd" column is excluded.
# presumably the two keys are the md5-derived integers for "foo" and "bar" — TODO confirm.
assert {6699318081062747564: Salary(amount=Decimal('0'), currency='$'),
-2012135647395072713: Salary(amount=Decimal('1'), currency='$')} == parser.parse(
contract_values_by_column_identifier={
"f": "bar",
"b": "jae",
"remain_gtd": "test"
},
column_names_by_identifier={
"f": "foo",
"b": "bar",
"remain_gtd": "guaranteed_money"
}
)
Loading