diff --git a/basketball_reference_web_scraper/client.py b/basketball_reference_web_scraper/client.py index ededda1c..e2f308d8 100644 --- a/basketball_reference_web_scraper/client.py +++ b/basketball_reference_web_scraper/client.py @@ -3,7 +3,9 @@ import requests from basketball_reference_web_scraper.contracts.data.models import PlayerContract -from basketball_reference_web_scraper.contracts.data.parsers import create_player_contract +from basketball_reference_web_scraper.contracts.data.parsers import PlayerContractParser, \ + SalariesBySeasonParser, deserialize_season_start_year, deserialize_guaranteed_salary, \ + deserialize_optional_salary, deserialize_team from basketball_reference_web_scraper.errors import InvalidSeason, InvalidDate, InvalidPlayerAndSeason from basketball_reference_web_scraper.http_service import HTTPService from basketball_reference_web_scraper.output.columns import BOX_SCORE_COLUMN_NAMES, SCHEDULE_COLUMN_NAMES, \ @@ -16,6 +18,14 @@ SearchCSVWriter from basketball_reference_web_scraper.parser_service import ParserService +player_contract_parser = PlayerContractParser( + salary_generator=SalariesBySeasonParser(season_start_year_deserializer=deserialize_season_start_year, + salary_deserializer=deserialize_optional_salary), + guaranteed_salary_generator=deserialize_guaranteed_salary, + player_generator=lambda row: Player(identifier=row.id, name=row.name), + team_generator=deserialize_team, +) + def standings(season_end_year, output_type=None, output_file_path=None, output_write_option=None, json_options=None): @@ -257,6 +267,17 @@ def search(term, output_type=None, output_file_path=None, output_write_option=No def player_contracts(player_contract_processor: Callable[[PlayerContract], Any]): + """ + Parses player contract data found on this page: https://www.basketball-reference.com/contracts/players.html + PlayerContract results are processed by the client caller via a callback. + + For example: + client.player_contracts(player_contract_processor=lambda player_contract_data: print(player_contract_data)) + + :param player_contract_processor: + :return: None + :raises: CouldNotGetPlayerContractData + """ HTTPService(parser=ParserService()).player_contracts( player_contract_processor=lambda player_row_contract_data: player_contract_processor( - create_player_contract(player_row_contract_data))) + player_contract_parser.parse_table_data(data=player_row_contract_data))) diff --git a/basketball_reference_web_scraper/contracts/data/parsers.py b/basketball_reference_web_scraper/contracts/data/parsers.py index a49124b5..18741c3a 100644 --- a/basketball_reference_web_scraper/contracts/data/parsers.py +++ b/basketball_reference_web_scraper/contracts/data/parsers.py @@ -1,59 +1,94 @@ from datetime import datetime -from typing import Dict +from typing import Dict, Optional -from coverage.types import Protocol from price_parser import Price from basketball_reference_web_scraper.contracts.data.models import Salary, PlayerContract, Player -from basketball_reference_web_scraper.contracts.page.parsers import PlayerRowData, PlayerContractData -from basketball_reference_web_scraper.data import TEAM_ABBREVIATIONS_TO_TEAM +from basketball_reference_web_scraper.contracts.page.parsers import PlayerContractData as PlayerContractTableData, \ + PlayerRowData +from basketball_reference_web_scraper.data import TEAM_ABBREVIATIONS_TO_TEAM, Team GUARANTEED_SALARY_COLUMN_DATA_STAT_VALUE = "remain_gtd" +from typing import Callable -class PlayerContractRowDataProcessor(Protocol): - def process_row(self, headers: Dict[str, str], row_data: PlayerRowData) -> PlayerContract: - raise NotImplementedError() +class SalariesBySeasonParser: + def __init__(self, season_start_year_deserializer: Callable[[str], int], + salary_deserializer: Callable[[Optional[str]], Optional[Salary]]): + self._season_start_year_deserializer = season_start_year_deserializer + self._salary_deserializer = salary_deserializer -def parse_season_start_date(serialized_season: str) -> int: - return datetime.strptime(serialized_season, "%Y-%y").year + def parse(self, contract_values_by_column_identifier: Dict[str, str], + column_names_by_identifier: Dict[str, str]) -> Dict[int, Optional[Salary]]: + return dict( + map(lambda item: (item[0], self._salary_deserializer(item[1])), + map(lambda column_name_and_value: ( + self._season_start_year_deserializer(column_name_and_value[0]), column_name_and_value[1]), + map( + lambda season_salary_columns_by_value: ( + column_names_by_identifier[season_salary_columns_by_value[0]], + season_salary_columns_by_value[1]), + filter( + lambda value_and_column_identifier: + value_and_column_identifier[0] != GUARANTEED_SALARY_COLUMN_DATA_STAT_VALUE, + contract_values_by_column_identifier.items()))))) -def parse_player_contract_values(contract_values_by_column_identifier: Dict[str, str], - column_names_by_identifier: Dict[str, str]): - guaranteed_salary_value = contract_values_by_column_identifier.get(GUARANTEED_SALARY_COLUMN_DATA_STAT_VALUE, None) - if guaranteed_salary_value: - parsed_guaranteed_salary = Price.fromstring(price=guaranteed_salary_value) - - return ( - dict(map(lambda item: ( - item[0], None if item[1] is None else Salary(amount=item[1].amount, currency=item[1].currency)), - map(lambda item: (item[0], None if item[1] is None else Price.fromstring(item[1])), - map(lambda item: (parse_season_start_date(item[0]), item[1]), - map(lambda item: (column_names_by_identifier.get(item[0]), item[1]), - filter(lambda item: item[0] != GUARANTEED_SALARY_COLUMN_DATA_STAT_VALUE, - contract_values_by_column_identifier.items())))))), - Salary( - amount=parsed_guaranteed_salary.amount, - currency=parsed_guaranteed_salary.currency - )) - - raise ValueError("Unparseable player contract values") - - -def create_player_contract(player_contract_data: PlayerContractData): - salaries_by_season, guaranteed_salary = parse_player_contract_values( - contract_values_by_column_identifier=player_contract_data.row.values_by_header, - column_names_by_identifier=player_contract_data.headers - ) +def deserialize_season_start_year(serialized_season: str) -> int: + """ + Parses the season strings for each contract year column. These strings have a form like "2024-25" + :param serialized_season: str representing the column name of a particular contract year season + :return: int representing the starting year for a given contract season + """ + parts = serialized_season.split("-") + if 1 >= len(parts): + raise ValueError(f"Unexpected contract season name: {serialized_season}") + return datetime.strptime(serialized_season.split("-")[0], "%Y").year + + +def deserialize_optional_salary(salary: Optional[str]) -> Optional[Salary]: + if salary: + parsed_amount = Price.fromstring(salary) + return Salary(amount=parsed_amount.amount, currency=parsed_amount.currency) + + +def deserialize_team(abbreviation: str) -> Team: + team = TEAM_ABBREVIATIONS_TO_TEAM.get(abbreviation, None) + if team: + return team + + raise ValueError(f"Unable to deserialize team abbreviation: {abbreviation}") - return PlayerContract( - player=Player( - identifier=player_contract_data.row.id, - name=player_contract_data.row.name - ), - team=TEAM_ABBREVIATIONS_TO_TEAM.get(player_contract_data.row.team_abbreviation, None), - salaries_by_season_start_year=salaries_by_season, - guaranteed_salary=guaranteed_salary + +def deserialize_guaranteed_salary(contract_values_by_column_identifier: Dict[str, str]) -> Salary: + guaranteed_salary_value = deserialize_optional_salary( + contract_values_by_column_identifier.get( + GUARANTEED_SALARY_COLUMN_DATA_STAT_VALUE, None + ) ) + if guaranteed_salary_value: + return guaranteed_salary_value + + raise ValueError( + f"Could not identify guaranteed salary value in header values: {contract_values_by_column_identifier}") + + +class PlayerContractParser: + def __init__(self, + salary_generator: SalariesBySeasonParser, + guaranteed_salary_generator: Callable[[Dict[str, Optional[str]]], Salary], + player_generator: Callable[[PlayerRowData], Player], + team_generator: Callable[[str], Team]): + self.salary_generator = salary_generator + self.guaranteed_salary_generator = guaranteed_salary_generator + self.player_generator = player_generator + self.team_generator = team_generator + + def parse_table_data(self, data: PlayerContractTableData) -> PlayerContract: + return PlayerContract( + player=self.player_generator(data.row), + team=self.team_generator(data.row.team_abbreviation), + salaries_by_season_start_year=self.salary_generator.parse(data.row.values_by_header, data.headers), + guaranteed_salary=self.guaranteed_salary_generator(data.row.values_by_header) + ) diff --git a/basketball_reference_web_scraper/errors.py b/basketball_reference_web_scraper/errors.py index 12b574f8..861414e3 100644 --- a/basketball_reference_web_scraper/errors.py +++ b/basketball_reference_web_scraper/errors.py @@ -1,11 +1,14 @@ +import requests + + class InvalidDate(Exception): def __init__(self, day, month, year): - message = "Date with year set to {year}, month set to {month}, and day set to {day} is invalid"\ + message = "Date with year set to {year}, month set to {month}, and day set to {day} is invalid" \ .format( - year=year, - month=month, - day=day, - ) + year=year, + month=month, + day=day, + ) super().__init__(message) @@ -20,3 +23,10 @@ def __init__(self, player_identifier, season_end_year): message = "Player with identifier \"{player_identifier}\" in season ending in {season_end_year} is invalid" \ .format(player_identifier=player_identifier, season_end_year=season_end_year) super().__init__(message) + + +class CouldNotGetPlayerContractData(Exception): + def __init__(self, response: requests.Response): + self._response = response + + super().__init__(f"HTTP request to {self._response.url} was unsuccessful: {self._response.status_code}") diff --git a/basketball_reference_web_scraper/http_service.py b/basketball_reference_web_scraper/http_service.py index cdd49fb2..3d12a0b7 100644 --- a/basketball_reference_web_scraper/http_service.py +++ b/basketball_reference_web_scraper/http_service.py @@ -11,11 +11,7 @@ from basketball_reference_web_scraper.contracts.page.parsers import PlayerContractsPageParser, NothingMoreToParse, \ PlayerContractData from basketball_reference_web_scraper.data import TEAM_TO_TEAM_ABBREVIATION, TeamTotal, PlayerData -from basketball_reference_web_scraper.errors import InvalidDate, InvalidPlayerAndSeason - - -class CouldNotGetPlayerContractData(Exception): - pass +from basketball_reference_web_scraper.errors import InvalidDate, InvalidPlayerAndSeason, CouldNotGetPlayerContractData class HTTPService: @@ -254,19 +250,26 @@ def search(self, term): } def player_contracts(self, player_contract_processor: Callable[[PlayerContractData], PlayerContract]) -> None: + """ + Makes an HTTP request to fetch player contract content. + Streams through the HTML page content in chunks, passing parsed data to the specified callback. + This approach attempts to keep memory allocation to a minimum. + :param player_contract_processor: + :return: + """ with requests.get( url=f"{HTTPService.BASE_URL}/contracts/players.html", stream=True, ) as response: if not response.ok: - raise CouldNotGetPlayerContractData() + raise CouldNotGetPlayerContractData(response) if response.encoding is None: response.encoding = 'utf-8' with PlayerContractsPageParser(player_contract_data_processor=player_contract_processor) as p: - for chunk in response.iter_content(chunk_size=500, decode_unicode=True): + for chunk in response.iter_content(chunk_size=1024, decode_unicode=True): try: p.parse(chunk=chunk) except NothingMoreToParse: diff --git a/tests/integration/client/test_player_contracts.py b/tests/integration/client/test_player_contracts.py index 40355205..78dd624f 100644 --- a/tests/integration/client/test_player_contracts.py +++ b/tests/integration/client/test_player_contracts.py @@ -19,5 +19,5 @@ def test_player_contracts(self, m): text=self._html, status_code=200) data = [] - player_contracts(player_contract_processor=lambda player_contract: data.append(player_contract)) + player_contracts(player_contract_processor=data.append) assert 496 == len(data) diff --git a/tests/integration/parsers/contracts/__init__.py b/tests/integration/parsers/contracts/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/integration/parsers/contracts/test_data.py b/tests/integration/parsers/contracts/test_data.py new file mode 100644 index 00000000..b21d1113 --- /dev/null +++ b/tests/integration/parsers/contracts/test_data.py @@ -0,0 +1,19 @@ +from unittest import TestCase + +from basketball_reference_web_scraper.contracts.data.parsers import SalariesBySeasonParser + + +class TestSalariesBySeasonParser(TestCase): + def setUp(self): + self.parser = SalariesBySeasonParser( + season_start_year_deserializer= + ) + + def test_unknown_column_name_raises_error(self): + raise NotImplementedError() + + def test_season_start_year_deserialization_error_raises_error(self): + raise NotImplementedError() + + def test_salary_deserialization_error_raises_error(self): + raise NotImplementedError() \ No newline at end of file diff --git a/tests/integration/parsers/contracts/test_page.py b/tests/integration/parsers/contracts/test_page.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/tests/integration/parsers/contracts/test_page.py @@ -0,0 +1 @@ + diff --git a/tests/unit/contracts/__init__.py b/tests/unit/contracts/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/contracts/data/__init__.py b/tests/unit/contracts/data/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/unit/contracts/data/test_parsers.py b/tests/unit/contracts/data/test_parsers.py new file mode 100644 index 00000000..93ea6e17 --- /dev/null +++ b/tests/unit/contracts/data/test_parsers.py @@ -0,0 +1,73 @@ +import hashlib +import itertools +from decimal import Decimal +from unittest import TestCase + +from basketball_reference_web_scraper.contracts.data.models import Salary +from basketball_reference_web_scraper.contracts.data.parsers import deserialize_season_start_year, deserialize_team, \ + deserialize_guaranteed_salary, SalariesBySeasonParser +from basketball_reference_web_scraper.data import Team + + +class TestDeserializingSeasonStartYear(TestCase): + def test_non_numeric_value_raises_error(self): + with self.assertRaises(ValueError): + deserialize_season_start_year(serialized_season="foobar") + + def test_invalidly_formatted_numeric_value_raises_error(self): + with self.assertRaises(ValueError): + deserialize_season_start_year(serialized_season="2024") + + def test_validly_formatted_value_returns_value(self): + assert 2024 == deserialize_season_start_year(serialized_season="2024-25") + + +class TestDeserializingTeamAbbreviation(TestCase): + def test_invalid_abbreviation_raises_error(self): + with self.assertRaisesRegexp(ValueError, "Unable to deserialize team abbreviation: jaebaebae"): + deserialize_team("jaebaebae") + + def test_valid_abbreviation_returns_team(self): + assert Team.BOSTON_CELTICS == deserialize_team("BOS") + + +class TestDeserializingGuaranteedSalary(TestCase): + def test_raise_error_guaranteed_salary_column_does_not_exist(self): + with self.assertRaises(ValueError): + deserialize_guaranteed_salary(contract_values_by_column_identifier={}) + + def test_raises_when_column_exists_but_value_is_an_empty_string(self): + with self.assertRaises(ValueError): + deserialize_guaranteed_salary(contract_values_by_column_identifier={"remain_gtd": ""}) + + def test_returns_salary_when_column_exists_and_value_is_not_an_empty_string(self): + assert Salary(amount=Decimal(1_234_567), currency="$") == deserialize_guaranteed_salary( + contract_values_by_column_identifier={"remain_gtd": "$1,234,567"} + ) + + +class TestSalariesBySeasonParser(TestCase): + def test_valid_salaries_by_season(self): + def generate_salary(): + for value in itertools.count(start=0, step=1): + yield Salary(amount=Decimal(value), currency="$") + + salary_generator = generate_salary() + parser = SalariesBySeasonParser( + season_start_year_deserializer=lambda v: int.from_bytes(hashlib.md5(v.encode("utf-8")).digest()[:8], + 'little', signed=True), + salary_deserializer=lambda v: None if v is None else next(salary_generator) + ) + assert {6699318081062747564: Salary(amount=Decimal('0'), currency='$'), + -2012135647395072713: Salary(amount=Decimal('1'), currency='$')} == parser.parse( + contract_values_by_column_identifier={ + "f": "bar", + "b": "jae", + "remain_gtd": "test" + }, + column_names_by_identifier={ + "f": "foo", + "b": "bar", + "remain_gtd": "guaranteed_money" + } + )