game_urls_scraper.py
import json
import threading
import urllib.request
from datetime import datetime, timedelta

from bs4 import BeautifulSoup

# How many times to retry a failed request for a single day before giving up.
REQ_RETRIES = 5

def parse_game_urls(games):
    """Collect game IDs from a day's game cards; signal once the playoffs start."""
    game_urls = []
    for game in games:
        card_data = game["cardData"]
        if card_data["seasonType"] == "Regular Season":
            # The game ID is the last path segment of the card's share URL.
            game_urls.append(card_data["shareUrl"].split("/")[-1])
        if card_data["seasonType"] == "Playoffs":
            print("Playoffs, stopping")
            return game_urls, True
    return game_urls, False
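
# For reference, a minimal sketch of the card structure parse_game_urls assumes,
# reconstructed from the key accesses in this file; the field values below are
# illustrative, not taken from a real response:
#
#   {
#       "cardData": {
#           "seasonType": "Regular Season",  # or "Playoffs"
#           "shareUrl": "https://www.nba.com/game/lal-vs-bos-0029600001"
#       }
#   }
#
# For this card, parse_game_urls would collect "lal-vs-bos-0029600001".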

def scrape_games_by_day(day):
    """Fetch nba.com's games page for one date (YYYY-MM-DD) and parse its cards."""
    url = f"https://www.nba.com/games?date={day}"
    response = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(response, "html.parser")
    # nba.com is a Next.js site: the page's data is embedded as JSON in the
    # __NEXT_DATA__ script tag rather than in the rendered HTML.
    data = json.loads(soup.find("script", type="application/json", id="__NEXT_DATA__").text)
    games_as_list = data["props"]["pageProps"]["gameCardFeed"]["modules"]
    if not games_as_list:
        print("No games this day - skipping")
        return [], False
    game_cards_list = games_as_list[0]["cards"]
    return parse_game_urls(game_cards_list)
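
# Note: if nba.com starts rejecting bare urllib requests (an assumption, not
# something this script handles), the fetch above could send a browser-like
# User-Agent header instead, still using only the standard library:
#
#   req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
#   response = urllib.request.urlopen(req).read()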

def generate_dates_for_year(year):
    """Return every date from Oct 15 of `year` through May 31 of the next year."""
    start_date = datetime(year, 10, 15)
    end_date = datetime(year + 1, 6, 1)
    all_days = []
    current_date = start_date
    while current_date < end_date:
        all_days.append(current_date.strftime("%Y-%m-%d"))
        current_date += timedelta(days=1)
    return all_days
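
# Example: for the 1996-97 season the range covers the whole season window:
#
#   >>> days = generate_dates_for_year(1996)
#   >>> days[0], days[-1]
#   ('1996-10-15', '1997-05-31')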

def scrape_games_ids_by_year(year):
    """Scrape every regular-season game ID for the season starting in `year`."""
    game_urls = []
    days_of_year = generate_dates_for_year(year)
    should_stop = False
    for day in days_of_year:
        print(f"fetching games for {day=}")
        done = False
        i = 0
        while not done and i < REQ_RETRIES:
            try:
                game_urls_for_day, should_stop = scrape_games_by_day(day)
                game_urls.extend(game_urls_for_day)
                done = True
                print("Done")
            except Exception as e:
                print(f"caught error {e}, retrying...")
                i += 1
        if i == REQ_RETRIES:
            # Every retry failed; log the day so it can be re-scraped later.
            print(f"failed fetching games for {day=}, moving on...")
            with open("data/unsuccessful_days.txt", 'a') as f:
                f.write(day + "\n")
                f.flush()
        if should_stop:
            # Playoffs reached: no more regular-season games this season.
            break
    with open(f'data/game_ids/{year}.txt', 'w') as f:
        f.write("\n".join(game_urls))
        f.flush()

def main():
    # Scrape the 1996-97 through 2019-20 seasons in batches of four,
    # one thread per season.
    for base_year in range(1996, 2019, 4):
        threads = []
        for year in [base_year, base_year + 1, base_year + 2, base_year + 3]:
            thread = threading.Thread(target=scrape_games_ids_by_year, args=(year,))
            threads.append(thread)
            thread.start()
        for thread in threads:
            thread.join()


if __name__ == '__main__':
    main()
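
# Usage sketch: to scrape a single season without the threaded driver above,
# the per-year function can be called directly (this assumes the data/ and
# data/game_ids/ directories already exist, as main() does):
#
#   scrape_games_ids_by_year(2016)  # writes data/game_ids/2016.txt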