Skip to content

Commit c93d87b

Browse files
committed
Update internetarchive fetching to use shared session
1 parent 990d535 commit c93d87b

File tree

2 files changed

16 additions and 24 deletions

2 files changed

16 additions and 24 deletions

scripts/1-fetch/internetarchive_fetch.py

Lines changed: 4 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,6 @@
2323
from pygments import highlight
2424
from pygments.formatters import TerminalFormatter
2525
from pygments.lexers import PythonTracebackLexer
26-
from urllib3.util.retry import Retry
2726

2827
# Add parent directory so shared can be imported
2928
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -64,24 +63,6 @@ def parse_arguments():
6463
return args
6564

6665

67-
def get_archive_session():
68-
retry_strategy = Retry(
69-
total=5,
70-
backoff_factor=10,
71-
status_forcelist=shared.STATUS_FORCELIST,
72-
allowed_methods=["GET", "POST"],
73-
raise_on_status=False,
74-
)
75-
adapter_kwargs = {
76-
"max_retries": retry_strategy,
77-
}
78-
session = ArchiveSession(http_adapter_kwargs=adapter_kwargs)
79-
session.headers.update(
80-
{"User-Agent": shared.USER_AGENT, "Accept": "application/json"}
81-
)
82-
return session
83-
84-
8566
def load_license_mapping():
8667
"""Loads and normalizes the license mapping from CSV."""
8768
license_mapping = {}
@@ -174,7 +155,7 @@ def iso639_lookup(term):
174155

175156

176157
# strip common noise like "subtitles", "subtitle",
177-
# "(English)", "english patch", "handwritten", etc
158+
# "(English)", "english patch", "handwritten", etc.
178159
def strip_noise(s):
179160
# Helper to find words with flexible boundaries
180161
def word_regex(word):
@@ -330,7 +311,9 @@ def query_internet_archive(args):
330311
total_processed = 0
331312
max_retries = 3
332313

333-
session = get_archive_session()
314+
session = shared.get_session(
315+
accept_header="application/json", session=ArchiveSession()
316+
)
334317
while True:
335318
# Loop until no more results are returned by the API
336319
LOGGER.info(f"Fetching {rows} items starting at {total_rows}...")

scripts/shared.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,23 @@ def __init__(self, message, exit_code=None):
3333
super().__init__(self.message)
3434

3535

36-
def get_session(accept_header=None):
37-
"""Create a reusable HTTP session with retry logic."""
38-
session = Session()
36+
# def get_session(accept_header=None):
37+
# """Create a reusable HTTP session with retry logic."""
38+
# session = Session()
39+
40+
41+
def get_session(accept_header=None, session=None):
42+
"""Create or configure a reusable HTTP session
43+
with retry logic and headers."""
44+
if session is None:
45+
session = Session()
3946

4047
retry_strategy = Retry(
4148
total=5,
4249
backoff_factor=10,
4350
status_forcelist=STATUS_FORCELIST,
51+
allowed_methods=["GET", "POST"],
52+
raise_on_status=False,
4453
)
4554
session.mount("https://", HTTPAdapter(max_retries=retry_strategy))
4655

0 commit comments

Comments
 (0)