Skip to content

Commit cc1ae57

Browse files
authored
Merge pull request #226 from Joyakis/refactor/shared-get-session
Centralize get_session()
2 parents 0d44547 + 6efdf71 commit cc1ae57

File tree

6 files changed

+32
-86
lines changed

6 files changed

+32
-86
lines changed

scripts/1-fetch/arxiv_fetch.py

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,6 @@
2222
from pygments import highlight
2323
from pygments.formatters import TerminalFormatter
2424
from pygments.lexers import PythonTracebackLexer
25-
from requests.adapters import HTTPAdapter
26-
from urllib3.util.retry import Retry
2725

2826
# Add parent directory so shared can be imported
2927
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -335,19 +333,6 @@ def initialize_all_data_files(args):
335333
initialize_data_file(FILE_ARXIV_AUTHOR_BUCKET, HEADER_AUTHOR_BUCKET)
336334

337335

338-
def get_requests_session():
339-
"""Create request session with retry logic"""
340-
retry_strategy = Retry(
341-
total=5,
342-
backoff_factor=10,
343-
status_forcelist=shared.STATUS_FORCELIST,
344-
)
345-
session = requests.Session()
346-
session.headers.update({"User-Agent": shared.USER_AGENT})
347-
session.mount("https://", HTTPAdapter(max_retries=retry_strategy))
348-
return session
349-
350-
351336
def normalize_license_text(raw_text):
352337
"""
353338
Convert raw license text to standardized CC license identifiers.
@@ -533,7 +518,7 @@ def query_arxiv(args):
533518
"""
534519

535520
LOGGER.info("Beginning to fetch results from ArXiv API")
536-
session = get_requests_session()
521+
session = shared.get_session()
537522

538523
results_per_iteration = 50
539524

scripts/1-fetch/europeana_fetch.py

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
from pygments import highlight
2424
from pygments.formatters import TerminalFormatter
2525
from pygments.lexers import PythonTracebackLexer
26-
from requests.adapters import HTTPAdapter, Retry
2726

2827
# Add parent directory for shared imports
2928
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -103,19 +102,6 @@ def parse_arguments():
103102
return args
104103

105104

106-
def get_requests_session():
107-
"""Create a requests session with retry."""
108-
max_retries = Retry(
109-
total=5, backoff_factor=10, status_forcelist=shared.STATUS_FORCELIST
110-
)
111-
session = requests.Session()
112-
session.mount("https://", HTTPAdapter(max_retries=max_retries))
113-
session.headers.update(
114-
{"accept": "application/json", "User-Agent": shared.USER_AGENT}
115-
)
116-
return session
117-
118-
119105
def simplify_legal_tool(legal_tool):
120106
"""Simplify license URLs into human-readable labels
121107
@@ -433,7 +419,7 @@ def main():
433419
"EUROPEANA_API_KEY not found in environment variables", 1
434420
)
435421

436-
session = get_requests_session()
422+
session = shared.get_session(accept_header="application/json")
437423

438424
# Fetch facet lists once, including counts
439425
providers_full = get_facet_list(session, "DATA_PROVIDER")

scripts/1-fetch/github_fetch.py

Lines changed: 6 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@
1717
from pygments import highlight
1818
from pygments.formatters import TerminalFormatter
1919
from pygments.lexers import PythonTracebackLexer
20-
from requests.adapters import HTTPAdapter
21-
from urllib3.util.retry import Retry
2220

2321
# Add parent directory so shared can be imported
2422
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -80,25 +78,6 @@ def check_for_completion():
8078
pass # File may not be found without --enable-save, etc.
8179

8280

83-
def get_requests_session():
84-
max_retries = Retry(
85-
total=5,
86-
backoff_factor=10,
87-
status_forcelist=shared.STATUS_FORCELIST,
88-
)
89-
session = requests.Session()
90-
session.mount("https://", HTTPAdapter(max_retries=max_retries))
91-
headers = {
92-
"accept": "application/vnd.github+json",
93-
"User-Agent": shared.USER_AGENT,
94-
}
95-
if GH_TOKEN:
96-
headers["authorization"] = f"Bearer {GH_TOKEN}"
97-
session.headers.update(headers)
98-
99-
return session
100-
101-
10281
def write_data(args, tool_data):
10382
if not args.enable_save:
10483
return args
@@ -162,7 +141,12 @@ def main():
162141
args = parse_arguments()
163142
shared.paths_log(LOGGER, PATHS)
164143
check_for_completion()
165-
session = get_requests_session()
144+
session = shared.get_session(
145+
accept_header="application/vnd.github+json",
146+
)
147+
if GH_TOKEN:
148+
session.headers.update({"authorization": f"Bearer {GH_TOKEN}"})
149+
166150
tool_data = query_github(args, session)
167151
args = write_data(args, tool_data)
168152
args = shared.git_add_and_commit(

scripts/1-fetch/openverse_fetch.py

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,6 @@
2525
from pygments import highlight
2626
from pygments.formatters import TerminalFormatter
2727
from pygments.lexers import PythonTracebackLexer
28-
from requests.adapters import HTTPAdapter
29-
from urllib3.util.retry import Retry
3028

3129
# Add parent directory so shared can be imported
3230
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -83,20 +81,6 @@ def parse_arguments():
8381
return args
8482

8583

86-
def get_requests_session():
87-
max_retries = Retry(
88-
total=5,
89-
backoff_factor=10,
90-
status_forcelist=shared.STATUS_FORCELIST,
91-
)
92-
session = requests.Session()
93-
session.mount("https://", HTTPAdapter(max_retries=max_retries))
94-
session.headers.update(
95-
{"accept": "application/json", "User-Agent": shared.USER_AGENT}
96-
)
97-
return session
98-
99-
10084
def get_all_sources_and_licenses(session, media_type):
10185
"""
10286
Fetch all available sources for a given media_type.
@@ -225,8 +209,8 @@ def write_data(args, data):
225209

226210
def main():
227211
args = parse_arguments()
228-
session = get_requests_session()
229212
LOGGER.info("Starting Openverse Fetch Script...")
213+
session = shared.get_session(accept_header="application/json")
230214
records = query_openverse(session)
231215
write_data(args, records)
232216
LOGGER.info(f"Fetched {len(records)} unique Openverse records.")

scripts/1-fetch/wikipedia_fetch.py

Lines changed: 2 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,9 @@
1313
from operator import itemgetter
1414

1515
# Third-party
16-
import requests
1716
from pygments import highlight
1817
from pygments.formatters import TerminalFormatter
1918
from pygments.lexers import PythonTracebackLexer
20-
from requests.adapters import HTTPAdapter
21-
from urllib3.util.retry import Retry
2219

2320
# Add parent directory so shared can be imported
2421
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
@@ -66,18 +63,6 @@ def parse_arguments():
6663
return args
6764

6865

69-
def get_requests_session():
70-
max_retries = Retry(
71-
total=5,
72-
backoff_factor=10,
73-
status_forcelist=shared.STATUS_FORCELIST,
74-
)
75-
session = requests.Session()
76-
session.mount("https://", HTTPAdapter(max_retries=max_retries))
77-
session.headers.update({"User-Agent": shared.USER_AGENT})
78-
return session
79-
80-
8166
def write_data(args, tool_data):
8267
if not args.enable_save:
8368
return args
@@ -173,7 +158,8 @@ def main():
173158
args = parse_arguments()
174159
shared.paths_log(LOGGER, PATHS)
175160
shared.git_fetch_and_merge(args, PATHS["repo"])
176-
tool_data = query_wikipedia_languages(get_requests_session())
161+
session = shared.get_session()
162+
tool_data = query_wikipedia_languages(session)
177163
args = write_data(args, tool_data)
178164
args = shared.git_add_and_commit(
179165
args,

scripts/shared.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
 # Third-party
 from git import InvalidGitRepositoryError, NoSuchPathError, Repo
 from pandas import PeriodIndex
+from requests import Session
+from requests.adapters import HTTPAdapter, Retry
 
 # Constants
 STATUS_FORCELIST = [
@@ -31,6 +33,25 @@ def __init__(self, message, exit_code=None):
         super().__init__(self.message)
 
 
+def get_session(accept_header=None):
+    """Create a reusable HTTP session with retry logic."""
+    session = Session()
+
+    retry_strategy = Retry(
+        total=5,
+        backoff_factor=10,
+        status_forcelist=STATUS_FORCELIST,
+    )
+    session.mount("https://", HTTPAdapter(max_retries=retry_strategy))
+
+    headers = {"User-Agent": USER_AGENT}
+    if accept_header:
+        headers["accept"] = accept_header
+    session.headers.update(headers)
+
+    return session
+
+
 def git_fetch_and_merge(args, repo_path, branch=None):
     if not args.enable_git:
         return

0 commit comments

Comments
 (0)