learningequality · the-ivii · Jan 23, 2025 · Jan 24, 2025 · Jan 24, 2025 · Jan 28, 2025
diff --git a/.gitignore b/.gitignore
@@ -136,3 +136,4 @@ video_cache_py3.sqlite
 cache.sqlite
 
 chefdata/
+audio_cache.sqlite
diff --git a/Pipfile b/Pipfile
@@ -0,0 +1,11 @@
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+
+[dev-packages]
+
+[requires]
+python_version = "3.12"
diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/audio_cache.sqlite b/audio_cache.sqlite
diff --git a/ricecooker/managers/tree.py b/ricecooker/managers/tree.py
@@ -144,6 +144,7 @@ def get_file_diff(self, files_to_diff):
                 if not exists
             ]
 
+
     def do_file_upload(self, filename):
         file_data = self.file_map[filename]
         if file_data.skip_upload:

diff --git a/ricecooker/utils/downloader.py b/ricecooker/utils/downloader.py
@@ -48,6 +48,22 @@
 }
 
 
+def configure_download_session(session, user_email=None):
+    """
+    Set a Ricecooker-identifying User-Agent header on a session.
+
+    The header value has the form ``Ricecooker/<version> bot (<contact>)``.
+
+    Args:
+        session: Object whose ``headers`` mapping is updated in place
+        user_email: Optional contact email placed inside the parentheses;
+            any falsy value selects the built-in default contact
+    """
+    import ricecooker
+
+    contact = user_email if user_email else '[email protected]'
+    product_token = "Ricecooker/" + ricecooker.__version__
+    header_value = f"{product_token} bot ({contact})"
+
+    session.headers.update({'User-Agent': header_value})
+
+
 USE_PYPPETEER = False
 
 # HACK ALERT! This is to allow ArchiveDownloader to be used from within link scraping.
@@ -197,7 +213,7 @@ def read(
 
 
 def make_request(
-    url, clear_cookies=False, headers=None, timeout=60, session=None, *args, **kwargs
+    url, clear_cookies=False, headers=None, timeout=60, session=None, user_email=None, *args, **kwargs
 ):
     sess = session or DOWNLOAD_SESSION
 
@@ -207,6 +223,9 @@ def make_request(
     retry_count = 0
     max_retries = 5
     request_headers = DEFAULT_HEADERS
+
+    configure_download_session(sess, user_email)
+
     if headers:
         request_headers = copy.copy(DEFAULT_HEADERS)
         request_headers.update(headers)

diff --git a/setup.py b/setup.py
@@ -26,6 +26,7 @@
         "console_scripts": [
             "corrections = ricecooker.utils.corrections:correctionsmain",
             "jiro = ricecooker.cli:main",
+            "ricecooker = ricecooker.cli:main",
         ]
     },
     include_package_data=True,

diff --git a/tests/test_downloader.py b/tests/test_downloader.py
@@ -1,7 +1,13 @@
 import os
 import unittest
+import timeit
+import requests
+import ricecooker
+import pytest
 
 from ricecooker.utils import downloader
+from ricecooker.utils.downloader import make_request
+from ricecooker.utils.downloader import configure_download_session
 
 
 class TestArchiver(unittest.TestCase):
@@ -70,3 +76,66 @@ def test_archive_path_as_relative_url(self):
             link_filename, page_filename
         )
         assert rel_path == "../kolibri_1.2.3.png"
+
+
+def test_useragent_generation():
+    """Check the exact User-Agent value with and without an explicit email."""
+    version = ricecooker.__version__
+
+    # No email supplied: the default contact address is embedded.
+    default_session = requests.Session()
+    configure_download_session(default_session)
+    assert (
+        default_session.headers['User-Agent']
+        == f"Ricecooker/{version} bot ([email protected])"
+    )
+
+    # Email supplied: that address is embedded instead.
+    custom_email = "[email protected]"
+    custom_session = requests.Session()
+    configure_download_session(custom_session, user_email=custom_email)
+    assert (
+        custom_session.headers['User-Agent']
+        == f"Ricecooker/{version} bot ({custom_email})"
+    )
+
+
+def test_request_retry_logic():
+    """Expect make_request to raise a requests RequestException for a bad URL."""
+    # NOTE(review): despite the name, this only asserts that an exception is
+    # raised; it does not inspect how many attempts were made.
+    request_options = {"user_email": "[email protected]", "timeout": 1}
+
+    with pytest.raises(requests.exceptions.RequestException):
+        make_request("http://non-existent-url.test", **request_options)
+
+
+def test_performance_overhead():
+    """
+    Measure performance impact of User-Agent header generation
+
+    Times 100 header configurations with the default contact against 100
+    with an explicit email and asserts that the explicit-email run is not
+    more than 0.01 seconds slower in total.
+    """
+    # Everything lives inside the test so that importing or collecting this
+    # module has no side effects, and the timing targets only the header
+    # configuration on a local session -- no network traffic is involved.
+    session = requests.Session()
+
+    def baseline_request():
+        configure_download_session(session)
+
+    def custom_email_request():
+        configure_download_session(session, user_email="[email protected]")
+
+    baseline_time = timeit.timeit(baseline_request, number=100)
+    custom_email_time = timeit.timeit(custom_email_request, number=100)
+
+    assert custom_email_time - baseline_time < 0.01
+
+
+def test_useragent_content_validation():
+    """
+    Check the individual components of the generated User-Agent header
+    """
+    contact_email = "[email protected]"
+    sess = requests.Session()
+    configure_download_session(sess, user_email=contact_email)
+    header_value = sess.headers['User-Agent']
+
+    # Product token, package version and contact email must all be present.
+    for fragment in ("Ricecooker/", ricecooker.__version__, contact_email):
+        assert fragment in header_value
+
+    # The product token must lead the header, and the "bot" marker must appear.
+    assert header_value.startswith("Ricecooker/")
+    assert "bot" in header_value
Original file line number	Diff line number	Diff line change
Expand Up		@@ -136,3 +136,4 @@ video_cache_py3.sqlite
		cache.sqlite

		chefdata/
		audio_cache.sqlite
Copy link Member rtibbles Feb 8, 2025 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. This has been added in develop from your other PR - you are still committing the file itself here as well, so that needs to be removed from the commit history.