From 26ec2a21f64401a631e47aeceb1bf79c987aadf9 Mon Sep 17 00:00:00 2001
From: Mukesh-Ghildiyal <mukeshghildiyal435@gmail.com>
Date: Wed, 29 Oct 2025 22:20:43 +0530
Subject: [PATCH] internet archive

---
 data/ia_license_mapping.csv                  | 108 ++++
 scripts/1-fetch/internetarchive_fetch.py     | 491 ++++++++++++++
 scripts/2-process/internetarchive_process.py | 484 ++++++++++++++
 scripts/3-report/internetarchive_report.py   | 647 +++++++++++++++++++
 test_ia_pipeline.py                          | 172 +++++
 5 files changed, 1902 insertions(+)
 create mode 100644 data/ia_license_mapping.csv
 create mode 100644 scripts/1-fetch/internetarchive_fetch.py
 create mode 100644 scripts/2-process/internetarchive_process.py
 create mode 100644 scripts/3-report/internetarchive_report.py
 create mode 100644 test_ia_pipeline.py

diff --git a/data/ia_license_mapping.csv b/data/ia_license_mapping.csv
new file mode 100644
index 00000000..97646dff
--- /dev/null
+++ b/data/ia_license_mapping.csv
@@ -0,0 +1,108 @@
+IA_LICENSE_URL,NORMALIZED_LICENSE
+http://creativecommons.org/licenses/by/4.0/,CC BY 4.0
+https://creativecommons.org/licenses/by/4.0/,CC BY 4.0
+http://creativecommons.org/licenses/by-sa/4.0/,CC BY-SA 4.0
+https://creativecommons.org/licenses/by-sa/4.0/,CC BY-SA 4.0
+http://creativecommons.org/licenses/by-nc/4.0/,CC BY-NC 4.0
+https://creativecommons.org/licenses/by-nc/4.0/,CC BY-NC 4.0
+http://creativecommons.org/licenses/by-nc-sa/4.0/,CC BY-NC-SA 4.0
+https://creativecommons.org/licenses/by-nc-sa/4.0/,CC BY-NC-SA 4.0
+http://creativecommons.org/licenses/by-nd/4.0/,CC BY-ND 4.0
+https://creativecommons.org/licenses/by-nd/4.0/,CC BY-ND 4.0
+http://creativecommons.org/licenses/by-nc-nd/4.0/,CC BY-NC-ND 4.0
+https://creativecommons.org/licenses/by-nc-nd/4.0/,CC BY-NC-ND 4.0
+http://creativecommons.org/licenses/by/3.0/,CC BY 3.0
+https://creativecommons.org/licenses/by/3.0/,CC BY 3.0
+http://creativecommons.org/licenses/by-sa/3.0/,CC BY-SA 3.0
+https://creativecommons.org/licenses/by-sa/3.0/,CC BY-SA 3.0
+http://creativecommons.org/licenses/by-nc/3.0/,CC BY-NC 3.0
+https://creativecommons.org/licenses/by-nc/3.0/,CC BY-NC 3.0
+http://creativecommons.org/licenses/by-nc-sa/3.0/,CC BY-NC-SA 3.0
+https://creativecommons.org/licenses/by-nc-sa/3.0/,CC BY-NC-SA 3.0
+http://creativecommons.org/licenses/by-nd/3.0/,CC BY-ND 3.0
+https://creativecommons.org/licenses/by-nd/3.0/,CC BY-ND 3.0
+http://creativecommons.org/licenses/by-nc-nd/3.0/,CC BY-NC-ND 3.0
+https://creativecommons.org/licenses/by-nc-nd/3.0/,CC BY-NC-ND 3.0
+http://creativecommons.org/licenses/by/2.5/,CC BY 2.5
+https://creativecommons.org/licenses/by/2.5/,CC BY 2.5
+http://creativecommons.org/licenses/by-sa/2.5/,CC BY-SA 2.5
+https://creativecommons.org/licenses/by-sa/2.5/,CC BY-SA 2.5
+http://creativecommons.org/licenses/by-nc/2.5/,CC BY-NC 2.5
+https://creativecommons.org/licenses/by-nc/2.5/,CC BY-NC 2.5
+http://creativecommons.org/licenses/by-nc-sa/2.5/,CC BY-NC-SA 2.5
+https://creativecommons.org/licenses/by-nc-sa/2.5/,CC BY-NC-SA 2.5
+http://creativecommons.org/licenses/by-nd/2.5/,CC BY-ND 2.5
+https://creativecommons.org/licenses/by-nd/2.5/,CC BY-ND 2.5
+http://creativecommons.org/licenses/by-nc-nd/2.5/,CC BY-NC-ND 2.5
+https://creativecommons.org/licenses/by-nc-nd/2.5/,CC BY-NC-ND 2.5
+http://creativecommons.org/licenses/by/2.1/,CC BY 2.1
+https://creativecommons.org/licenses/by/2.1/,CC BY 2.1
+http://creativecommons.org/licenses/by-sa/2.1/,CC BY-SA 2.1
+https://creativecommons.org/licenses/by-sa/2.1/,CC BY-SA 2.1
+http://creativecommons.org/licenses/by-nc/2.1/,CC BY-NC 2.1
+https://creativecommons.org/licenses/by-nc/2.1/,CC BY-NC 2.1
+http://creativecommons.org/licenses/by-nc-sa/2.1/,CC BY-NC-SA 2.1
+https://creativecommons.org/licenses/by-nc-sa/2.1/,CC BY-NC-SA 2.1
+http://creativecommons.org/licenses/by-nd/2.1/,CC BY-ND 2.1
+https://creativecommons.org/licenses/by-nd/2.1/,CC BY-ND 2.1
+http://creativecommons.org/licenses/by-nc-nd/2.1/,CC BY-NC-ND 2.1
+https://creativecommons.org/licenses/by-nc-nd/2.1/,CC BY-NC-ND 2.1
+http://creativecommons.org/licenses/by/2.0/,CC BY 2.0
+https://creativecommons.org/licenses/by/2.0/,CC BY 2.0
+http://creativecommons.org/licenses/by-sa/2.0/,CC BY-SA 2.0
+https://creativecommons.org/licenses/by-sa/2.0/,CC BY-SA 2.0
+http://creativecommons.org/licenses/by-nc/2.0/,CC BY-NC 2.0
+https://creativecommons.org/licenses/by-nc/2.0/,CC BY-NC 2.0
+http://creativecommons.org/licenses/by-nc-sa/2.0/,CC BY-NC-SA 2.0
+https://creativecommons.org/licenses/by-nc-sa/2.0/,CC BY-NC-SA 2.0
+http://creativecommons.org/licenses/by-nd/2.0/,CC BY-ND 2.0
+https://creativecommons.org/licenses/by-nd/2.0/,CC BY-ND 2.0
+http://creativecommons.org/licenses/by-nc-nd/2.0/,CC BY-NC-ND 2.0
+https://creativecommons.org/licenses/by-nc-nd/2.0/,CC BY-NC-ND 2.0
+http://creativecommons.org/licenses/by/1.0/,CC BY 1.0
+https://creativecommons.org/licenses/by/1.0/,CC BY 1.0
+http://creativecommons.org/licenses/by-sa/1.0/,CC BY-SA 1.0
+https://creativecommons.org/licenses/by-sa/1.0/,CC BY-SA 1.0
+http://creativecommons.org/licenses/by-nc/1.0/,CC BY-NC 1.0
+https://creativecommons.org/licenses/by-nc/1.0/,CC BY-NC 1.0
+http://creativecommons.org/licenses/by-nc-sa/1.0/,CC BY-NC-SA 1.0
+https://creativecommons.org/licenses/by-nc-sa/1.0/,CC BY-NC-SA 1.0
+http://creativecommons.org/licenses/by-nd/1.0/,CC BY-ND 1.0
+https://creativecommons.org/licenses/by-nd/1.0/,CC BY-ND 1.0
+http://creativecommons.org/licenses/by-nc-nd/1.0/,CC BY-NC-ND 1.0
+https://creativecommons.org/licenses/by-nc-nd/1.0/,CC BY-NC-ND 1.0
+http://creativecommons.org/publicdomain/zero/1.0/,CC0 1.0
+https://creativecommons.org/publicdomain/zero/1.0/,CC0 1.0
+http://creativecommons.org/publicdomain/mark/1.0/,PDM 1.0
+https://creativecommons.org/publicdomain/mark/1.0/,PDM 1.0
+http://www.gnu.org/licenses/gpl-3.0.html,GPL-3.0
+https://www.gnu.org/licenses/gpl-3.0.html,GPL-3.0
+http://www.gnu.org/licenses/agpl-3.0.html,AGPL-3.0
+https://www.gnu.org/licenses/agpl-3.0.html,AGPL-3.0
+http://www.gnu.org/licenses/lgpl-3.0.html,LGPL-3.0
+https://www.gnu.org/licenses/lgpl-3.0.html,LGPL-3.0
+http://www.gnu.org/licenses/gpl-2.0.html,GPL-2.0
+https://www.gnu.org/licenses/gpl-2.0.html,GPL-2.0
+http://www.gnu.org/licenses/lgpl-2.1.html,LGPL-2.1
+https://www.gnu.org/licenses/lgpl-2.1.html,LGPL-2.1
+http://opensource.org/licenses/MIT,MIT
+https://opensource.org/licenses/MIT,MIT
+http://opensource.org/licenses/Apache-2.0,Apache-2.0
+https://opensource.org/licenses/Apache-2.0,Apache-2.0
+http://opensource.org/licenses/BSD-3-Clause,BSD-3-Clause
+https://opensource.org/licenses/BSD-3-Clause,BSD-3-Clause
+http://opensource.org/licenses/BSD-2-Clause,BSD-2-Clause
+https://opensource.org/licenses/BSD-2-Clause,BSD-2-Clause
+http://opensource.org/licenses/ISC,ISC
+https://opensource.org/licenses/ISC,ISC
+http://opensource.org/licenses/Unlicense,Unlicense
+https://opensource.org/licenses/Unlicense,Unlicense
+http://www.opensource.org/licenses/Artistic-2.0,Artistic-2.0
+https://www.opensource.org/licenses/Artistic-2.0,Artistic-2.0
+http://www.opensource.org/licenses/MPL-2.0,MPL-2.0
+https://www.opensource.org/licenses/MPL-2.0,MPL-2.0
+http://www.opensource.org/licenses/EPL-2.0,EPL-2.0
+https://www.opensource.org/licenses/EPL-2.0,EPL-2.0
+http://www.opensource.org/licenses/CPL-1.0,CPL-1.0
+https://www.opensource.org/licenses/CPL-1.0,CPL-1.0
+
diff --git a/scripts/1-fetch/internetarchive_fetch.py b/scripts/1-fetch/internetarchive_fetch.py
new file mode 100644
index 00000000..2ab82224
--- /dev/null
+++ b/scripts/1-fetch/internetarchive_fetch.py
@@ -0,0 +1,491 @@
+#!/usr/bin/env python
+"""
+Fetch CC Legal Tool usage data from Internet Archive (IA) API.
+"""
+# Standard library
+import argparse
+import csv
+import os
+import sys
+import textwrap
+import time
+import traceback
+import urllib.parse
+from copy import copy
+
+# Third-party
+import requests
+from pygments import highlight
+from pygments.formatters import TerminalFormatter
+from pygments.lexers import PythonTracebackLexer
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+# Add parent directory so shared can be imported
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
+
+# First-party/Local
+import shared  # noqa: E402
+
+# Setup: logger and paths dictionary for this script
+LOGGER, PATHS = shared.setup(__file__)
+
+# Constants: search endpoint, output CSV paths and headers, quarter name
+BASE_URL = "https://archive.org/advancedsearch.php"
+FILE1_COUNT = shared.path_join(PATHS["data_phase"], "ia_1_count.csv")
+FILE2_LANGUAGE = shared.path_join(PATHS["data_phase"], "ia_2_count_by_language.csv")
+FILE3_COUNTRY = shared.path_join(PATHS["data_phase"], "ia_3_count_by_country.csv")
+HEADER1_COUNT = ["LICENSE_URL", "NORMALIZED_LICENSE", "COUNT"]
+HEADER2_LANGUAGE = ["LICENSE_URL", "NORMALIZED_LICENSE", "LANGUAGE", "COUNT"]
+HEADER3_COUNTRY = ["LICENSE_URL", "NORMALIZED_LICENSE", "COUNTRY", "COUNT"]
+QUARTER = os.path.basename(PATHS["data_quarter"])
+
+# Runs at import time, before the command-line options are parsed
+LOGGER.info("Script execution started.")
+
+
+def parse_arguments():
+    """
+    Define the command-line interface and return the parsed namespace.
+
+    Exits through the parser's error() when --enable-git is given without
+    --enable-save.
+    """
+    LOGGER.info("Parsing command-line options")
+    cli = argparse.ArgumentParser(description=__doc__)
+    # Integer options: (flag, default, help)
+    integer_options = (
+        ("--limit", 1000, "Limit items per query (default: 1000)"),
+        (
+            "--max-items",
+            100000,
+            "Maximum total items to process (default: 100000)",
+        ),
+    )
+    for flag, default, help_text in integer_options:
+        cli.add_argument(flag, type=int, default=default, help=help_text)
+
+    # Boolean switches: (flag, help)
+    switches = (
+        ("--enable-save", "Enable saving results"),
+        (
+            "--enable-git",
+            "Enable git actions (fetch, merge, add, commit, and push)",
+        ),
+        ("--dev", "Development mode: avoid hitting API (generate fake data)"),
+    )
+    for flag, help_text in switches:
+        cli.add_argument(flag, action="store_true", help=help_text)
+
+    args = cli.parse_args()
+    if args.enable_git and not args.enable_save:
+        cli.error("--enable-git requires --enable-save")
+    return args
+
+
+def get_requests_session():
+    """
+    Build a requests session whose HTTPS adapter retries failed requests.
+    """
+    LOGGER.info("Setting up requests session with retry logic")
+    retry_policy = Retry(
+        total=5,
+        backoff_factor=2,
+        status_forcelist=shared.STATUS_FORCELIST,
+    )
+    https_adapter = HTTPAdapter(max_retries=retry_policy)
+
+    http = requests.Session()
+    http.mount("https://", https_adapter)
+    http.headers["User-Agent"] = shared.USER_AGENT
+    http.headers["Accept"] = "application/json"
+    return http
+
+
+def load_license_mapping():
+    """
+    Read ia_license_mapping.csv into a dict of license URL -> normalized name.
+
+    When the file is missing, a default mapping file is written first and
+    then read back.
+    """
+    mapping_file = shared.path_join(PATHS["data"], "ia_license_mapping.csv")
+
+    if not os.path.exists(mapping_file):
+        LOGGER.warning(f"License mapping file not found: {mapping_file}")
+        LOGGER.info("Creating default license mapping file")
+        create_default_license_mapping(mapping_file)
+
+    # The pre-existing file and the freshly created one are read the same
+    # way; later rows win if a URL is listed twice
+    with open(mapping_file, "r", newline="", encoding="utf-8") as csv_file:
+        license_mapping = {
+            row["IA_LICENSE_URL"]: row["NORMALIZED_LICENSE"]
+            for row in csv.DictReader(csv_file, dialect="unix")
+        }
+
+    LOGGER.info(f"Loaded {len(license_mapping)} license mappings")
+    return license_mapping
+
+
+def create_default_license_mapping(mapping_file):
+    """
+    Write the built-in license mapping to mapping_file as a CSV.
+
+    Every license is listed twice, once under its http:// URL and once
+    under its https:// URL. The parent directory of mapping_file is
+    created when it does not exist.
+    """
+    os.makedirs(os.path.dirname(mapping_file), exist_ok=True)
+
+    # (URL without scheme, normalized name); the list order is the row order
+    # of the written file
+    licenses = [
+        # Creative Commons licenses
+        ("creativecommons.org/licenses/by/4.0/", "CC BY 4.0"),
+        ("creativecommons.org/licenses/by-sa/4.0/", "CC BY-SA 4.0"),
+        ("creativecommons.org/licenses/by-nc/4.0/", "CC BY-NC 4.0"),
+        ("creativecommons.org/licenses/by-nc-sa/4.0/", "CC BY-NC-SA 4.0"),
+        ("creativecommons.org/licenses/by-nd/4.0/", "CC BY-ND 4.0"),
+        ("creativecommons.org/licenses/by-nc-nd/4.0/", "CC BY-NC-ND 4.0"),
+
+        # CC0 and Public Domain
+        ("creativecommons.org/publicdomain/zero/1.0/", "CC0 1.0"),
+        ("creativecommons.org/publicdomain/mark/1.0/", "PDM 1.0"),
+
+        # Version 3.0 licenses
+        ("creativecommons.org/licenses/by/3.0/", "CC BY 3.0"),
+        ("creativecommons.org/licenses/by-sa/3.0/", "CC BY-SA 3.0"),
+        ("creativecommons.org/licenses/by-nc/3.0/", "CC BY-NC 3.0"),
+        ("creativecommons.org/licenses/by-nc-sa/3.0/", "CC BY-NC-SA 3.0"),
+        ("creativecommons.org/licenses/by-nd/3.0/", "CC BY-ND 3.0"),
+        ("creativecommons.org/licenses/by-nc-nd/3.0/", "CC BY-NC-ND 3.0"),
+
+        # Other open licenses
+        ("www.gnu.org/licenses/gpl-3.0.html", "GPL-3.0"),
+        ("www.gnu.org/licenses/agpl-3.0.html", "AGPL-3.0"),
+        ("www.gnu.org/licenses/lgpl-3.0.html", "LGPL-3.0"),
+        ("opensource.org/licenses/MIT", "MIT"),
+        ("opensource.org/licenses/Apache-2.0", "Apache-2.0"),
+        ("opensource.org/licenses/BSD-3-Clause", "BSD-3-Clause"),
+    ]
+
+    # Expand each entry into its http:// row followed by its https:// row
+    default_mappings = [
+        {
+            "IA_LICENSE_URL": f"{scheme}://{address}",
+            "NORMALIZED_LICENSE": name,
+        }
+        for address, name in licenses
+        for scheme in ("http", "https")
+    ]
+
+    with open(mapping_file, "w", newline="", encoding="utf-8") as csv_file:
+        out = csv.DictWriter(
+            csv_file,
+            fieldnames=["IA_LICENSE_URL", "NORMALIZED_LICENSE"],
+            dialect="unix",
+        )
+        out.writeheader()
+        out.writerows(default_mappings)
+
+    LOGGER.info(f"Created default license mapping file: {mapping_file}")
+
+
+def initialize_data_files(args):
+    """
+    Create the phase data directory and any missing output CSV files.
+
+    A new file receives only its header row; existing files are left
+    untouched. Does nothing unless saving is enabled.
+    """
+    if not args.enable_save:
+        return
+
+    # Create data directory for this phase
+    os.makedirs(PATHS["data_phase"], exist_ok=True)
+
+    # Count, language, and country files, in that order
+    outputs = (
+        (FILE1_COUNT, HEADER1_COUNT),
+        (FILE2_LANGUAGE, HEADER2_LANGUAGE),
+        (FILE3_COUNTRY, HEADER3_COUNTRY),
+    )
+
+    for csv_path, header in outputs:
+        # A file that already exists is skipped, never rewritten
+        if os.path.isfile(csv_path):
+            continue
+        with open(csv_path, "w", newline="", encoding="utf-8") as csv_file:
+            out = csv.DictWriter(csv_file, fieldnames=header, dialect="unix")
+            out.writeheader()
+
+
+def query_ia_api(args, session, license_mapping, offset=0):
+    """
+    Fetch one page of licensed items from the IA advanced search API.
+
+    Always returns (docs, total_found); ([], 0) signals a request error or
+    an unexpected response. In --dev mode fake documents are generated.
+    """
+    LOGGER.info(f"Querying IA API with offset {offset}")
+    if args.dev:
+        LOGGER.info("Development mode: generating fake data")
+        return generate_fake_data(license_mapping, args.limit)
+
+    # NOTE(review): advancedsearch.php reportedly rejects paging past 10000
+    # results; larger --max-items values may need the IA scraping API
+    query_params = {
+        "q": "licenseurl:creativecommons.org OR licenseurl:gnu.org OR licenseurl:opensource.org",
+        "fl": "identifier,licenseurl,language,country,mediatype",
+        "rows": args.limit,
+        "start": offset,
+        "output": "json",
+        "sort": "identifier asc",
+    }
+    try:
+        response = session.get(BASE_URL, params=query_params, timeout=30)
+        response.raise_for_status()
+        data = response.json()
+    except (requests.exceptions.RequestException, ValueError) as e:
+        LOGGER.error(f"Error querying IA API: {e}")
+        return [], 0
+
+    if "response" not in data:
+        # Callers unpack two values, so this path must return a pair too
+        LOGGER.error(f"Unexpected API response structure: {data}")
+        return [], 0
+
+    docs = data["response"].get("docs", [])
+    total_found = data["response"].get("numFound", 0)
+    LOGGER.info(f"Retrieved {len(docs)} items (total found: {total_found})")
+    return docs, total_found
+
+
+def generate_fake_data(license_mapping, count):
+    """
+    Return (docs, count) holding count random IA-like documents.
+    """
+    import random
+
+    # Built once here instead of once per generated document
+    license_urls = list(license_mapping)
+    fake_languages = ["en", "es", "fr", "de", "it", "pt", "ru", "zh", "ja", "ar"]
+    fake_countries = ["US", "GB", "CA", "AU", "DE", "FR", "IT", "ES", "BR", "MX"]
+    media_types = ["texts", "audio", "video", "image", "software"]
+    docs = []
+    for i in range(count):
+        docs.append({
+            "identifier": f"fake_item_{i:06d}",
+            "licenseurl": random.choice(license_urls),
+            "language": random.choice(fake_languages),
+            "country": random.choice(fake_countries),
+            "mediatype": random.choice(media_types),
+        })
+    return docs, count
+
+
+def normalize_license(license_url, license_mapping):
+    """
+    Return the normalized license name for license_url ("Unknown" if empty).
+
+    Lookup order: exact key of license_mapping; a mapped URL contained in
+    license_url (e.g. a trailing "legalcode" or jurisdiction path); finally
+    the last non-empty path segment of license_url itself.
+    """
+    if not license_url:
+        return "Unknown"
+    if license_url in license_mapping:
+        return license_mapping[license_url]
+    # One-directional on purpose: also testing "license_url in ia_url" made
+    # fragments such as "http://creativecommons.org/" match the first entry
+    for ia_url, normalized in license_mapping.items():
+        if ia_url and ia_url in license_url:
+            return normalized
+    return license_url.rstrip("/").rsplit("/", 1)[-1]
+
+
+def process_ia_data(args, docs, license_mapping):
+    """
+    Count one batch of IA documents by license, language, and country.
+
+    The counts are handed to the write_* helpers; returns len(docs).
+    """
+    LOGGER.info(f"Processing {len(docs)} IA items")
+
+    def as_text(value, default):
+        # Multi-valued IA metadata fields may arrive as lists, which cannot
+        # be part of a dict key; join them into a single string
+        if isinstance(value, (list, tuple)):
+            value = "; ".join(str(item) for item in value)
+        return value or default
+
+    license_counts = {}
+    language_counts = {}
+    country_counts = {}
+
+    for doc in docs:
+        license_url = as_text(doc.get("licenseurl"), "")
+        normalized_license = normalize_license(license_url, license_mapping)
+        language = as_text(doc.get("language"), "Unknown")
+        country = as_text(doc.get("country"), "Unknown")
+        key = (license_url, normalized_license)
+        license_counts[key] = license_counts.get(key, 0) + 1
+        lang_key = key + (language,)
+        language_counts[lang_key] = language_counts.get(lang_key, 0) + 1
+        country_key = key + (country,)
+        country_counts[country_key] = country_counts.get(country_key, 0) + 1
+
+    write_license_data(args, license_counts)
+    write_language_data(args, language_counts)
+    write_country_data(args, country_counts)
+    return len(docs)
+
+
+def write_license_data(args, license_counts):
+    """
+    Append the per-license counts to FILE1_COUNT.
+
+    Does nothing unless saving is enabled.
+    """
+    if not args.enable_save:
+        return
+    rows = [
+        {"LICENSE_URL": url, "NORMALIZED_LICENSE": name, "COUNT": total}
+        for (url, name), total in license_counts.items()
+    ]
+    with open(FILE1_COUNT, "a", newline="", encoding="utf-8") as csv_file:
+        out = csv.DictWriter(csv_file, fieldnames=HEADER1_COUNT, dialect="unix")
+        out.writerows(rows)
+
+
+def write_language_data(args, language_counts):
+    """
+    Append the per-license, per-language counts to FILE2_LANGUAGE.
+
+    Does nothing unless saving is enabled.
+    """
+    if not args.enable_save:
+        return
+    rows = [
+        {"LICENSE_URL": url, "NORMALIZED_LICENSE": name,
+         "LANGUAGE": language, "COUNT": total}
+        for (url, name, language), total in language_counts.items()
+    ]
+    with open(FILE2_LANGUAGE, "a", newline="", encoding="utf-8") as csv_file:
+        out = csv.DictWriter(csv_file, fieldnames=HEADER2_LANGUAGE, dialect="unix")
+        out.writerows(rows)
+
+
+def write_country_data(args, country_counts):
+    """
+    Append the per-license, per-country counts to FILE3_COUNTRY.
+
+    Does nothing unless saving is enabled.
+    """
+    if not args.enable_save:
+        return
+    rows = [
+        {"LICENSE_URL": url, "NORMALIZED_LICENSE": name,
+         "COUNTRY": country, "COUNT": total}
+        for (url, name, country), total in country_counts.items()
+    ]
+    with open(FILE3_COUNTRY, "a", newline="", encoding="utf-8") as csv_file:
+        out = csv.DictWriter(csv_file, fieldnames=HEADER3_COUNTRY, dialect="unix")
+        out.writerows(rows)
+
+
+def main():
+    """
+    Fetch IA search results in batches, aggregate them, and run git actions.
+
+    Fetching stops once --max-items items have been processed, a query
+    returns no documents, or a query returns fewer documents than asked for.
+    """
+    args = parse_arguments()
+    shared.paths_log(LOGGER, PATHS)
+
+    license_mapping = load_license_mapping()
+    initialize_data_files(args)
+    session = get_requests_session()
+
+    # Fetch and process data
+    total_processed = 0
+    offset = 0
+
+    while total_processed < args.max_items:
+        remaining = args.max_items - total_processed
+        current_limit = min(args.limit, remaining)
+        LOGGER.info(f"Processing batch: offset={offset}, limit={current_limit}")
+
+        # Dev mode is handled inside query_ia_api, so one call covers both
+        docs, _total_found = query_ia_api(args, session, license_mapping, offset)
+
+        if not docs:
+            LOGGER.info("No more data available")
+            break
+
+        # A query always asks for args.limit rows; trim the final batch so
+        # the overall total never exceeds --max-items
+        fetched = len(docs)
+        docs = docs[:current_limit]
+
+        # Process the batch
+        batch_processed = process_ia_data(args, docs, license_mapping)
+        total_processed += batch_processed
+
+        LOGGER.info(f"Processed {batch_processed} items (total: {total_processed})")
+
+        # Check if we've reached the end
+        if fetched < current_limit:
+            LOGGER.info("Reached end of available data")
+            break
+
+        # Update offset for next batch
+        offset += len(docs)
+
+        # Rate limiting
+        if not args.dev:
+            time.sleep(1)  # Be respectful to the API
+
+    LOGGER.info(f"Data collection completed. Total items processed: {total_processed}")
+
+    # Git operations
+    shared.git_fetch_and_merge(args, PATHS["repo"])
+    args = shared.git_add_and_commit(
+        args,
+        PATHS["repo"],
+        PATHS["data_quarter"],
+        f"Add and commit new Internet Archive (IA) data for {QUARTER}",
+    )
+    shared.git_push_changes(args, PATHS["repo"])
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except shared.QuantifyingException as exc:
+        # Project exception: exit code 0 is logged as information, any
+        # other code as an error
+        report = LOGGER.info if exc.exit_code == 0 else LOGGER.error
+        report(exc.message)
+        sys.exit(exc.exit_code)
+    except SystemExit as exc:
+        # Log non-zero exit codes, then exit with the same code
+        if exc.code != 0:
+            LOGGER.error(f"System exit with code: {exc.code}")
+        sys.exit(exc.code)
+    except KeyboardInterrupt:
+        LOGGER.info("(130) Halted via KeyboardInterrupt.")
+        sys.exit(130)
+    except Exception:
+        # Last-resort handler: log a syntax-highlighted, indented traceback
+        colored = highlight(
+            traceback.format_exc(),
+            PythonTracebackLexer(),
+            TerminalFormatter(),
+        )
+        traceback_formatted = textwrap.indent(colored, "    ")
+        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
+        sys.exit(1)
+
diff --git a/scripts/2-process/internetarchive_process.py b/scripts/2-process/internetarchive_process.py
new file mode 100644
index 00000000..6c7b2e1e
--- /dev/null
+++ b/scripts/2-process/internetarchive_process.py
@@ -0,0 +1,484 @@
+#!/usr/bin/env python
+"""
+Process Internet Archive (IA) data.
+"""
+# Standard library
+import argparse
+import csv
+import os
+import sys
+import textwrap
+import traceback
+
+# Third-party
+import pandas as pd
+from pygments import highlight
+from pygments.formatters import TerminalFormatter
+from pygments.lexers import PythonTracebackLexer
+
+# Add parent directory so shared can be imported
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
+
+# First-party/Local
+import shared  # noqa: E402
+
+# Setup
+LOGGER, PATHS = shared.setup(__file__)
+
+# Constants
+QUARTER = os.path.basename(PATHS["data_quarter"])
+
+
+def parse_arguments():
+    """
+    Define the command-line interface, parse it, and return the namespace
+    with the module logger and paths attached.
+    """
+    global PATHS
+    LOGGER.info("Parsing command-line options")
+    boolean_flags = {
+        "--enable-save": "Enable saving results (default: False)",
+        "--enable-git": "Enable git actions such as fetch, merge, add, commit,"
+        " and push (default: False)",
+    }
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--quarter",
+        default=QUARTER,
+        help=f"Data quarter in format YYYYQx (default: {QUARTER})",
+    )
+    # Registered in insertion order, so --help lists them as before
+    for flag, help_text in boolean_flags.items():
+        parser.add_argument(flag, action="store_true", help=help_text)
+    args = parser.parse_args()
+    # --enable-git is only accepted together with --enable-save
+    if args.enable_git and not args.enable_save:
+        parser.error("--enable-git requires --enable-save")
+    # Non-default quarter: rebind PATHS to what shared.paths_update returns
+    if args.quarter != QUARTER:
+        PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter)
+    args.logger = LOGGER
+    args.paths = PATHS
+    return args
+
+
+def data_to_csv(args, data, file_path):
+    """
+    Write data to file_path as CSV, but only when saving is enabled.
+    """
+    if args.enable_save:
+        os.makedirs(PATHS["data_phase"], exist_ok=True)
+        # emulate csv.unix_dialect
+        unix_like = {"quoting": csv.QUOTE_ALL, "lineterminator": "\n"}
+        data.to_csv(file_path, index=False, **unix_like)
+    # Without --enable-save nothing is written
+    return None
+
+
+def process_license_totals(args, count_data):
+    """
+    Processing count data: totals by normalized license
+    """
+    LOGGER.info(process_license_totals.__doc__.strip())
+
+    # One row per normalized license, ordered by summed count (descending)
+    grouped = count_data.groupby(["NORMALIZED_LICENSE"], as_index=False)
+    totals = grouped["COUNT"].sum().sort_values("COUNT", ascending=False)
+    data = totals.reset_index(drop=True)
+
+    file_path = shared.path_join(PATHS["data_phase"], "ia_license_totals.csv")
+    data_to_csv(args, data, file_path)
+
+
+def process_cc_license_totals(args, count_data):
+    """
+    Processing count data: totals by CC license categories
+    """
+    LOGGER.info(process_cc_license_totals.__doc__.strip())
+
+    # Keep only rows whose normalized name contains "CC" (missing names are
+    # dropped via na=False).
+    # NOTE(review): a name such as "PDM 1.0" has no "CC" in it and would be
+    # filtered out here, leaving the PDM bucket at zero -- confirm intended.
+    is_cc = count_data["NORMALIZED_LICENSE"].str.contains("CC", na=False)
+    cc_data = count_data[is_cc]
+
+    # Product buckets, in the order they are written to the CSV
+    data = {
+        "Licenses version 4.0": 0,
+        "Licenses version 3.0": 0,
+        "Licenses version 2.x": 0,
+        "Licenses version 1.0": 0,
+        "CC0 1.0": 0,
+        "Public Domain Mark 1.0": 0,
+        "Other CC licenses": 0,
+    }
+    # Version markers are tested in this order; the first one found wins
+    version_buckets = (
+        ("4.0", "Licenses version 4.0"),
+        ("3.0", "Licenses version 3.0"),
+        ("2.", "Licenses version 2.x"),
+        ("1.0", "Licenses version 1.0"),
+    )
+    # Columns are read by name: read_csv(usecols=...) keeps the file's column
+    # order, so positional access (row[1], row[2]) could pick the wrong field
+    names_and_counts = zip(cc_data["NORMALIZED_LICENSE"], cc_data["COUNT"])
+    for license_name, count in names_and_counts:
+        if license_name.startswith("PDM"):
+            key = "Public Domain Mark 1.0"
+        elif "CC0" in license_name:
+            key = "CC0 1.0"
+        else:
+            matches = (b for v, b in version_buckets if v in license_name)
+            key = next(matches, "Other CC licenses")
+        data[key] += count
+
+    data = pd.DataFrame(data.items(), columns=["CC legal tool product", "Count"])
+    file_path = shared.path_join(PATHS["data_phase"], "ia_cc_product_totals.csv")
+    data_to_csv(args, data, file_path)
+
+
+def process_latest_prior_retired_totals(args, count_data):
+    """
+    Process count data: totals by unit in three categories: latest, prior,
+    and retired for CC licenses
+    """
+    LOGGER.info(process_latest_prior_retired_totals.__doc__.strip())
+
+    # Output: one CSV per status (latest, prior, retired) plus a "combined"
+    # CSV holding the three status totals; nothing is returned.
+
+    # Filter for Creative Commons licenses only (names containing "CC").
+    # NOTE(review): a name such as "PDM 1.0" has no "CC" in it and would be
+    # filtered out here before reaching the PDM check -- confirm intended.
+    is_cc = count_data["NORMALIZED_LICENSE"].str.contains("CC", na=False)
+    cc_data = count_data[is_cc]
+
+    # https://creativecommons.org/retiredlicenses/
+    # Entries are compared with tool_begin (the name prefix before the version)
+    retired = [
+        "CC DEVNATIONS ",
+        "CC ND ",
+        "CC ND-NC ",
+        "CC NC ",
+        "CC NC-SAMPLING+",
+        "CC NC-SA ",
+        "CC PUBLICDOMAIN",
+        "CC SAMPLING ",
+        "CC SAMPLING+ ",
+        "CC SA ",
+    ]
+    versions = ["1.0", "2.0", "2.1", "2.5", "3.0", "4.0"]
+
+    data = {"latest": {}, "prior": {}, "retired": {}}
+    status = {"Latest": 0, "Prior": 0, "Retired": 0}
+
+    # Columns are read by name: read_csv(usecols=...) keeps the file's column
+    # order, so positional access (row[1], row[2]) could pick the wrong field
+    names_and_counts = zip(cc_data["NORMALIZED_LICENSE"], cc_data["COUNT"])
+    for license_name, count in names_and_counts:
+        # tool_begin is the text before the version number, trailing space
+        # included; if several versions occur, the last listed one wins
+        tool_begin = ""
+        for version in versions:
+            if version in license_name:
+                tool_begin = license_name[: license_name.index(version)]
+        # No version found, or nothing precedes it: use the whole name
+        tool_begin = tool_begin or license_name
+
+        # Latest: names with both BY and 4.0, or starting with CC0 or PDM
+        if (
+            ("BY" in license_name and "4.0" in license_name)
+            or license_name.startswith(("CC0", "PDM"))
+        ):
+            group, label, tool = "latest", "Latest", license_name
+        # Prior: any other name containing BY (none of the retired prefixes
+        # contain BY, so the membership test is effectively a safeguard)
+        elif "BY" in license_name and tool_begin not in retired:
+            # Normalize the unit order (ND-NC => NC-ND)
+            tool = tool_begin.replace("ND-NC", "NC-ND").strip()
+            group, label = "prior", "Prior"
+        # Retired: everything else
+        else:
+            group, label, tool = "retired", "Retired", tool_begin.strip()
+
+        # dict.get() replaces the try/except KeyError accumulation
+        data[group][tool] = data[group].get(tool, 0) + count
+        status[label] += count
+
+    data["combined"] = status
+
+    # One CSV per bucket; the bucket key becomes part of the file name
+    for key, value_data in data.items():
+        dataframe = pd.DataFrame(
+            value_data.items(), columns=["CC legal tool", "Count"]
+        )
+        file_path = shared.path_join(
+            PATHS["data_phase"], f"ia_cc_status_{key}_totals.csv"
+        )
+        data_to_csv(args, dataframe, file_path)
+
+
+def process_totals_by_free_cultural(args, count_data):
+    """
+    Processing count data: totals by Approved for Free Cultural Works
+    """
+    LOGGER.info(process_totals_by_free_cultural.__doc__.strip())
+
+    # Filter for Creative Commons licenses only (names containing "CC").
+    # NOTE(review): a name such as "PDM 1.0" has no "CC" in it and would be
+    # filtered out here before reaching the PDM check -- confirm intended.
+    is_cc = count_data["NORMALIZED_LICENSE"].str.contains("CC", na=False)
+    cc_data = count_data[is_cc]
+
+    approved, limited = "Approved for Free Cultural Works", "Limited use"
+    data = {approved: 0, limited: 0}
+    # Units (second word of the name, lower-cased) counted as approved
+    free_units = {"by-sa", "by", "sa", "sampling+"}
+
+    # Columns are read by name: read_csv(usecols=...) keeps the file's column
+    # order, so positional access (row[1], row[2]) could pick the wrong field
+    names_and_counts = zip(cc_data["NORMALIZED_LICENSE"], cc_data["COUNT"])
+    for license_name, count in names_and_counts:
+        parts = license_name.split()
+        # Names without a second word have no unit and count as limited
+        unit = parts[1].lower() if len(parts) > 1 else None
+        # Public domain tools and free units are approved; the rest is limited
+        is_free = (
+            license_name.startswith("PDM")
+            or "CC0" in license_name
+            or "PUBLICDOMAIN" in license_name
+            or unit in free_units
+        )
+        data[approved if is_free else limited] += count
+
+    data = pd.DataFrame(data.items(), columns=["Category", "Count"])
+    data.sort_values("Count", ascending=False, inplace=True)
+    data.reset_index(drop=True, inplace=True)
+    file_path = shared.path_join(
+        PATHS["data_phase"], "ia_cc_totals_by_free_cultural.csv"
+    )
+    data_to_csv(args, data, file_path)
+
+
+def process_totals_by_restrictions(args, count_data):
+    """
+    Processing count data: totals by restriction level
+    """
+    LOGGER.info(process_totals_by_restrictions.__doc__.strip())
+
+    # Filter for Creative Commons licenses only (names containing "CC").
+    # NOTE(review): a name such as "PDM 1.0" has no "CC" in it and would be
+    # filtered out here before reaching the PDM check -- confirm intended.
+    is_cc = count_data["NORMALIZED_LICENSE"].str.contains("CC", na=False)
+    cc_data = count_data[is_cc]
+
+    levels = (
+        "level 0 - unrestricted",
+        "level 1 - few restrictions",
+        "level 2 - some restrictions",
+        "level 3 - many restrictions",
+    )
+    data = dict.fromkeys(levels, 0)
+    # Unit (second word of the name, lower-cased) => index into levels;
+    # units not listed here, and names without a unit, fall into level 3
+    level_by_unit = {
+        **dict.fromkeys(["by-sa", "by", "sa", "sampling+"], 1),
+        **dict.fromkeys(["by-nc", "by-nc-sa", "sampling", "nc", "nc-sa"], 2),
+    }
+    # Columns are read by name: read_csv(usecols=...) keeps the file's column
+    # order, so positional access (row[1], row[2]) could pick the wrong field
+    names_and_counts = zip(cc_data["NORMALIZED_LICENSE"], cc_data["COUNT"])
+    for license_name, count in names_and_counts:
+        parts = license_name.split()
+        unit = parts[1].lower() if len(parts) > 1 else None
+        public_domain = license_name.startswith("PDM") or any(
+            tag in license_name for tag in ("CC0", "PUBLICDOMAIN")
+        )
+        level = 0 if public_domain else level_by_unit.get(unit, 3)
+        data[levels[level]] += count
+
+    data = pd.DataFrame(data.items(), columns=["Category", "Count"])
+    file_path = shared.path_join(
+        PATHS["data_phase"], "ia_cc_totals_by_restrictions.csv"
+    )
+    data_to_csv(args, data, file_path)
+
+
+def process_totals_by_language(args, language_data):
+    """
+    Processing language data: totals by language
+    """
+    LOGGER.info(process_totals_by_language.__doc__.strip())
+
+    # Only rows whose normalized license name contains "CC" are counted
+    names = language_data["NORMALIZED_LICENSE"]
+    cc_language_data = language_data[names.str.contains("CC", na=False)]
+
+    # Sum per language, largest first, then switch to report-style headers
+    data = (
+        cc_language_data.groupby(["LANGUAGE"], as_index=False)["COUNT"]
+        .sum()
+        .sort_values("COUNT", ascending=False)
+        .reset_index(drop=True)
+        .rename(columns={"LANGUAGE": "Language", "COUNT": "Count"})
+    )
+
+    file_path = shared.path_join(
+        PATHS["data_phase"], "ia_cc_totals_by_language.csv"
+    )
+    data_to_csv(args, data, file_path)
+
+
+def process_totals_by_country(args, country_data):
+    """
+    Processing country data: totals by country
+    """
+    LOGGER.info(process_totals_by_country.__doc__.strip())
+
+    # Only rows whose normalized license name contains "CC" are counted
+    names = country_data["NORMALIZED_LICENSE"]
+    cc_country_data = country_data[names.str.contains("CC", na=False)]
+
+    # Sum per country, largest first, then switch to report-style headers
+    data = (
+        cc_country_data.groupby(["COUNTRY"], as_index=False)["COUNT"]
+        .sum()
+        .sort_values("COUNT", ascending=False)
+        .reset_index(drop=True)
+        .rename(columns={"COUNTRY": "Country", "COUNT": "Count"})
+    )
+
+    file_path = shared.path_join(
+        PATHS["data_phase"], "ia_cc_totals_by_country.csv"
+    )
+    data_to_csv(args, data, file_path)
+
+
+def process_media_type_totals(args, count_data):
+    """
+    Processing count data: totals by media type
+    """
+    LOGGER.info(process_media_type_totals.__doc__.strip())
+
+    # Placeholder: real totals would need media type data from the fetch
+    # phase, so every media type is written with a zero count for now
+    media_types = ["texts", "audio", "video", "image", "software", "other"]
+    data = pd.DataFrame(
+        {"Media Type": media_types, "Count": [0] * len(media_types)}
+    )
+
+    file_path = shared.path_join(
+        PATHS["data_phase"], "ia_media_type_totals.csv"
+    )
+    data_to_csv(args, data, file_path)
+
+
+def process_open_source_totals(args, count_data):
+    """
+    Processing count data: totals by open source licenses (non-CC)
+    """
+    LOGGER.info(process_open_source_totals.__doc__.strip())
+
+    # Everything whose normalized name does not contain "CC" (rows with a
+    # missing name pass this filter, since na=False is negated)
+    has_cc = count_data["NORMALIZED_LICENSE"].str.contains("CC", na=False)
+    open_source_data = count_data[~has_cc]
+
+    by_license = open_source_data.groupby(
+        ["NORMALIZED_LICENSE"], as_index=False
+    )["COUNT"].sum()
+    ranked = by_license.sort_values("COUNT", ascending=False)
+    data = ranked.reset_index(drop=True).rename(
+        columns={"NORMALIZED_LICENSE": "License", "COUNT": "Count"}
+    )
+
+    file_path = shared.path_join(
+        PATHS["data_phase"], "ia_open_source_totals.csv"
+    )
+    data_to_csv(args, data, file_path)
+
+
+def main():
+    """
+    Main function to process IA data.
+
+    Reads the fetch-phase CSV files, writes the process-phase totals, then
+    runs the git add/commit/push helpers. Returns early (after logging an
+    error) when the required count file is missing.
+    """
+    args = parse_arguments()
+    shared.paths_log(LOGGER, PATHS)
+    shared.git_fetch_and_merge(args, PATHS["repo"])
+
+    # Count data (required)
+    file1_count = shared.path_join(PATHS["data_1-fetch"], "ia_1_count.csv")
+    if not os.path.exists(file1_count):
+        LOGGER.error(f"Count data file not found: {file1_count}")
+        return
+    base_columns = ["LICENSE_URL", "NORMALIZED_LICENSE"]
+    count_data = pd.read_csv(file1_count, usecols=base_columns + ["COUNT"])
+
+    # Process various aggregations
+    process_license_totals(args, count_data)
+    process_cc_license_totals(args, count_data)
+    process_latest_prior_retired_totals(args, count_data)
+    process_totals_by_free_cultural(args, count_data)
+    process_totals_by_restrictions(args, count_data)
+    process_open_source_totals(args, count_data)
+    process_media_type_totals(args, count_data)
+
+    # Language and country data (optional: processed only when present)
+    optional_inputs = (
+        ("ia_2_count_by_language.csv", "LANGUAGE", process_totals_by_language),
+        ("ia_3_count_by_country.csv", "COUNTRY", process_totals_by_country),
+    )
+    for file_name, column, process in optional_inputs:
+        file_path = shared.path_join(PATHS["data_1-fetch"], file_name)
+        if os.path.exists(file_path):
+            usecols = base_columns + [column, "COUNT"]
+            process(args, pd.read_csv(file_path, usecols=usecols))
+
+    # Git operations. The commit message names args.quarter: parse_arguments()
+    # re-targets PATHS for a non-default --quarter, but QUARTER keeps its
+    # import-time value and would mislabel the commit
+    commit_message = (
+        f"Add and commit processed Internet Archive (IA) data for {args.quarter}"
+    )
+    args = shared.git_add_and_commit(
+        args,
+        PATHS["repo"],
+        PATHS["data_quarter"],
+        commit_message,
+    )
+    shared.git_push_changes(args, PATHS["repo"])
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except shared.QuantifyingException as e:
+        # Exit code 0 is logged at info level, any other code as an error
+        log = LOGGER.info if e.exit_code == 0 else LOGGER.error
+        log(e.message)
+        sys.exit(e.exit_code)
+    except SystemExit as e:
+        # Pass the original status through, logging codes other than 0
+        if e.code != 0:
+            LOGGER.error(f"System exit with code: {e.code}")
+        sys.exit(e.code)
+    except KeyboardInterrupt:
+        # Interrupted by the user: log at info level, exit with status 130
+        LOGGER.info("(130) Halted via KeyboardInterrupt.")
+        sys.exit(130)
+    except Exception:
+        # Anything else: log the colorized, indented traceback and exit 1
+        colorized = highlight(
+            traceback.format_exc(),
+            PythonTracebackLexer(),
+            TerminalFormatter(),
+        )
+        traceback_formatted = textwrap.indent(colorized, "    ")
+        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
+        sys.exit(1)
+
diff --git a/scripts/3-report/internetarchive_report.py b/scripts/3-report/internetarchive_report.py
new file mode 100644
index 00000000..f0d2b611
--- /dev/null
+++ b/scripts/3-report/internetarchive_report.py
@@ -0,0 +1,647 @@
+#!/usr/bin/env python
+"""
+This file is dedicated to visualizing and analyzing the data collected
+from Internet Archive (IA).
+"""
+# Standard library
+import argparse
+import os
+import sys
+import textwrap
+import traceback
+
+# Third-party
+import pandas as pd
+from pygments import highlight
+from pygments.formatters import TerminalFormatter
+from pygments.lexers import PythonTracebackLexer
+
+# Add parent directory so shared can be imported
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
+
+# First-party/Local
+import plot  # noqa: E402
+import shared  # noqa: E402
+
+# Setup
+LOGGER, PATHS = shared.setup(__file__)
+
+# Constants
+QUARTER = os.path.basename(PATHS["data_quarter"])
+SECTION = "Internet Archive (IA)"
+
+
+def parse_arguments():
+    """
+    Define the command-line interface, parse it, and return the namespace
+    with the module logger and paths attached.
+    """
+    global PATHS
+    LOGGER.info("Parsing command-line arguments")
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--quarter",
+        default=QUARTER,
+        help=f"Data quarter in format YYYYQx (default: {QUARTER})",
+    )
+
+    # The remaining options are plain on/off switches; they are registered
+    # in the order listed here
+    switches = (
+        ("--show-plots", "Show generated plots (default: False)"),
+        ("--enable-save", "Enable saving results (default: False)"),
+        (
+            "--enable-git",
+            "Enable git actions such as fetch, merge, add, commit, and push"
+            " (default: False)",
+        ),
+    )
+    for flag, help_text in switches:
+        parser.add_argument(flag, action="store_true", help=help_text)
+
+    args = parser.parse_args()
+    # --enable-git is only accepted together with --enable-save
+    if args.enable_git and not args.enable_save:
+        parser.error("--enable-git requires --enable-save")
+    if args.quarter != QUARTER:
+        PATHS = shared.paths_update(LOGGER, PATHS, QUARTER, args.quarter)
+    args.logger, args.paths = LOGGER, PATHS
+    return args
+
+
+def ia_intro(args):
+    """
+    Write Internet Archive (IA) introduction.
+    """
+    LOGGER.info(ia_intro.__doc__.strip())
+
+    # The headline total is the summed COUNT column of the process-phase
+    # license totals; if that file is missing the text shows "N/A"
+    file_path = shared.path_join(
+        PATHS["data_2-process"],
+        "ia_license_totals.csv",
+    )
+
+    total_count = "N/A"
+    if os.path.exists(file_path):
+        LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
+        totals = pd.read_csv(file_path)
+        total_count = f"{totals['COUNT'].sum():,d}"
+
+    # Entry text, assembled here so the call below stays short
+    overview = (
+        "Internet Archive (IA) data uses the Advanced Search API to query for"
+        " items with Creative Commons and open source licenses. The data includes"
+        " license information, language, country, and media type metadata.\n"
+        "\n"
+        f"**The results indicate there are a total of {total_count} items in the"
+        " Internet Archive that are licensed or in the public domain using"
+        " Creative Commons or open source legal tools.**\n"
+        "\n"
+        "Thank you Internet Archive for providing access to this valuable"
+        " cultural heritage data!\n"
+    )
+
+    shared.update_readme(
+        args, SECTION, "Overview", None, None, overview
+    )
+
+
+def plot_cc_products(args):
+    """
+    Create plots for CC legal tool product totals and percentages
+    """
+    LOGGER.info(plot_cc_products.__doc__.strip())
+    file_path = shared.path_join(
+        PATHS["data_2-process"], "ia_cc_product_totals.csv"
+    )
+    if not os.path.exists(file_path):
+        LOGGER.warning(f"Data file not found: {file_path}")
+        return
+
+    LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
+    name_label = "CC legal tool product"
+    data = pd.read_csv(file_path, index_col=name_label)
+    # Nothing to plot (no rows or only zero counts): skip plot and README
+    if data.empty or data["Count"].sum() == 0:
+        LOGGER.warning(f"No data found in {file_path}, skipping plot")
+        return
+    data = data[::-1]  # reverse order
+
+    title = "CC Products totals and percentages"
+    plt = plot.combined_plot(
+        args=args,
+        data=data,
+        title=title,
+        name_label=name_label,
+        data_label="Count",
+        bar_xscale="log",
+        bar_ylabel=name_label,
+    )
+    image_path = shared.path_join(
+        PATHS["data_phase"], "ia_cc_product_totals.png"
+    )
+    LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}")
+    if args.enable_save:
+        os.makedirs(PATHS["data_phase"], exist_ok=True)
+        plt.savefig(image_path)
+
+    shared.update_readme(
+        args,
+        SECTION,
+        title,
+        image_path,
+        "Plots showing Creative Commons (CC) legal tool product totals and"
+        " percentages from Internet Archive.",
+    )
+
+
+def plot_cc_tool_status(args):
+    """
+    Create plots for the CC legal tool status totals and percentages
+    """
+    LOGGER.info(plot_cc_tool_status.__doc__.strip())
+    name_label = "CC legal tool"
+    title = "CC legal tools status"
+    caption = (
+        "Plots showing Creative Commons (CC) legal tool status totals and"
+        " percentages from Internet Archive."
+    )
+
+    # Input: combined status totals written by the process phase
+    file_path = shared.path_join(
+        PATHS["data_2-process"],
+        "ia_cc_status_combined_totals.csv",
+    )
+    if not os.path.exists(file_path):
+        LOGGER.warning(f"Data file not found: {file_path}")
+        return
+    LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
+    data = pd.read_csv(file_path, index_col=name_label)
+
+    # An empty table or an all-zero Count column leaves nothing to plot
+    nothing_to_plot = data.empty or data["Count"].sum() == 0
+    if nothing_to_plot:
+        LOGGER.warning(f"No data found in {file_path}, skipping plot")
+        return
+
+    data = data.sort_values(name_label, ascending=False)
+    plt = plot.combined_plot(
+        args=args,
+        data=data,
+        title=title,
+        name_label=name_label,
+        data_label="Count",
+        bar_xscale="log",
+        bar_ylabel="CC legal tool status",
+    )
+
+    # Output: the image is only written when saving is enabled, but
+    # shared.update_readme is called either way
+    image_path = shared.path_join(PATHS["data_phase"], "ia_cc_tool_status.png")
+    LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}")
+    if args.enable_save:
+        # Create the directory if it does not exist
+        os.makedirs(PATHS["data_phase"], exist_ok=True)
+        plt.savefig(image_path)
+
+    shared.update_readme(
+        args, SECTION, title, image_path, caption
+    )
+
+
+def plot_latest_tools(args):
+    """
+    Create plots for latest CC legal tool totals and percentages
+    """
+    LOGGER.info(plot_latest_tools.__doc__.strip())
+    name_label = "CC legal tool"
+    title = "Latest CC legal tools"
+    caption = (
+        "Plots showing latest Creative Commons (CC) legal tool totals and"
+        " percentages from Internet Archive."
+    )
+
+    # Input: "latest" status totals written by the process phase
+    file_path = shared.path_join(
+        PATHS["data_2-process"],
+        "ia_cc_status_latest_totals.csv",
+    )
+    if not os.path.exists(file_path):
+        LOGGER.warning(f"Data file not found: {file_path}")
+        return
+    LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
+    data = pd.read_csv(file_path, index_col=name_label)
+
+    # An empty table or an all-zero Count column leaves nothing to plot
+    nothing_to_plot = data.empty or data["Count"].sum() == 0
+    if nothing_to_plot:
+        LOGGER.warning(f"No data found in {file_path}, skipping plot")
+        return
+
+    data = data.sort_values(name_label, ascending=False)
+    plt = plot.combined_plot(
+        args=args,
+        data=data,
+        title=title,
+        name_label=name_label,
+        data_label="Count",
+    )
+
+    # Output: the image is only written when saving is enabled, but
+    # shared.update_readme is called either way
+    image_path = shared.path_join(
+        PATHS["data_phase"], "ia_cc_status_latest_tools.png"
+    )
+    LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}")
+    if args.enable_save:
+        # Create the directory if it does not exist
+        os.makedirs(PATHS["data_phase"], exist_ok=True)
+        plt.savefig(image_path)
+
+    shared.update_readme(
+        args, SECTION, title, image_path, caption
+    )
+
+
+def plot_prior_tools(args):
+    """
+    Create plots for prior CC legal tool totals and percentages
+    """
+    LOGGER.info(plot_prior_tools.__doc__.strip())
+    name_label = "CC legal tool"
+    title = "Prior CC legal tools"
+    caption = (
+        "Plots showing prior Creative Commons (CC) legal tool totals and"
+        " percentages from Internet Archive."
+    )
+    note = (
+        "The unit names have been normalized (~~`CC BY-ND-NC`~~ =>"
+        " `CC BY-NC-ND`)."
+    )
+
+    # Input: "prior" status totals written by the process phase
+    file_path = shared.path_join(
+        PATHS["data_2-process"], "ia_cc_status_prior_totals.csv"
+    )
+    if not os.path.exists(file_path):
+        LOGGER.warning(f"Data file not found: {file_path}")
+        return
+    LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
+    data = pd.read_csv(file_path, index_col=name_label)
+
+    # An empty table or an all-zero Count column leaves nothing to plot
+    nothing_to_plot = data.empty or data["Count"].sum() == 0
+    if nothing_to_plot:
+        LOGGER.warning(f"No data found in {file_path}, skipping plot")
+        return
+
+    data = data.sort_values(name_label, ascending=False)
+    plt = plot.combined_plot(
+        args=args,
+        data=data,
+        title=title,
+        name_label=name_label,
+        data_label="Count",
+    )
+
+    # Output: the image is only written when saving is enabled
+    image_path = shared.path_join(
+        PATHS["data_phase"], "ia_cc_status_prior_tools.png"
+    )
+    LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}")
+    if args.enable_save:
+        os.makedirs(PATHS["data_phase"], exist_ok=True)
+        plt.savefig(image_path)
+
+    shared.update_readme(
+        args, SECTION, title, image_path, caption, note
+    )
+
+
+def plot_retired_tools(args):
+    """
+    Create plots for retired CC legal tool totals and percentages
+    """
+    LOGGER.info(plot_retired_tools.__doc__.strip())
+    name_label = "CC legal tool"
+    title = "Retired CC legal tools"
+    caption = (
+        "Plots showing retired Creative Commons (CC) legal tools total and"
+        " percentages from Internet Archive."
+    )
+    note = (
+        "For more information on retired legal tools, see [Retired Legal Tools"
+        " - Creative Commons](https://creativecommons.org/retiredlicenses/)."
+    )
+
+    # Input: "retired" status totals written by the process phase
+    file_path = shared.path_join(
+        PATHS["data_2-process"],
+        "ia_cc_status_retired_totals.csv",
+    )
+    if not os.path.exists(file_path):
+        LOGGER.warning(f"Data file not found: {file_path}")
+        return
+    LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
+    data = pd.read_csv(file_path, index_col=name_label)
+
+    # An empty table or an all-zero Count column leaves nothing to plot
+    nothing_to_plot = data.empty or data["Count"].sum() == 0
+    if nothing_to_plot:
+        LOGGER.warning(f"No data found in {file_path}, skipping plot")
+        return
+
+    data = data.sort_values(name_label, ascending=False)
+    plt = plot.combined_plot(
+        args=args,
+        data=data,
+        title=title,
+        name_label=name_label,
+        data_label="Count",
+        bar_xscale="log",
+    )
+
+    # Output: the image is only written when saving is enabled
+    image_path = shared.path_join(
+        PATHS["data_phase"], "ia_cc_status_retired_tools.png"
+    )
+    LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}")
+    if args.enable_save:
+        os.makedirs(PATHS["data_phase"], exist_ok=True)
+        plt.savefig(image_path)
+
+    shared.update_readme(
+        args, SECTION, title, image_path, caption, note
+    )
+
+
+def plot_countries_highest_usage(args):
+    """
+    Create plots for the countries with highest usage of CC tools
+    """
+    LOGGER.info(plot_countries_highest_usage.__doc__.strip())
+    file_path = shared.path_join(
+        PATHS["data_2-process"], "ia_cc_totals_by_country.csv"
+    )
+    if not os.path.exists(file_path):
+        LOGGER.warning(f"Data file not found: {file_path}")
+        return
+    LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
+    name_label = "Country"
+    data_label = "Count"
+    data = pd.read_csv(file_path, index_col=name_label)
+    # Nothing to plot (no rows or only zero counts): skip plot and README
+    if data.empty or data[data_label].sum() == 0:
+        LOGGER.warning(f"No data found in {file_path}, skipping plot")
+        return
+    # Total over the complete data set, taken before trimming to the top 10
+    total_count = f"{data['Count'].sum():,d}"
+    data.sort_values(data_label, ascending=False, inplace=True)
+    data = data[:10]  # limit to highest 10
+    data = data[::-1]  # reverse order
+
+    title = "Countries with highest usage of CC tools"
+    plt = plot.combined_plot(
+        args=args,
+        data=data,
+        title=title,
+        name_label=name_label,
+        data_label=data_label,
+        bar_xscale="log",
+    )
+    image_path = shared.path_join(
+        PATHS["data_phase"], "ia_cc_countries_highest_usage.png"
+    )
+    LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}")
+    if args.enable_save:
+        os.makedirs(PATHS["data_phase"], exist_ok=True)
+        plt.savefig(image_path)
+
+    shared.update_readme(
+        args,
+        SECTION,
+        title,
+        image_path,
+        "Plots showing countries with the highest usage of Creative Commons"
+        " (CC) legal tools from Internet Archive.",
+        f"The complete data set indicates there are a total of {total_count}"
+        " items using CC legal tools in the Internet Archive.",
+    )
+
+
+def plot_languages_highest_usage(args):
+    """
+    Create plots for the languages with highest usage of CC tools
+    """
+    LOGGER.info(plot_languages_highest_usage.__doc__.strip())
+    file_path = shared.path_join(
+        PATHS["data_2-process"], "ia_cc_totals_by_language.csv"
+    )
+    if not os.path.exists(file_path):
+        LOGGER.warning(f"Data file not found: {file_path}")
+        return
+    LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
+    name_label = "Language"
+    data_label = "Count"
+    data = pd.read_csv(file_path, index_col=name_label)
+    # Nothing to plot (no rows or only zero counts): skip plot and README
+    if data.empty or data[data_label].sum() == 0:
+        LOGGER.warning(f"No data found in {file_path}, skipping plot")
+        return
+    # Total over the complete data set, taken before trimming to the top 10
+    total_count = f"{data['Count'].sum():,d}"
+    data.sort_values(data_label, ascending=False, inplace=True)
+    data = data[:10]  # limit to highest 10
+    data = data[::-1]  # reverse order
+
+    title = "Languages with highest usage of CC tools"
+    plt = plot.combined_plot(
+        args=args,
+        data=data,
+        title=title,
+        name_label=name_label,
+        data_label=data_label,
+        bar_xscale="log",
+    )
+    image_path = shared.path_join(
+        PATHS["data_phase"], "ia_cc_languages_highest_usage.png"
+    )
+    LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}")
+    if args.enable_save:
+        os.makedirs(PATHS["data_phase"], exist_ok=True)
+        plt.savefig(image_path)
+
+    shared.update_readme(
+        args,
+        SECTION,
+        title,
+        image_path,
+        "Plots showing languages with the highest usage of Creative Commons"
+        " (CC) legal tools from Internet Archive.",
+        f"The complete data set indicates there are a total of {total_count}"
+        " items using CC legal tools in the Internet Archive.",
+    )
+
+
+def plot_free_culture(args):
+    """
+    Create plots for Approved for Free Cultural Works
+    """
+    LOGGER.info(plot_free_culture.__doc__.strip())
+    file_path = shared.path_join(
+        PATHS["data_2-process"], "ia_cc_totals_by_free_cultural.csv"
+    )
+    if not os.path.exists(file_path):
+        LOGGER.warning(f"Data file not found: {file_path}")
+        return
+
+    LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
+    name_label = "Category"
+    data_label = "Count"
+    data = pd.read_csv(file_path, index_col=name_label)
+    # Nothing to plot (no rows or only zero counts): skip plot and README
+    if data.empty or data[data_label].sum() == 0:
+        LOGGER.warning(f"No data found in {file_path}, skipping plot")
+        return
+
+    title = "Approved for Free Cultural Works"
+    plt = plot.combined_plot(
+        args=args,
+        data=data,
+        title=title,
+        name_label=name_label,
+        data_label=data_label,
+    )
+    image_path = shared.path_join(PATHS["data_phase"], "ia_cc_free_culture.png")
+    LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}")
+    if args.enable_save:
+        # Create the directory if it does not exist
+        os.makedirs(PATHS["data_phase"], exist_ok=True)
+        plt.savefig(image_path)
+
+    shared.update_readme(
+        args,
+        SECTION,
+        title,
+        image_path,
+        "Plots showing Approved for Free Cultural Works legal tool usage"
+        " from Internet Archive.",
+        "[Understanding Free Cultural Works - Creative"
+        " Commons](https://creativecommons.org/public-domain/freeworks/):\n"
+        "\n"
+        '> Using [the Freedom Defined definition of a "Free Cultural Work"],'
+        " material licensed under CC BY or BY-SA is a free cultural work. (So"
+        " is anything in the worldwide public domain marked with CC0 or the"
+        " Public Domain Mark.) CC's other licenses– BY-NC, BY-ND, BY-NC-SA,"
+        " and BY-NC-ND–only allow more limited uses, and material under these"
+        " licenses is not considered a free cultural work.",
+    )
+
+
+def plot_open_source_licenses(args):
+    """
+    Create plots for open source licenses (non-CC)
+    """
+    # NOTE: the docstring above doubles as the log message below
+    LOGGER.info(plot_open_source_licenses.__doc__.strip())
+    title = "Open Source Licenses (Non-CC)"
+    name_label, data_label = "License", "Count"
+    csv_path = shared.path_join(
+        PATHS["data_2-process"], "ia_open_source_totals.csv"
+    )
+
+    # Guard clause: skip the plot when the input CSV is absent
+    if not os.path.exists(csv_path):
+        LOGGER.warning(f"Data file not found: {csv_path}")
+        return
+
+    LOGGER.info(f"data file: {csv_path.replace(PATHS['repo'], '.')}")
+    totals = pd.read_csv(csv_path, index_col=name_label)
+    # Keep the ten largest counts, then reverse them so the largest is last
+    top_ten = totals.sort_values(data_label, ascending=False).head(10)
+    top_ten = top_ten.iloc[::-1]
+
+    figure = plot.combined_plot(
+        args=args,
+        data=top_ten,
+        title=title,
+        name_label=name_label,
+        data_label=data_label,
+        bar_xscale="log",
+    )
+
+    report_dir = PATHS["data_phase"]
+    image_path = shared.path_join(
+        report_dir, "ia_open_source_licenses.png"
+    )
+    LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}")
+
+    if args.enable_save:
+        # Ensure the output directory exists before writing the image
+        os.makedirs(report_dir, exist_ok=True)
+        figure.savefig(image_path)
+
+    description = (
+        "Plots showing open source license usage from Internet Archive"
+        " (excluding Creative Commons licenses)."
+    )
+
+    shared.update_readme(args, SECTION, title, image_path, description)
+
+
+def main():
+    """
+    Generate every Internet Archive (IA) report, then commit and push.
+    """
+    args = parse_arguments()
+    shared.paths_log(LOGGER, PATHS)
+    shared.git_fetch_and_merge(args, PATHS["repo"])
+
+    for build_section in (
+        ia_intro,
+        plot_cc_products,
+        plot_cc_tool_status,
+        plot_latest_tools,
+        plot_prior_tools,
+        plot_retired_tools,
+        plot_countries_highest_usage,
+        plot_languages_highest_usage,
+        plot_free_culture,
+        plot_open_source_licenses,
+    ):
+        build_section(args)
+    message = f"Add and commit Internet Archive (IA) reports for {QUARTER}"
+    args = shared.git_add_and_commit(
+        args, PATHS["repo"], PATHS["data_quarter"], message
+    )
+    shared.git_push_changes(args, PATHS["repo"])
+
+
+if __name__ == "__main__":
+    # Translate every way main() can end into a log entry and an exit code
+    try:
+        main()
+    except shared.QuantifyingException as err:
+        # Exit code 0 is logged at info level; anything else is an error
+        log_message = LOGGER.info if err.exit_code == 0 else LOGGER.error
+        log_message(err.message)
+        sys.exit(err.exit_code)
+    except SystemExit as err:
+        # Exit again with the original code, logging only non-zero ones
+        if err.code != 0:
+            LOGGER.error(f"System exit with code: {err.code}")
+        sys.exit(err.code)
+    except KeyboardInterrupt:
+        LOGGER.info("(130) Halted via KeyboardInterrupt.")
+        sys.exit(130)
+    except Exception:
+        # Highlight the formatted traceback, then indent it under the heading
+        highlighted = highlight(
+            traceback.format_exc(),
+            PythonTracebackLexer(),
+            TerminalFormatter(),
+        )
+        traceback_formatted = textwrap.indent(highlighted, "    ")
+        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
+        sys.exit(1)
diff --git a/test_ia_pipeline.py b/test_ia_pipeline.py
new file mode 100644
index 00000000..f9b17d68
--- /dev/null
+++ b/test_ia_pipeline.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python
+"""
+Test script for the Internet Archive (IA) pipeline.
+This script tests the complete IA data collection, processing, and reporting pipeline.
+"""
+import os
+import sys
+import subprocess
+import tempfile
+import shutil
+
+def run_command(cmd, cwd=None):
+    """Echo cmd, run it through the shell, and return the CompletedProcess."""
+    print(f"Running: {cmd}")
+    options = {"shell": True, "cwd": cwd, "capture_output": True, "text": True}
+    proc = subprocess.run(cmd, **options)
+    if proc.returncode:
+        details = (f"STDOUT: {proc.stdout}", f"STDERR: {proc.stderr}")
+        print(f"Error running command: {cmd}", *details, sep="\n")
+    return proc
+
+def _check_files(project_root, file_paths, missing_label, describe=None):
+    """Print [OK]/[FAIL] for each expected file; return True if all exist."""
+    all_present = True
+    for file_path in file_paths:
+        full_path = os.path.join(project_root, file_path)
+        if os.path.exists(full_path):
+            print(f"[OK] {file_path}")
+            if describe is not None:
+                describe(full_path)  # optional extra detail for this file
+        else:
+            print(f"[FAIL] {file_path} - {missing_label}")
+            all_present = False
+    return all_present
+
+def _print_row_count(full_path):
+    """Print how many lines follow the header; binary read avoids decoding."""
+    with open(full_path, "rb") as f:
+        line_count = sum(1 for _ in f)
+    if line_count > 1:  # More than just header
+        print(f"  - Contains {line_count - 1} data rows")
+    else:
+        print("  - Warning: Only header row found")
+
+def _run_stage(name, cmd, project_root):
+    """Run one pipeline script; print and return whether it exited with 0."""
+    result = run_command(cmd, cwd=project_root)
+    if result.returncode != 0:
+        print(f"[FAIL] {name} script failed")
+        return False
+    print(f"[OK] {name} script completed successfully")
+    return True
+
+def test_ia_pipeline():
+    """
+    Test the complete IA pipeline.
+
+    Returns False at once if a required file is missing or a script fails;
+    otherwise returns True only when every expected output file was found.
+    """
+    print("Testing Internet Archive (IA) Pipeline")
+    print("=" * 50)
+
+    # Get the current directory (should be the project root)
+    project_root = os.getcwd()
+    print(f"Project root: {project_root}")
+
+    # NOTE(review): hard-coded quarter; must match the quarter the scripts use
+    quarter_dir = "data/2025Q4"
+    passed = True
+
+    # Test 1: Check if all required files exist
+    print("\n1. Checking required files...")
+    required_files = [
+        "scripts/1-fetch/internetarchive_fetch.py",
+        "scripts/2-process/internetarchive_process.py",
+        "scripts/3-report/internetarchive_report.py",
+        "data/ia_license_mapping.csv",
+    ]
+    if not _check_files(project_root, required_files, "MISSING"):
+        return False
+
+    # Test 2: Test fetch script with development mode
+    print("\n2. Testing fetch script (development mode)...")
+    fetch_cmd = "pipenv run python scripts/1-fetch/internetarchive_fetch.py --dev --enable-save --limit 10 --max-items 50"
+    if not _run_stage("Fetch", fetch_cmd, project_root):
+        return False
+
+    # Test 3: Check if fetch data files were created
+    print("\n3. Checking fetch data files...")
+    data_files = [
+        f"{quarter_dir}/1-fetch/ia_1_count.csv",
+        f"{quarter_dir}/1-fetch/ia_2_count_by_language.csv",
+        f"{quarter_dir}/1-fetch/ia_3_count_by_country.csv",
+    ]
+    passed &= _check_files(
+        project_root, data_files, "NOT CREATED", describe=_print_row_count
+    )
+
+    # Test 4: Test process script
+    print("\n4. Testing process script...")
+    process_cmd = "pipenv run python scripts/2-process/internetarchive_process.py --enable-save"
+    if not _run_stage("Process", process_cmd, project_root):
+        return False
+
+    # Test 5: Check if processed data files were created
+    print("\n5. Checking processed data files...")
+    processed_files = [
+        f"{quarter_dir}/2-process/ia_license_totals.csv",
+        f"{quarter_dir}/2-process/ia_cc_product_totals.csv",
+        f"{quarter_dir}/2-process/ia_cc_status_combined_totals.csv",
+        f"{quarter_dir}/2-process/ia_cc_status_latest_totals.csv",
+        f"{quarter_dir}/2-process/ia_cc_status_prior_totals.csv",
+        f"{quarter_dir}/2-process/ia_cc_status_retired_totals.csv",
+        f"{quarter_dir}/2-process/ia_cc_totals_by_free_cultural.csv",
+        f"{quarter_dir}/2-process/ia_cc_totals_by_restrictions.csv",
+        f"{quarter_dir}/2-process/ia_open_source_totals.csv",
+    ]
+    passed &= _check_files(project_root, processed_files, "NOT CREATED")
+
+    # Test 6: Test report script
+    print("\n6. Testing report script...")
+    report_cmd = "pipenv run python scripts/3-report/internetarchive_report.py --enable-save"
+    if not _run_stage("Report", report_cmd, project_root):
+        return False
+
+    # Test 7: Check if report files were created
+    print("\n7. Checking report files...")
+    report_files = [
+        f"{quarter_dir}/3-report/ia_cc_product_totals.png",
+        f"{quarter_dir}/3-report/ia_cc_tool_status.png",
+        f"{quarter_dir}/3-report/ia_cc_status_latest_tools.png",
+        f"{quarter_dir}/3-report/ia_cc_status_prior_tools.png",
+        # Skipped: ia_cc_status_retired_tools.png (absent if no retired data)
+        f"{quarter_dir}/3-report/ia_cc_countries_highest_usage.png",
+        f"{quarter_dir}/3-report/ia_cc_languages_highest_usage.png",
+        f"{quarter_dir}/3-report/ia_cc_free_culture.png",
+        f"{quarter_dir}/3-report/ia_open_source_licenses.png",
+    ]
+    passed &= _check_files(project_root, report_files, "NOT CREATED")
+
+    # Test 8: Check if README was updated
+    print("\n8. Checking README update...")
+    readme_path = os.path.join(project_root, quarter_dir, "README.md")
+    if os.path.exists(readme_path):
+        print("[OK] README.md created")
+        # Binary read: the ASCII marker is found whatever the file encoding
+        with open(readme_path, "rb") as f:
+            content = f.read()
+        if b"Internet Archive (IA)" in content:
+            print("[OK] Internet Archive section found in README")
+        else:
+            print("[FAIL] Internet Archive section not found in README")
+            passed = False
+    else:
+        print("[FAIL] README.md not created")
+        passed = False
+
+    print("\n" + "=" * 50)
+    print("IA Pipeline Test Complete!")
+    print("=" * 50)
+
+    return passed
+
+if __name__ == "__main__":
+    # Mirror the pipeline result in both the summary line and the exit status
+    if test_ia_pipeline():
+        summary, exit_status = "\n[OK] All tests passed!", 0
+    else:
+        summary, exit_status = "\n[FAIL] Some tests failed!", 1
+    print(summary)
+    sys.exit(exit_status)