From 26ec2a21f64401a631e47aeceb1bf79c987aadf9 Mon Sep 17 00:00:00 2001 From: Mukesh-Ghildiyal Date: Wed, 29 Oct 2025 22:20:43 +0530 Subject: [PATCH] internet archive --- data/ia_license_mapping.csv | 108 ++++ scripts/1-fetch/internetarchive_fetch.py | 491 ++++++++++++++ scripts/2-process/internetarchive_process.py | 484 ++++++++++++++ scripts/3-report/internetarchive_report.py | 647 +++++++++++++++++++ test_ia_pipeline.py | 172 +++++ 5 files changed, 1902 insertions(+) create mode 100644 data/ia_license_mapping.csv create mode 100644 scripts/1-fetch/internetarchive_fetch.py create mode 100644 scripts/2-process/internetarchive_process.py create mode 100644 scripts/3-report/internetarchive_report.py create mode 100644 test_ia_pipeline.py diff --git a/data/ia_license_mapping.csv b/data/ia_license_mapping.csv new file mode 100644 index 00000000..97646dff --- /dev/null +++ b/data/ia_license_mapping.csv @@ -0,0 +1,108 @@ +IA_LICENSE_URL,NORMALIZED_LICENSE +http://creativecommons.org/licenses/by/4.0/,CC BY 4.0 +https://creativecommons.org/licenses/by/4.0/,CC BY 4.0 +http://creativecommons.org/licenses/by-sa/4.0/,CC BY-SA 4.0 +https://creativecommons.org/licenses/by-sa/4.0/,CC BY-SA 4.0 +http://creativecommons.org/licenses/by-nc/4.0/,CC BY-NC 4.0 +https://creativecommons.org/licenses/by-nc/4.0/,CC BY-NC 4.0 +http://creativecommons.org/licenses/by-nc-sa/4.0/,CC BY-NC-SA 4.0 +https://creativecommons.org/licenses/by-nc-sa/4.0/,CC BY-NC-SA 4.0 +http://creativecommons.org/licenses/by-nd/4.0/,CC BY-ND 4.0 +https://creativecommons.org/licenses/by-nd/4.0/,CC BY-ND 4.0 +http://creativecommons.org/licenses/by-nc-nd/4.0/,CC BY-NC-ND 4.0 +https://creativecommons.org/licenses/by-nc-nd/4.0/,CC BY-NC-ND 4.0 +http://creativecommons.org/licenses/by/3.0/,CC BY 3.0 +https://creativecommons.org/licenses/by/3.0/,CC BY 3.0 +http://creativecommons.org/licenses/by-sa/3.0/,CC BY-SA 3.0 +https://creativecommons.org/licenses/by-sa/3.0/,CC BY-SA 3.0 +http://creativecommons.org/licenses/by-nc/3.0/,CC 
BY-NC 3.0 +https://creativecommons.org/licenses/by-nc/3.0/,CC BY-NC 3.0 +http://creativecommons.org/licenses/by-nc-sa/3.0/,CC BY-NC-SA 3.0 +https://creativecommons.org/licenses/by-nc-sa/3.0/,CC BY-NC-SA 3.0 +http://creativecommons.org/licenses/by-nd/3.0/,CC BY-ND 3.0 +https://creativecommons.org/licenses/by-nd/3.0/,CC BY-ND 3.0 +http://creativecommons.org/licenses/by-nc-nd/3.0/,CC BY-NC-ND 3.0 +https://creativecommons.org/licenses/by-nc-nd/3.0/,CC BY-NC-ND 3.0 +http://creativecommons.org/licenses/by/2.5/,CC BY 2.5 +https://creativecommons.org/licenses/by/2.5/,CC BY 2.5 +http://creativecommons.org/licenses/by-sa/2.5/,CC BY-SA 2.5 +https://creativecommons.org/licenses/by-sa/2.5/,CC BY-SA 2.5 +http://creativecommons.org/licenses/by-nc/2.5/,CC BY-NC 2.5 +https://creativecommons.org/licenses/by-nc/2.5/,CC BY-NC 2.5 +http://creativecommons.org/licenses/by-nc-sa/2.5/,CC BY-NC-SA 2.5 +https://creativecommons.org/licenses/by-nc-sa/2.5/,CC BY-NC-SA 2.5 +http://creativecommons.org/licenses/by-nd/2.5/,CC BY-ND 2.5 +https://creativecommons.org/licenses/by-nd/2.5/,CC BY-ND 2.5 +http://creativecommons.org/licenses/by-nc-nd/2.5/,CC BY-NC-ND 2.5 +https://creativecommons.org/licenses/by-nc-nd/2.5/,CC BY-NC-ND 2.5 +http://creativecommons.org/licenses/by/2.1/,CC BY 2.1 +https://creativecommons.org/licenses/by/2.1/,CC BY 2.1 +http://creativecommons.org/licenses/by-sa/2.1/,CC BY-SA 2.1 +https://creativecommons.org/licenses/by-sa/2.1/,CC BY-SA 2.1 +http://creativecommons.org/licenses/by-nc/2.1/,CC BY-NC 2.1 +https://creativecommons.org/licenses/by-nc/2.1/,CC BY-NC 2.1 +http://creativecommons.org/licenses/by-nc-sa/2.1/,CC BY-NC-SA 2.1 +https://creativecommons.org/licenses/by-nc-sa/2.1/,CC BY-NC-SA 2.1 +http://creativecommons.org/licenses/by-nd/2.1/,CC BY-ND 2.1 +https://creativecommons.org/licenses/by-nd/2.1/,CC BY-ND 2.1 +http://creativecommons.org/licenses/by-nc-nd/2.1/,CC BY-NC-ND 2.1 +https://creativecommons.org/licenses/by-nc-nd/2.1/,CC BY-NC-ND 2.1 
+http://creativecommons.org/licenses/by/2.0/,CC BY 2.0 +https://creativecommons.org/licenses/by/2.0/,CC BY 2.0 +http://creativecommons.org/licenses/by-sa/2.0/,CC BY-SA 2.0 +https://creativecommons.org/licenses/by-sa/2.0/,CC BY-SA 2.0 +http://creativecommons.org/licenses/by-nc/2.0/,CC BY-NC 2.0 +https://creativecommons.org/licenses/by-nc/2.0/,CC BY-NC 2.0 +http://creativecommons.org/licenses/by-nc-sa/2.0/,CC BY-NC-SA 2.0 +https://creativecommons.org/licenses/by-nc-sa/2.0/,CC BY-NC-SA 2.0 +http://creativecommons.org/licenses/by-nd/2.0/,CC BY-ND 2.0 +https://creativecommons.org/licenses/by-nd/2.0/,CC BY-ND 2.0 +http://creativecommons.org/licenses/by-nc-nd/2.0/,CC BY-NC-ND 2.0 +https://creativecommons.org/licenses/by-nc-nd/2.0/,CC BY-NC-ND 2.0 +http://creativecommons.org/licenses/by/1.0/,CC BY 1.0 +https://creativecommons.org/licenses/by/1.0/,CC BY 1.0 +http://creativecommons.org/licenses/by-sa/1.0/,CC BY-SA 1.0 +https://creativecommons.org/licenses/by-sa/1.0/,CC BY-SA 1.0 +http://creativecommons.org/licenses/by-nc/1.0/,CC BY-NC 1.0 +https://creativecommons.org/licenses/by-nc/1.0/,CC BY-NC 1.0 +http://creativecommons.org/licenses/by-nc-sa/1.0/,CC BY-NC-SA 1.0 +https://creativecommons.org/licenses/by-nc-sa/1.0/,CC BY-NC-SA 1.0 +http://creativecommons.org/licenses/by-nd/1.0/,CC BY-ND 1.0 +https://creativecommons.org/licenses/by-nd/1.0/,CC BY-ND 1.0 +http://creativecommons.org/licenses/by-nc-nd/1.0/,CC BY-NC-ND 1.0 +https://creativecommons.org/licenses/by-nc-nd/1.0/,CC BY-NC-ND 1.0 +http://creativecommons.org/publicdomain/zero/1.0/,CC0 1.0 +https://creativecommons.org/publicdomain/zero/1.0/,CC0 1.0 +http://creativecommons.org/publicdomain/mark/1.0/,PDM 1.0 +https://creativecommons.org/publicdomain/mark/1.0/,PDM 1.0 +http://www.gnu.org/licenses/gpl-3.0.html,GPL-3.0 +https://www.gnu.org/licenses/gpl-3.0.html,GPL-3.0 +http://www.gnu.org/licenses/agpl-3.0.html,AGPL-3.0 +https://www.gnu.org/licenses/agpl-3.0.html,AGPL-3.0 +http://www.gnu.org/licenses/lgpl-3.0.html,LGPL-3.0 
+https://www.gnu.org/licenses/lgpl-3.0.html,LGPL-3.0 +http://www.gnu.org/licenses/gpl-2.0.html,GPL-2.0 +https://www.gnu.org/licenses/gpl-2.0.html,GPL-2.0 +http://www.gnu.org/licenses/lgpl-2.1.html,LGPL-2.1 +https://www.gnu.org/licenses/lgpl-2.1.html,LGPL-2.1 +http://opensource.org/licenses/MIT,MIT +https://opensource.org/licenses/MIT,MIT +http://opensource.org/licenses/Apache-2.0,Apache-2.0 +https://opensource.org/licenses/Apache-2.0,Apache-2.0 +http://opensource.org/licenses/BSD-3-Clause,BSD-3-Clause +https://opensource.org/licenses/BSD-3-Clause,BSD-3-Clause +http://opensource.org/licenses/BSD-2-Clause,BSD-2-Clause +https://opensource.org/licenses/BSD-2-Clause,BSD-2-Clause +http://opensource.org/licenses/ISC,ISC +https://opensource.org/licenses/ISC,ISC +http://opensource.org/licenses/Unlicense,Unlicense +https://opensource.org/licenses/Unlicense,Unlicense +http://www.opensource.org/licenses/Artistic-2.0,Artistic-2.0 +https://www.opensource.org/licenses/Artistic-2.0,Artistic-2.0 +http://www.opensource.org/licenses/MPL-2.0,MPL-2.0 +https://www.opensource.org/licenses/MPL-2.0,MPL-2.0 +http://www.opensource.org/licenses/EPL-2.0,EPL-2.0 +https://www.opensource.org/licenses/EPL-2.0,EPL-2.0 +http://www.opensource.org/licenses/CPL-1.0,CPL-1.0 +https://www.opensource.org/licenses/CPL-1.0,CPL-1.0 + diff --git a/scripts/1-fetch/internetarchive_fetch.py b/scripts/1-fetch/internetarchive_fetch.py new file mode 100644 index 00000000..2ab82224 --- /dev/null +++ b/scripts/1-fetch/internetarchive_fetch.py @@ -0,0 +1,491 @@ +#!/usr/bin/env python +""" +Fetch CC Legal Tool usage data from Internet Archive (IA) API. 
def parse_arguments():
    """
    Build the command-line interface and return the parsed namespace.

    Options:
        --limit        integer page size per query (default 1000)
        --max-items    integer cap on total items processed (default 100000)
        --enable-save  flag: write results
        --enable-git   flag: run git actions; only valid with --enable-save
        --dev          flag: development mode (fake data, no API calls)

    Exits through ``parser.error`` when --enable-git is given without
    --enable-save.
    """
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)

    # (flag, default, help) for the integer-valued options
    integer_options = (
        ("--limit", 1000, "Limit items per query (default: 1000)"),
        (
            "--max-items",
            100000,
            "Maximum total items to process (default: 100000)",
        ),
    )
    for flag, default, help_text in integer_options:
        parser.add_argument(flag, type=int, default=default, help=help_text)

    # (flag, help) for the boolean switches
    switch_options = (
        ("--enable-save", "Enable saving results"),
        (
            "--enable-git",
            "Enable git actions (fetch, merge, add, commit, and push)",
        ),
        ("--dev", "Development mode: avoid hitting API (generate fake data)"),
    )
    for flag, help_text in switch_options:
        parser.add_argument(flag, action="store_true", help=help_text)

    parsed = parser.parse_args()
    # Git actions commit saved files, so they make no sense without saving.
    if parsed.enable_git and not parsed.enable_save:
        parser.error("--enable-git requires --enable-save")
    return parsed
def create_default_license_mapping(mapping_file):
    """
    Write a default license mapping CSV to ``mapping_file``.

    The file gets the header IA_LICENSE_URL,NORMALIZED_LICENSE followed by
    an http and an https row for each known license location: the six CC
    4.0 licenses, CC0 1.0, PDM 1.0, the six CC 3.0 licenses, and a handful
    of other open licenses. The parent directory is created if missing.
    """
    os.makedirs(os.path.dirname(mapping_file), exist_ok=True)

    cc_units = ("by", "by-sa", "by-nc", "by-nc-sa", "by-nd", "by-nc-nd")

    def cc_licenses(version):
        # (scheme-less location, normalized name) for every unit of a version
        return [
            (
                f"creativecommons.org/licenses/{unit}/{version}/",
                f"CC {unit.upper()} {version}",
            )
            for unit in cc_units
        ]

    public_domain = [
        ("creativecommons.org/publicdomain/zero/1.0/", "CC0 1.0"),
        ("creativecommons.org/publicdomain/mark/1.0/", "PDM 1.0"),
    ]
    other_open = [
        ("www.gnu.org/licenses/gpl-3.0.html", "GPL-3.0"),
        ("www.gnu.org/licenses/agpl-3.0.html", "AGPL-3.0"),
        ("www.gnu.org/licenses/lgpl-3.0.html", "LGPL-3.0"),
        ("opensource.org/licenses/MIT", "MIT"),
        ("opensource.org/licenses/Apache-2.0", "Apache-2.0"),
        ("opensource.org/licenses/BSD-3-Clause", "BSD-3-Clause"),
    ]

    # Row order: CC 4.0, public domain tools, CC 3.0, other open licenses;
    # each location is emitted as http first, then https.
    locations = (
        cc_licenses("4.0") + public_domain + cc_licenses("3.0") + other_open
    )
    rows = [
        {
            "IA_LICENSE_URL": f"{scheme}://{location}",
            "NORMALIZED_LICENSE": name,
        }
        for location, name in locations
        for scheme in ("http", "https")
    ]

    with open(mapping_file, "w", newline="", encoding="utf-8") as file_obj:
        writer = csv.DictWriter(
            file_obj,
            fieldnames=["IA_LICENSE_URL", "NORMALIZED_LICENSE"],
            dialect="unix",
        )
        writer.writeheader()
        writer.writerows(rows)

    LOGGER.info(f"Created default license mapping file: {mapping_file}")
def normalize_license(license_url, license_mapping):
    """
    Normalize a license URL using the mapping table.

    Lookup order:
      1. exact match on the raw URL;
      2. exact match after canonicalization (letter case, http/https
         scheme, a leading "www." and trailing slashes are ignored);
      3. the longest mapping URL that is a path prefix of the canonical
         URL (covers suffixes such as "/legalcode" or a jurisdiction);
      4. fallback: the second-to-last "/"-separated segment of the raw
         URL, or the raw value itself when it contains no "/".

    A mapping URL that merely *contains* the input is deliberately not a
    match: a short or truncated input such as "http://creativecommons.org/"
    would otherwise be labelled with whichever mapping row comes first.

    Args:
        license_url: license URL string as reported for an item; may be
            empty or None.
        license_mapping: dict of known license URL -> normalized name.

    Returns:
        The normalized license name, "Unknown" for an empty/None URL, or
        the fallback segment when the URL is not covered by the mapping.
    """
    if not license_url:
        return "Unknown"

    # Fast path: the URL is listed verbatim
    if license_url in license_mapping:
        return license_mapping[license_url]

    def _canonical(url):
        # Reduce a URL to lowercase host + path with no trailing slash
        url = url.strip().lower()
        for scheme in ("https://", "http://"):
            if url.startswith(scheme):
                url = url[len(scheme):]
                break
        if url.startswith("www."):
            url = url[len("www."):]
        return url.rstrip("/")

    target = _canonical(license_url)
    best_key = ""
    best_name = None
    for ia_url, normalized in license_mapping.items():
        key = _canonical(ia_url)
        if not key:
            continue
        if target == key:
            return normalized
        # Require a "/" boundary so ".../by" never matches ".../by-nc/..."
        if target.startswith(key + "/") and len(key) > len(best_key):
            best_key, best_name = key, normalized
    if best_name is not None:
        return best_name

    # If no match found, return a cleaned version of the URL
    return license_url.split("/")[-2] if "/" in license_url else license_url
def main():
    """
    Orchestrate Internet Archive (IA) data collection.

    Steps: parse options, sync the repository, load the license mapping,
    initialize the output CSV files, page through the IA search API (or
    fake data in --dev mode) until --max-items is reached or the data runs
    out, then add/commit/push the results through the shared git helpers.
    """
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)
    # Sync with the remote before any data file is written, matching the
    # process script's main(); merging after local writes invites conflicts.
    shared.git_fetch_and_merge(args, PATHS["repo"])

    # Load license mapping
    license_mapping = load_license_mapping()

    # Initialize data files
    initialize_data_files(args)

    # Setup session
    session = get_requests_session()

    # Fetch and process data
    total_processed = 0
    offset = 0

    while total_processed < args.max_items:
        remaining = args.max_items - total_processed
        current_limit = min(args.limit, remaining)

        LOGGER.info(f"Processing batch: offset={offset}, limit={current_limit}")

        # query_ia_api takes its page size from args.limit, so give it a
        # shallow copy carrying the capped size; otherwise the last batch
        # overshoots --max-items.
        batch_args = copy(args)
        batch_args.limit = current_limit
        result = query_ia_api(batch_args, session, license_mapping, offset)

        # query_ia_api returns (docs, total_found) normally but a bare list
        # when the response is malformed; accept both instead of crashing
        # on tuple unpacking.
        if isinstance(result, tuple):
            docs, _total_found = result
        else:
            docs = result

        if not docs:
            LOGGER.info("No more data available")
            break

        # Process the batch
        batch_processed = process_ia_data(args, docs, license_mapping)
        total_processed += batch_processed

        LOGGER.info(f"Processed {batch_processed} items (total: {total_processed})")

        # A short page means the result set is exhausted
        if len(docs) < current_limit:
            LOGGER.info("Reached end of available data")
            break

        # Update offset for next batch
        offset += len(docs)

        # Rate limiting
        if not args.dev:
            time.sleep(1)  # Be respectful to the API

    LOGGER.info(f"Data collection completed. Total items processed: {total_processed}")

    # Git operations
    args = shared.git_add_and_commit(
        args,
        PATHS["repo"],
        PATHS["data_quarter"],
        f"Add and commit new Internet Archive (IA) data for {QUARTER}",
    )
    shared.git_push_changes(args, PATHS["repo"])
def process_cc_license_totals(args, count_data):
    """
    Processing count data: totals by CC license categories
    """
    # The docstring above doubles as the log message emitted below, so it
    # is kept as a single summary line; details are documented here.
    #
    # args:       parsed options, passed through to data_to_csv
    # count_data: DataFrame with NORMALIZED_LICENSE and COUNT columns
    # Output:     ia_cc_product_totals.csv (via data_to_csv), one row per
    #             category in the fixed order declared in `totals`.
    LOGGER.info(process_cc_license_totals.__doc__.strip())

    # Keep CC legal tools only. "PDM 1.0" has no "CC" in its name, so it is
    # matched explicitly; a plain "CC" filter silently dropped every Public
    # Domain Mark row and left that category at 0.
    names = count_data["NORMALIZED_LICENSE"]
    cc_data = count_data[names.str.contains(r"CC|^PDM", na=False)]

    totals = {
        "Licenses version 4.0": 0,
        "Licenses version 3.0": 0,
        "Licenses version 2.x": 0,
        "Licenses version 1.0": 0,
        "CC0 1.0": 0,
        "Public Domain Mark 1.0": 0,
        "Other CC licenses": 0,
    }
    # Checked in order; the first marker found in the name wins
    version_buckets = (
        ("4.0", "Licenses version 4.0"),
        ("3.0", "Licenses version 3.0"),
        ("2.", "Licenses version 2.x"),
        ("1.0", "Licenses version 1.0"),
    )

    # Read columns by name so the result does not depend on column order
    for license_name, count in zip(
        cc_data["NORMALIZED_LICENSE"], cc_data["COUNT"]
    ):
        if license_name.startswith("PDM"):
            key = "Public Domain Mark 1.0"
        elif "CC0" in license_name:
            key = "CC0 1.0"
        else:
            key = next(
                (
                    label
                    for marker, label in version_buckets
                    if marker in license_name
                ),
                "Other CC licenses",
            )
        totals[key] += count

    data = pd.DataFrame(
        list(totals.items()), columns=["CC legal tool product", "Count"]
    )
    file_path = shared.path_join(PATHS["data_phase"], "ia_cc_product_totals.csv")
    data_to_csv(args, data, file_path)
def process_totals_by_free_cultural(args, count_data):
    """
    Processing count data: totals by Approved for Free Cultural Works
    """
    # The docstring above doubles as the log message emitted below, so it
    # is kept as a single summary line; details are documented here.
    #
    # args:       parsed options, passed through to data_to_csv
    # count_data: DataFrame with NORMALIZED_LICENSE and COUNT columns
    # Output:     ia_cc_totals_by_free_cultural.csv (via data_to_csv) with
    #             the two categories sorted by descending count.
    LOGGER.info(process_totals_by_free_cultural.__doc__.strip())

    # Keep CC legal tools only. "PDM 1.0" has no "CC" in its name, so it is
    # matched explicitly; a plain "CC" filter dropped every Public Domain
    # Mark row before the PDM check below could ever see it.
    names = count_data["NORMALIZED_LICENSE"]
    cc_data = count_data[names.str.contains(r"CC|^PDM", na=False)]

    approved = "Approved for Free Cultural Works"
    limited = "Limited use"
    totals = {approved: 0, limited: 0}
    # License units (second word of the name, lowercased) that count as approved
    approved_units = ("by-sa", "by", "sa", "sampling+")

    # Read columns by name so the result does not depend on column order
    for license_name, count in zip(
        cc_data["NORMALIZED_LICENSE"], cc_data["COUNT"]
    ):
        if (
            license_name.startswith("PDM")
            or "CC0" in license_name
            or "PUBLICDOMAIN" in license_name
        ):
            key = approved
        else:
            parts = license_name.split()
            # Names without a unit word cannot be classified as approved
            if len(parts) > 1 and parts[1].lower() in approved_units:
                key = approved
            else:
                key = limited
        totals[key] += count

    data = pd.DataFrame(list(totals.items()), columns=["Category", "Count"])
    data.sort_values("Count", ascending=False, inplace=True)
    data.reset_index(drop=True, inplace=True)
    file_path = shared.path_join(
        PATHS["data_phase"], "ia_cc_totals_by_free_cultural.csv"
    )
    data_to_csv(args, data, file_path)
def process_totals_by_country(args, country_data):
    """
    Processing country data: totals by country
    """
    # The docstring above is reused verbatim as the log message.
    LOGGER.info(process_totals_by_country.__doc__.strip())

    # Keep only rows whose normalized license name mentions "CC"; rows with
    # a missing license name are treated as non-matching.
    cc_mask = country_data["NORMALIZED_LICENSE"].str.contains("CC", na=False)

    # Sum COUNT per country, largest first, with a fresh 0..n-1 index and
    # report-friendly column names.
    totals = (
        country_data[cc_mask]
        .groupby(["COUNTRY"], as_index=False)["COUNT"]
        .sum()
        .sort_values("COUNT", ascending=False)
        .reset_index(drop=True)
        .rename(columns={"COUNTRY": "Country", "COUNT": "Count"})
    )

    output_path = shared.path_join(
        PATHS["data_phase"], "ia_cc_totals_by_country.csv"
    )
    data_to_csv(args, totals, output_path)
def main():
    """
    Main function to process IA data.
    """
    args = parse_arguments()
    shared.paths_log(LOGGER, PATHS)
    shared.git_fetch_and_merge(args, PATHS["repo"])

    # Count data (required): without it nothing can be processed, so fail
    # with a non-zero exit status instead of logging and returning, which
    # made the script exit 0 and look successful to callers.
    file1_count = shared.path_join(PATHS["data_1-fetch"], "ia_1_count.csv")
    if not os.path.exists(file1_count):
        raise shared.QuantifyingException(
            f"Count data file not found: {file1_count}", 1
        )

    count_data = pd.read_csv(
        file1_count, usecols=["LICENSE_URL", "NORMALIZED_LICENSE", "COUNT"]
    )

    # Process various aggregations
    process_license_totals(args, count_data)
    process_cc_license_totals(args, count_data)
    process_latest_prior_retired_totals(args, count_data)
    process_totals_by_free_cultural(args, count_data)
    process_totals_by_restrictions(args, count_data)
    process_open_source_totals(args, count_data)
    process_media_type_totals(args, count_data)

    # Language data (optional): skipped when the fetch phase did not
    # produce it, but the skip is logged so it is not silent.
    file2_language = shared.path_join(
        PATHS["data_1-fetch"], "ia_2_count_by_language.csv"
    )
    if os.path.exists(file2_language):
        language_data = pd.read_csv(
            file2_language,
            usecols=["LICENSE_URL", "NORMALIZED_LICENSE", "LANGUAGE", "COUNT"],
        )
        process_totals_by_language(args, language_data)
    else:
        LOGGER.warning(f"Language data file not found: {file2_language}")

    # Country data (optional): same handling as the language data.
    file3_country = shared.path_join(
        PATHS["data_1-fetch"], "ia_3_count_by_country.csv"
    )
    if os.path.exists(file3_country):
        country_data = pd.read_csv(
            file3_country,
            usecols=["LICENSE_URL", "NORMALIZED_LICENSE", "COUNTRY", "COUNT"],
        )
        process_totals_by_country(args, country_data)
    else:
        LOGGER.warning(f"Country data file not found: {file3_country}")

    # Git operations
    args = shared.git_add_and_commit(
        args,
        PATHS["repo"],
        PATHS["data_quarter"],
        f"Add and commit processed Internet Archive (IA) data for {QUARTER}",
    )
    shared.git_push_changes(args, PATHS["repo"])
def ia_intro(args):
    """
    Write Internet Archive (IA) introduction.
    """
    # The docstring above is reused verbatim as the log message.
    LOGGER.info(ia_intro.__doc__.strip())

    # Try to get total count from license totals
    file_path = shared.path_join(
        PATHS["data_2-process"],
        "ia_license_totals.csv",
    )

    # Fallback shown in the README when no total can be computed
    total_count = "N/A"
    if os.path.exists(file_path):
        LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
        data = pd.read_csv(file_path)
        # The other processed IA CSVs name this column "Count"; accept both
        # spellings so a renamed column does not crash the whole report.
        # NOTE(review): confirm which name process_license_totals writes.
        count_column = next(
            (name for name in ("COUNT", "Count") if name in data.columns),
            None,
        )
        if count_column is None:
            LOGGER.warning(f"No count column found in {file_path}")
        else:
            # int(): the "d" format code rejects floats, which pandas
            # returns when the column contains missing values.
            total_count = f"{int(data[count_column].sum()):,d}"

    shared.update_readme(
        args,
        SECTION,
        "Overview",
        None,
        None,
        "Internet Archive (IA) data uses the Advanced Search API to query for"
        " items with Creative Commons and open source licenses. The data includes"
        " license information, language, country, and media type metadata.\n"
        "\n"
        f"**The results indicate there are a total of {total_count} items in the"
        " Internet Archive that are licensed or in the public domain using"
        " Creative Commons or open source legal tools.**\n"
        "\n"
        "Thank you Internet Archive for providing access to this valuable"
        " cultural heritage data!\n",
    )
def plot_latest_tools(args):
    """
    Create plots for latest CC legal tool totals and percentages
    """
    # The docstring above is reused verbatim as the log message.
    LOGGER.info(plot_latest_tools.__doc__.strip())
    csv_path = shared.path_join(
        PATHS["data_2-process"],
        "ia_cc_status_latest_totals.csv",
    )

    # Nothing to plot when the process phase did not write its output
    if not os.path.exists(csv_path):
        LOGGER.warning(f"Data file not found: {csv_path}")
        return

    LOGGER.info(f"data file: {csv_path.replace(PATHS['repo'], '.')}")
    name_label = "CC legal tool"
    totals = pd.read_csv(csv_path, index_col=name_label)

    # Skip the plot when there are no rows or every count is zero
    has_counts = not totals.empty and totals["Count"].sum() != 0
    if not has_counts:
        LOGGER.warning(f"No data found in {csv_path}, skipping plot")
        return

    # Order rows by tool name, descending
    totals = totals.sort_values(name_label, ascending=False)

    title = "Latest CC legal tools"
    figure = plot.combined_plot(
        args=args,
        data=totals,
        title=title,
        name_label=name_label,
        data_label="Count",
    )

    png_path = shared.path_join(
        PATHS["data_phase"], "ia_cc_status_latest_tools.png"
    )
    LOGGER.info(f"image file: {png_path.replace(PATHS['repo'], '.')}")

    if args.enable_save:
        # Create the directory if it does not exist
        os.makedirs(PATHS["data_phase"], exist_ok=True)
        figure.savefig(png_path)

    description = (
        "Plots showing latest Creative Commons (CC) legal tool totals and"
        " percentages from Internet Archive."
    )
    shared.update_readme(args, SECTION, title, png_path, description)
def plot_countries_highest_usage(args):
    """
    Create plots for the countries with highest usage of CC tools
    """
    # The docstring above is reused verbatim as the log message; it now
    # matches the plot title (all CC tools, not only the latest versions).
    LOGGER.info(plot_countries_highest_usage.__doc__.strip())
    file_path = shared.path_join(
        PATHS["data_2-process"], "ia_cc_totals_by_country.csv"
    )

    if not os.path.exists(file_path):
        LOGGER.warning(f"Data file not found: {file_path}")
        return

    LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
    name_label = "Country"
    data_label = "Count"
    data = pd.read_csv(file_path, index_col=name_label)

    # Same guard as the CC status plots: an empty or all-zero table has
    # nothing meaningful to draw.
    if data.empty or data[data_label].sum() == 0:
        LOGGER.warning(f"No data found in {file_path}, skipping plot")
        return

    # Total over the complete data set, taken before truncating to the top
    # 10. int(): the "d" format code rejects floats.
    total_count = f"{int(data[data_label].sum()):,d}"
    data = data.sort_values(data_label, ascending=False)
    data = data[:10]  # limit to highest 10
    data = data[::-1]  # reverse order

    title = "Countries with highest usage of CC tools"
    plt = plot.combined_plot(
        args=args,
        data=data,
        title=title,
        name_label=name_label,
        data_label=data_label,
        bar_xscale="log",
    )

    image_path = shared.path_join(
        PATHS["data_phase"], "ia_cc_countries_highest_usage.png"
    )
    LOGGER.info(f"image file: {image_path.replace(PATHS['repo'], '.')}")

    if args.enable_save:
        # Create the directory if it does not exist
        os.makedirs(PATHS["data_phase"], exist_ok=True)
        plt.savefig(image_path)

    shared.update_readme(
        args,
        SECTION,
        title,
        image_path,
        "Plots showing countries with the highest usage of Creative Commons"
        " (CC) legal tools from Internet Archive.",
        f"The complete data set indicates there are a total of {total_count}"
        " items using CC legal tools in the Internet Archive.",
    )
def plot_free_culture(args):
    """
    Create plots for Approved for Free Cultural Works
    """
    # The docstring above is reused verbatim as the log message.
    LOGGER.info(plot_free_culture.__doc__.strip())
    csv_path = shared.path_join(
        PATHS["data_2-process"],
        "ia_cc_totals_by_free_cultural.csv",
    )

    # Nothing to plot when the process phase did not write its output
    if not os.path.exists(csv_path):
        LOGGER.warning(f"Data file not found: {csv_path}")
        return

    LOGGER.info(f"data file: {csv_path.replace(PATHS['repo'], '.')}")
    name_label = "Category"
    data_label = "Count"
    totals = pd.read_csv(csv_path, index_col=name_label)

    title = "Approved for Free Cultural Works"
    figure = plot.combined_plot(
        args=args,
        data=totals,
        title=title,
        name_label=name_label,
        data_label=data_label,
    )

    png_path = shared.path_join(PATHS["data_phase"], "ia_cc_free_culture.png")
    LOGGER.info(f"image file: {png_path.replace(PATHS['repo'], '.')}")

    if args.enable_save:
        # Create the directory if it does not exist
        os.makedirs(PATHS["data_phase"], exist_ok=True)
        figure.savefig(png_path)

    # README text: a short description of the image, then the quoted
    # Creative Commons definition of a free cultural work.
    description = (
        "Plots showing Approved for Free Cultural Works legal tool usage"
        " from Internet Archive."
    )
    definition = (
        "[Understanding Free Cultural Works - Creative"
        " Commons](https://creativecommons.org/public-domain/freeworks/):\n"
        "\n"
        '> Using [the Freedom Defined definition of a "Free Cultural Work"],'
        " material licensed under CC BY or BY-SA is a free cultural work. (So"
        " is anything in the worldwide public domain marked with CC0 or the"
        " Public Domain Mark.) CC's other licenses– BY-NC, BY-ND, BY-NC-SA,"
        " and BY-NC-ND–only allow more limited uses, and material under these"
        " licenses is not considered a free cultural work."
    )
    shared.update_readme(
        args, SECTION, title, png_path, description, definition
    )
+ """ + args = parse_arguments() + shared.paths_log(LOGGER, PATHS) + shared.git_fetch_and_merge(args, PATHS["repo"]) + + ia_intro(args) + plot_cc_products(args) + plot_cc_tool_status(args) + plot_latest_tools(args) + plot_prior_tools(args) + plot_retired_tools(args) + plot_countries_highest_usage(args) + plot_languages_highest_usage(args) + plot_free_culture(args) + plot_open_source_licenses(args) + + args = shared.git_add_and_commit( + args, + PATHS["repo"], + PATHS["data_quarter"], + f"Add and commit Internet Archive (IA) reports for {QUARTER}", + ) + shared.git_push_changes(args, PATHS["repo"]) + + +if __name__ == "__main__": + try: + main() + except shared.QuantifyingException as e: + if e.exit_code == 0: + LOGGER.info(e.message) + else: + LOGGER.error(e.message) + sys.exit(e.exit_code) + except SystemExit as e: + if e.code != 0: + LOGGER.error(f"System exit with code: {e.code}") + sys.exit(e.code) + except KeyboardInterrupt: + LOGGER.info("(130) Halted via KeyboardInterrupt.") + sys.exit(130) + except Exception: + traceback_formatted = textwrap.indent( + highlight( + traceback.format_exc(), + PythonTracebackLexer(), + TerminalFormatter(), + ), + " ", + ) + LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}") + sys.exit(1) diff --git a/test_ia_pipeline.py b/test_ia_pipeline.py new file mode 100644 index 00000000..f9b17d68 --- /dev/null +++ b/test_ia_pipeline.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python +""" +Test script for the Internet Archive (IA) pipeline. +This script tests the complete IA data collection, processing, and reporting pipeline. 
def test_ia_pipeline():
    """Test the complete IA pipeline.

    Runs the fetch, process, and report scripts in order through
    run_command and checks that each phase produced its expected files.
    A failing script stops the run immediately because later phases depend
    on it; a missing output file is reported and the run continues.

    Returns:
        bool: True only if every script exited with status 0 and every
        expected file and the README section were found; False otherwise.
    """
    # Local import: only this function needs the date.
    from datetime import date

    print("Testing Internet Archive (IA) Pipeline")
    print("=" * 50)

    # Get the current directory (should be the project root)
    project_root = os.getcwd()
    print(f"Project root: {project_root}")

    # The scripts are run without --quarter, so look in the directory for
    # the current calendar quarter instead of a hard-coded one.
    # NOTE(review): assumes the scripts default to the current quarter in
    # YYYYQx form -- confirm against shared.setup.
    today = date.today()
    quarter = f"{today.year}Q{(today.month - 1) // 3 + 1}"
    data_dir = f"data/{quarter}"

    # Cleared by any failed check so the result reflects every step, not
    # only the script exit codes.
    passed = True

    # Test 1: Check if all required files exist
    print("\n1. Checking required files...")
    required_files = [
        "scripts/1-fetch/internetarchive_fetch.py",
        "scripts/2-process/internetarchive_process.py",
        "scripts/3-report/internetarchive_report.py",
        "data/ia_license_mapping.csv",
    ]

    for file_path in required_files:
        full_path = os.path.join(project_root, file_path)
        if os.path.exists(full_path):
            print(f"[OK] {file_path}")
        else:
            print(f"[FAIL] {file_path} - MISSING")
            return False

    # Test 2: Test fetch script with development mode
    print("\n2. Testing fetch script (development mode)...")
    fetch_cmd = "pipenv run python scripts/1-fetch/internetarchive_fetch.py --dev --enable-save --limit 10 --max-items 50"
    result = run_command(fetch_cmd, cwd=project_root)

    if result.returncode != 0:
        print("[FAIL] Fetch script failed")
        return False
    print("[OK] Fetch script completed successfully")

    # Test 3: Check if fetch data files were created
    print("\n3. Checking fetch data files...")
    data_files = [
        f"{data_dir}/1-fetch/ia_1_count.csv",
        f"{data_dir}/1-fetch/ia_2_count_by_language.csv",
        f"{data_dir}/1-fetch/ia_3_count_by_country.csv",
    ]

    for file_path in data_files:
        full_path = os.path.join(project_root, file_path)
        if os.path.exists(full_path):
            print(f"[OK] {file_path}")
            # Check if file has content
            with open(full_path, "r", encoding="utf-8") as f:
                lines = f.readlines()
            if len(lines) > 1:  # More than just header
                print(f"  - Contains {len(lines)-1} data rows")
            else:
                print("  - Warning: Only header row found")
        else:
            print(f"[FAIL] {file_path} - NOT CREATED")
            passed = False

    # Test 4: Test process script
    print("\n4. Testing process script...")
    process_cmd = "pipenv run python scripts/2-process/internetarchive_process.py --enable-save"
    result = run_command(process_cmd, cwd=project_root)

    if result.returncode != 0:
        print("[FAIL] Process script failed")
        return False
    print("[OK] Process script completed successfully")

    # Test 5: Check if processed data files were created
    print("\n5. Checking processed data files...")
    processed_files = [
        f"{data_dir}/2-process/ia_license_totals.csv",
        f"{data_dir}/2-process/ia_cc_product_totals.csv",
        f"{data_dir}/2-process/ia_cc_status_combined_totals.csv",
        f"{data_dir}/2-process/ia_cc_status_latest_totals.csv",
        f"{data_dir}/2-process/ia_cc_status_prior_totals.csv",
        f"{data_dir}/2-process/ia_cc_status_retired_totals.csv",
        f"{data_dir}/2-process/ia_cc_totals_by_free_cultural.csv",
        f"{data_dir}/2-process/ia_cc_totals_by_restrictions.csv",
        f"{data_dir}/2-process/ia_open_source_totals.csv",
    ]

    for file_path in processed_files:
        full_path = os.path.join(project_root, file_path)
        if os.path.exists(full_path):
            print(f"[OK] {file_path}")
        else:
            print(f"[FAIL] {file_path} - NOT CREATED")
            passed = False

    # Test 6: Test report script
    print("\n6. Testing report script...")
    report_cmd = "pipenv run python scripts/3-report/internetarchive_report.py --enable-save"
    result = run_command(report_cmd, cwd=project_root)

    if result.returncode != 0:
        print("[FAIL] Report script failed")
        return False
    print("[OK] Report script completed successfully")

    # Test 7: Check if report files were created
    # ia_cc_status_retired_tools.png is deliberately not required: the
    # report skips that plot when there is no retired tools data.
    print("\n7. Checking report files...")
    report_files = [
        f"{data_dir}/3-report/ia_cc_product_totals.png",
        f"{data_dir}/3-report/ia_cc_tool_status.png",
        f"{data_dir}/3-report/ia_cc_status_latest_tools.png",
        f"{data_dir}/3-report/ia_cc_status_prior_tools.png",
        f"{data_dir}/3-report/ia_cc_countries_highest_usage.png",
        f"{data_dir}/3-report/ia_cc_languages_highest_usage.png",
        f"{data_dir}/3-report/ia_cc_free_culture.png",
        f"{data_dir}/3-report/ia_open_source_licenses.png",
    ]

    for file_path in report_files:
        full_path = os.path.join(project_root, file_path)
        if os.path.exists(full_path):
            print(f"[OK] {file_path}")
        else:
            print(f"[FAIL] {file_path} - NOT CREATED")
            passed = False

    # Test 8: Check if README was updated
    print("\n8. Checking README update...")
    readme_path = os.path.join(project_root, f"{data_dir}/README.md")
    if os.path.exists(readme_path):
        print("[OK] README.md created")
        with open(readme_path, "r", encoding="utf-8") as f:
            content = f.read()
        if "Internet Archive (IA)" in content:
            print("[OK] Internet Archive section found in README")
        else:
            print("[FAIL] Internet Archive section not found in README")
            passed = False
    else:
        print("[FAIL] README.md not created")
        passed = False

    print("\n" + "=" * 50)
    print("IA Pipeline Test Complete!")
    print("=" * 50)

    return passed