USGPO #70

Open
wants to merge 10 commits into main
8 changes: 8 additions & 0 deletions usgpo/README.md
@@ -0,0 +1,8 @@
# USGPO

Government documents published by the [US Government Publishing Office](http://www.gpo.gov). Since each of these documents is authored by the US Federal Government, they are all in the Public Domain.

# Data Collection

To collect the documents, run `bash usgpo/get-data.sh <govinfo-api-key> [start-date]` from the repo's top-level directory; the start date is in ISO8601 format and defaults to `1990-01-01'T'00:00:00'Z'`. Internally, this runs `get-links.py` to gather a collection of links to the government documents and `download-files.py` to download each link and parse out the relevant text. The final dataset is saved in `data/usgpo/v0`.

118 changes: 118 additions & 0 deletions usgpo/download-files.py
@@ -0,0 +1,118 @@
import argparse
Collaborator comment: Missing doc string

import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

import jsonlines
import trafilatura
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
from utils import api_query

from licensed_pile import logs
from licensed_pile.licenses import PermissiveLicenses
from licensed_pile.write import to_dolma

SOURCE_NAME = "usgpo"


def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--api-key", required=True, help="GovInfo API key")
parser.add_argument(
"--links-file", required=True, help="Path to links file (jsonl)"
)
parser.add_argument(
"--output-dir",
default=f"data/{SOURCE_NAME}/v0",
help="Path to output directory",
)
parser.add_argument(
"--filename",
default="usgpo.jsonl.gz",
help="The base filename for the USGPO Dolma dataset",
)
parser.add_argument(
"--shard-size", type=int, default=1, help="Size, in GB, for each shard"
)
parser.add_argument("--workers", type=int, default=10, help="Number of threads")
args = parser.parse_args()
return args


def download_file(api_key, file_url):
response = api_query(file_url, headers=None, params={"api_key": api_key})
text = response.text
return text


def parse_html(html):
    # Most documents are pre-formatted text inside a <pre> tag.
    # For the rest, we use trafilatura to extract the text as markdown.
soup = BeautifulSoup(html, "html.parser")
pre_tag = soup.find("pre")
Collaborator comment: parsed_text = pre_tag.get_text() if (pre_tag := soup.find("pre")) else text

if pre_tag:
text = pre_tag.get_text()
else:
text = trafilatura.extract(html, output_format="markdown")
return text


def construct_record(api_key, file):
logger = logs.get_logger("usgpo")
try:
links = file.get("links")
if links is None:
return None

file_url = links.get("txtLink")
        # Occasionally there will be multiple txtLinks pointing to the same URL. Just take the first.
if isinstance(file_url, list):
file_url = file_url[0]

if file_url is None:
return None

html = download_file(api_key, file_url)
text = parse_html(html)

if text is None or len(text) == 0:
return None

return {
"id": file["package_id"],
"title": file["title"],
"date": file["date"],
"author": file["author"],
"publisher": file["publisher"],
"category": file["category"],
"text": text,
"source": SOURCE_NAME,
"added": datetime.datetime.utcnow().isoformat(),
"metadata": {"license": str(PermissiveLicenses.PD), "url": file_url},
}

except Exception as e:
logger.error(f"Failed to download package {file['package_id']}: {e}")
return None


def generate_records(args):
with jsonlines.open(args.links_file, mode="r") as reader:
Collaborator comment: Is there a speed boost from using jsonlines? It seems simple enough to just call json.loads on each line.
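For comparison, a minimal stdlib-only sketch of the same loop (assuming `links.jsonl` holds one JSON object per line, as `get-links.py` writes it; the function name is illustrative and `construct_record` is the helper defined earlier in this file):

```python
import json
from concurrent.futures import ThreadPoolExecutor, as_completed


def generate_records_stdlib(args):
    # Parse each line of the links file with json.loads instead of jsonlines.
    with open(args.links_file, "r", encoding="utf-8") as f:
        files = [json.loads(line) for line in f if line.strip()]

    with ThreadPoolExecutor(max_workers=args.workers) as executor:
        futures = [
            executor.submit(construct_record, args.api_key, file) for file in files
        ]
        for future in as_completed(futures):
            record = future.result()
            if record is not None:
                yield record
```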

with ThreadPoolExecutor(max_workers=args.workers) as executor:
Collaborator comment: Is there a reason to use the ThreadPoolExecutor over multiprocessing.dummy.Pool?
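Both are thread-based, so the behaviour should be similar either way; for reference, a rough sketch of the `multiprocessing.dummy.Pool` variant (`generate_records_with_pool` and its `files` argument are hypothetical names, and `construct_record` is the helper above):

```python
from functools import partial
from multiprocessing.dummy import Pool  # thread-backed Pool, so no pickling is required


def generate_records_with_pool(args, files):
    # imap_unordered yields each result as its worker thread finishes,
    # roughly matching the as_completed() loop used below.
    with Pool(processes=args.workers) as pool:
        for record in pool.imap_unordered(
            partial(construct_record, args.api_key), files
        ):
            if record is not None:
                yield record
```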

futures = [
executor.submit(construct_record, args.api_key, file) for file in reader
]
for future in as_completed(futures):
record = future.result()
if record is not None:
yield record


def main(args):
to_dolma(generate_records(args), args.output_dir, args.filename, args.shard_size)


if __name__ == "__main__":
args = parse_args()
logs.configure_logging("usgpo")
main(args)
14 changes: 14 additions & 0 deletions usgpo/get-data.sh
@@ -0,0 +1,14 @@
set -e

api_key=${1}
start_date=${2:-"1990-01-01'T'00:00:00'Z'"}

USGPO_DIRECTORY="data/usgpo"

mkdir -p ${USGPO_DIRECTORY}/raw

echo "Getting Document Links"
python usgpo/get-links.py --api-key "${api_key}" --start-date "${start_date}" --output-dir "${USGPO_DIRECTORY}/raw"

echo "Downloading Documents"
python usgpo/download-files.py --api-key "${api_key}" --links-file "${USGPO_DIRECTORY}/raw/links.jsonl"
143 changes: 143 additions & 0 deletions usgpo/get-links.py
@@ -0,0 +1,143 @@
import argparse
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import jsonlines
from tqdm.auto import tqdm
from utils import api_query

from licensed_pile import logs


def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--api-key", required=True, help="GovInfo API key")
parser.add_argument(
"--start-date",
required=True,
help="Start date in ISO8601 format (yyyy-MM-dd'T'HH:mm:ss'Z')",
)
parser.add_argument("--output-dir", required=True, help="Path to output directory")
parser.add_argument("--workers", type=int, default=20, help="Number of threads")
parser.add_argument(
"--collections",
nargs="+",
default=[
Collaborator comment: prefer a tuple for read only things like this.

"BILLS",
"BUDGET",
"CDIR",
"CFR",
"CPD",
"CRI",
"CZIC",
"GAOREPORTS",
"GPO",
"HJOURNAL",
"HOB",
"PAI",
"PLAW",
"USCODE",
],
)
args = parser.parse_args()
return args


def get_packages(api_key, collections, start_date):
logger = logs.get_logger("usgpo")

url = f"https://api.govinfo.gov/published/{start_date}"
    # The GovInfo API paginates with an offsetMark token: "*" requests the first
    # page; after that we follow the nextPage URL, which already carries the next
    # offsetMark, so we stop sending one ourselves (requests drops None params).
    offset_mark = "*"
packages = []
pbar = tqdm()
while url is not None:
response = api_query(
url,
headers={"accept": "application/json"},
params={
"api_key": args.api_key,
"offsetMark": offset_mark,
"pageSize": 1000,
"collection": ",".join(collections),
},
)
if response.status_code == 200:
output = response.json()

for record in output["packages"]:
packages.append(record)
pbar.update(1)

            url = output["nextPage"]
            # nextPage already embeds the next offsetMark (and is None on the
            # last page, ending the loop), so clear our explicit parameter.
            offset_mark = None
Collaborator comment: Can you document this offset_mark a bit, being "*" the first time and None the rest is a bit confusing.

# Sleep since a sudden burst of requests seems to result in erroneous rate-limiting
time.sleep(5)
else:
logger.error(
f"get_packages received status code {response.status_code} for query {url}"
)
break
return packages


def get_file_links(api_key, package):
package_id = package["packageId"]
response = api_query(
f"https://api.govinfo.gov/packages/{package_id}/summary",
headers={"accept": "application/json"},
params={"api_key": args.api_key},
)
if response.status_code == 200:
output = response.json()
return output.get("download")
return None


def get_package_metadata(api_key, package):
record = {
"title": package.get("title"),
"package_id": package.get("packageId"),
"date": package.get("dateIssued"),
"category": package.get("category"),
"author": package.get("governmentAuthor1"),
"publisher": package.get("publisher"),
"links": get_file_links(api_key, package),
}
return record


def main(args):
logger = logs.get_logger("usgpo")
os.makedirs(args.output_dir, exist_ok=True)

# Get packages from the specified USGPO collections from `args.start_date` to current day
logger.info(f"Getting packages from the following collections: {args.collections}")
packages = get_packages(args.api_key, args.collections, args.start_date)

logger.info(f"Getting package metadata and writing out to {args.output_dir}")
with jsonlines.open(
os.path.join(args.output_dir, "links.jsonl"), mode="w", flush=True
) as writer:
# Spawn multiple worker threads to get the metadata associated with all packages
with ThreadPoolExecutor(max_workers=args.workers) as executor:
metadata_futures_to_package = {
executor.submit(get_package_metadata, args.api_key, package): package
for package in packages
}

# Write out package metadata to file
for metadata_future in tqdm(as_completed(metadata_futures_to_package)):
package = metadata_futures_to_package[metadata_future]
try:
record = metadata_future.result()
except Exception as e:
logger.error(f"Package {package} raised exception {e}")
continue
writer.write(record)


if __name__ == "__main__":
args = parse_args()
logs.configure_logging("usgpo")
main(args)
15 changes: 15 additions & 0 deletions usgpo/utils.py
@@ -0,0 +1,15 @@
import requests

Check failure on line 1 in usgpo/utils.py (GitHub Actions / isort): Imports are incorrectly sorted and/or formatted.
import time

from licensed_pile import logs


def api_query(endpoint, headers, params):
logger = logs.get_logger("usgpo")
response = requests.get(endpoint, headers=headers, params=params)
if response.status_code == 429:
# Sleep for an hour if we've hit the rate-limit
logger.info("Exceeded rate-limit, sleeping for one hour")
time.sleep(60 * 60)
response = requests.get(endpoint, headers=headers, params=params)
return response
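A small, hypothetical usage sketch of `api_query` (the collections endpoint and placeholder key here are assumptions for illustration, not part of this PR):

```python
if __name__ == "__main__":
    # Hypothetical example: list the available GovInfo collections and check
    # the status code before touching the payload, mirroring how the two
    # scripts above call api_query.
    response = api_query(
        "https://api.govinfo.gov/collections",
        headers={"accept": "application/json"},
        params={"api_key": "YOUR_GOVINFO_API_KEY"},
    )
    if response.status_code == 200:
        print(response.json())
```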