USGPO #70
base: main
Changes from all commits

usgpo/README.md
@@ -0,0 +1,8 @@
# USGPO

Government documents published by the [US Government Publishing Office](http://www.gpo.gov). Since each of these documents is authored by the US Federal Government, they are in the Public Domain.

# Data Collection

To collect the documents, run the script `usgpo/get-data.sh` from the repo's top-level directory. Internally, this runs `get-links.py` to collect links to the government documents and `download-files.py` to download each link and parse out the relevant text. The final dataset is saved in `data/usgpo/v0`.
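For example (with `YOUR_API_KEY` as a hypothetical placeholder for a real GovInfo API key), a minimal run from the repo root would be `bash usgpo/get-data.sh YOUR_API_KEY`; see `get-data.sh` below for the optional start-date argument.
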
usgpo/download-files.py
@@ -0,0 +1,118 @@
import argparse
import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

import jsonlines
import trafilatura
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
from utils import api_query

from licensed_pile import logs
from licensed_pile.licenses import PermissiveLicenses
from licensed_pile.write import to_dolma

SOURCE_NAME = "usgpo"


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--api-key", required=True, help="GovInfo API key")
    parser.add_argument(
        "--links-file", required=True, help="Path to links file (jsonl)"
    )
    parser.add_argument(
        "--output-dir",
        default=f"data/{SOURCE_NAME}/v0",
        help="Path to output directory",
    )
    parser.add_argument(
        "--filename",
        default="usgpo.jsonl.gz",
        help="The base filename for the USGPO Dolma dataset",
    )
    parser.add_argument(
        "--shard-size", type=int, default=1, help="Size, in GB, for each shard"
    )
    parser.add_argument("--workers", type=int, default=10, help="Number of threads")
    args = parser.parse_args()
    return args


def download_file(api_key, file_url):
    response = api_query(file_url, headers=None, params={"api_key": api_key})
    return response.text


def parse_html(html):
    # Most documents are pre-formatted text inside of a <pre> tag.
    # For the rest, use trafilatura to extract markdown.
    soup = BeautifulSoup(html, "html.parser")
    pre_tag = soup.find("pre")
    if pre_tag:
        text = pre_tag.get_text()
    else:
        text = trafilatura.extract(html, output_format="markdown")
    return text

Review comment (on `pre_tag = soup.find("pre")`): parsed_text = pre_tag.get_text() if (pre_tag := soup.find("pre")) else text

def construct_record(api_key, file):
    logger = logs.get_logger("usgpo")
    try:
        links = file.get("links")
        if links is None:
            return None

        file_url = links.get("txtLink")
        # Occasionally there will be multiple txtLinks pointing to the same URL. Just take the first.
        if isinstance(file_url, list):
            file_url = file_url[0]

        if file_url is None:
            return None

        html = download_file(api_key, file_url)
        text = parse_html(html)

        if text is None or len(text) == 0:
            return None

        return {
            "id": file["package_id"],
            "title": file["title"],
            "date": file["date"],
            "author": file["author"],
            "publisher": file["publisher"],
            "category": file["category"],
            "text": text,
            "source": SOURCE_NAME,
            "added": datetime.datetime.utcnow().isoformat(),
            "metadata": {"license": str(PermissiveLicenses.PD), "url": file_url},
        }

    except Exception as e:
        logger.error(f"Failed to download package {file['package_id']}: {e}")
        return None


def generate_records(args):
    with jsonlines.open(args.links_file, mode="r") as reader:
        with ThreadPoolExecutor(max_workers=args.workers) as executor:
            futures = [
                executor.submit(construct_record, args.api_key, file)
                for file in reader
            ]
            for future in as_completed(futures):
                record = future.result()
                if record is not None:
                    yield record

Review comment (on `jsonlines.open`): Is there a speed boost from using jsonlines? It seems simple enough to just call …
Review comment (on `ThreadPoolExecutor`): Is there a reason to use the ThreadPoolExecutor over …


def main(args):
    to_dolma(generate_records(args), args.output_dir, args.filename, args.shard_size)


if __name__ == "__main__":
    args = parse_args()
    logs.configure_logging("usgpo")
    main(args)
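
Regarding the jsonlines question above: a minimal sketch of the stdlib alternative the reviewer seems to be hinting at, assuming each line of the links file is a single JSON object (the format `get-links.py` writes). `read_links` is a hypothetical helper, not part of the PR:

```python
import json


def read_links(links_file):
    # Hypothetical drop-in for the jsonlines reader used in generate_records.
    with open(links_file, mode="r", encoding="utf-8") as f:
        for line in f:
            if line.strip():  # skip blank lines
                yield json.loads(line)
```
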
usgpo/get-data.sh
@@ -0,0 +1,14 @@
set -e

api_key=${1}
# Default start date; the API expects a literal ISO8601 timestamp
# (yyyy-MM-dd'T'HH:mm:ss'Z' is the pattern, not the value).
start_date=${2:-"1990-01-01T00:00:00Z"}

USGPO_DIRECTORY="data/usgpo"

mkdir -p "${USGPO_DIRECTORY}/raw"

echo "Getting Document Links"
python get-links.py --api-key "${api_key}" --start-date "${start_date}" --output-dir "${USGPO_DIRECTORY}/raw"

echo "Downloading Documents"
python download-files.py --api-key "${api_key}" --links-file "${USGPO_DIRECTORY}/raw/links.jsonl"
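
For example, `bash usgpo/get-data.sh YOUR_API_KEY "2000-01-01T00:00:00Z"` (again with a placeholder key) would collect only documents published since 2000, matching the ISO8601 shape of the default.
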
usgpo/get-links.py
@@ -0,0 +1,143 @@
import argparse
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import jsonlines
from tqdm.auto import tqdm
from utils import api_query

from licensed_pile import logs


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--api-key", required=True, help="GovInfo API key")
    parser.add_argument(
        "--start-date",
        required=True,
        help="Start date in ISO8601 format (yyyy-MM-dd'T'HH:mm:ss'Z')",
    )
    parser.add_argument("--output-dir", required=True, help="Path to output directory")
    parser.add_argument("--workers", type=int, default=20, help="Number of threads")
    parser.add_argument(
        "--collections",
        nargs="+",
        default=[
            "BILLS",
            "BUDGET",
            "CDIR",
            "CFR",
            "CPD",
            "CRI",
            "CZIC",
            "GAOREPORTS",
            "GPO",
            "HJOURNAL",
            "HOB",
            "PAI",
            "PLAW",
            "USCODE",
        ],
    )
    args = parser.parse_args()
    return args

Review comment (on the default collections list): prefer a tuple for read only things like this.
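
A minimal sketch of the tuple the review comment asks for, keeping the same default collections in an immutable container (`DEFAULT_COLLECTIONS` is a hypothetical name, not part of the PR):

```python
import argparse

# A tuple signals the default is read-only and cannot be mutated in place.
DEFAULT_COLLECTIONS = (
    "BILLS", "BUDGET", "CDIR", "CFR", "CPD", "CRI", "CZIC",
    "GAOREPORTS", "GPO", "HJOURNAL", "HOB", "PAI", "PLAW", "USCODE",
)

parser = argparse.ArgumentParser()
parser.add_argument("--collections", nargs="+", default=DEFAULT_COLLECTIONS)
```
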

def get_packages(api_key, collections, start_date):
    logger = logs.get_logger("usgpo")

    url = f"https://api.govinfo.gov/published/{start_date}"
    offset_mark = "*"
    packages = []
    pbar = tqdm()
    while url is not None:
        response = api_query(
            url,
            headers={"accept": "application/json"},
            params={
                "api_key": api_key,
                "offsetMark": offset_mark,
                "pageSize": 1000,
                "collection": ",".join(collections),
            },
        )
        if response.status_code == 200:
            output = response.json()

            for record in output["packages"]:
                packages.append(record)
                pbar.update(1)

            url = output["nextPage"]
            # The nextPage URL returned by the API already carries the next
            # offsetMark, so drop the explicit parameter from here on
            # (requests omits params whose value is None).
            offset_mark = None

            # Sleep since a sudden burst of requests seems to result in erroneous rate-limiting
            time.sleep(5)
        else:
            logger.error(
                f"get_packages received status code {response.status_code} for query {url}"
            )
            break
    return packages

Review comment (on `offset_mark = None`): Can you document this …


def get_file_links(api_key, package):
    package_id = package["packageId"]
    response = api_query(
        f"https://api.govinfo.gov/packages/{package_id}/summary",
        headers={"accept": "application/json"},
        params={"api_key": api_key},
    )
    if response.status_code == 200:
        output = response.json()
        return output.get("download")
    return None


def get_package_metadata(api_key, package):
    record = {
        "title": package.get("title"),
        "package_id": package.get("packageId"),
        "date": package.get("dateIssued"),
        "category": package.get("category"),
        "author": package.get("governmentAuthor1"),
        "publisher": package.get("publisher"),
        "links": get_file_links(api_key, package),
    }
    return record

def main(args):
    logger = logs.get_logger("usgpo")
    os.makedirs(args.output_dir, exist_ok=True)

    # Get packages from the specified USGPO collections from `args.start_date` to the current day
    logger.info(f"Getting packages from the following collections: {args.collections}")
    packages = get_packages(args.api_key, args.collections, args.start_date)

    logger.info(f"Getting package metadata and writing out to {args.output_dir}")
    with jsonlines.open(
        os.path.join(args.output_dir, "links.jsonl"), mode="w", flush=True
    ) as writer:
        # Spawn multiple worker threads to get the metadata associated with all packages
        with ThreadPoolExecutor(max_workers=args.workers) as executor:
            metadata_futures_to_package = {
                executor.submit(get_package_metadata, args.api_key, package): package
                for package in packages
            }

            # Write out package metadata to file
            for metadata_future in tqdm(as_completed(metadata_futures_to_package)):
                package = metadata_futures_to_package[metadata_future]
                try:
                    record = metadata_future.result()
                except Exception as e:
                    logger.error(f"Package {package} raised exception {e}")
                    continue
                writer.write(record)


if __name__ == "__main__":
    args = parse_args()
    logs.configure_logging("usgpo")
    main(args)
usgpo/utils.py
@@ -0,0 +1,15 @@
import time

import requests

from licensed_pile import logs


def api_query(endpoint, headers, params):
    """Issue a GET request to `endpoint`, retrying once after an hour-long
    sleep if the response indicates we were rate-limited (HTTP 429)."""
    logger = logs.get_logger("usgpo")
    response = requests.get(endpoint, headers=headers, params=params)
    if response.status_code == 429:
        # Sleep for an hour if we've hit the rate-limit
        logger.info("Exceeded rate-limit, sleeping for one hour")
        time.sleep(60 * 60)
        response = requests.get(endpoint, headers=headers, params=params)
    return response
Review comment (on `api_query`): Missing doc string
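
Beyond the missing docstring, the retry could also honor the server's own backoff hint. A minimal sketch, assuming 429 responses carry a standard `Retry-After` header expressed in seconds (not verified for the GovInfo API; the one-hour sleep stays as the fallback, and `api_query_with_backoff` is a hypothetical variant, not part of the PR):

```python
import time

import requests


def api_query_with_backoff(endpoint, headers=None, params=None):
    response = requests.get(endpoint, headers=headers, params=params)
    if response.status_code == 429:
        # Honor Retry-After when present (assumed to be in seconds);
        # otherwise fall back to the original one-hour sleep.
        wait = int(response.headers.get("Retry-After", 60 * 60))
        time.sleep(wait)
        response = requests.get(endpoint, headers=headers, params=params)
    return response
```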