Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add hathifiles poll #39

Merged
merged 7 commits into from
Jan 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,6 @@ docs/_build
bin/digifeeds/*.config
.config/rclone/rclone.conf
.config/rclone/*.json

tmp/*
!tmp/.keep
19 changes: 10 additions & 9 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
{
"python.testing.pytestArgs": [
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"[python]": {
"editor.formatOnSave": true,
"editor.defaultFormatter": "charliermarsh.ruff"
},
"python.testing.pytestArgs": [
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"[python]": {
"editor.formatOnSave": true,
"editor.defaultFormatter": "charliermarsh.ruff"
},
"editor.detectIndentation": false,
}
26 changes: 26 additions & 0 deletions aim/cli/hathifiles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import typer
from aim.hathifiles import poll
from aim.services import S

app = typer.Typer()


@app.command()
def create_store_file():
    """
    Generates a new store file at the configured hathifiles store path
    (S.hathifiles_store_path) if one does not already exist. The new file is
    based on the latest hathi_file_list.json from hathitrust.org
    """
    # NOTE: the text above must stay a plain string literal. An f-string in
    # this position is an ordinary expression, not a docstring, so __doc__
    # would be None and the command would be left without any help text.
    poll.create_store_file()


@app.command()
def check_for_new_update_files():
    """
    Fetches the current hathi_file_list.json from hathitrust.org and looks for
    update files that are missing from the store. When new files are found,
    the argo events webhook is notified and the store file is rewritten so
    that it holds the previously stored files together with the new ones.
    """
    poll.check_for_new_update_files()
28 changes: 24 additions & 4 deletions aim/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,32 @@
"""

import typer
import aim.cli.digifeeds as digifeeds
from aim.services import S


def should_load(app_name: str):
    """
    Decides whether the subcommands of a given application should be
    registered with the CLI.

    Args:
        app_name (str): name of the application whose commands may be loaded

    Returns:
        bool: True when the configured S.app_name is "aim" or is equal to
        app_name
    """
    return S.app_name in ("aim", app_name)


app = typer.Typer()
app.add_typer(
digifeeds.app, name="digifeeds", help="Commands related to the digifeeds process"
)
if should_load("digifeeds"):
import aim.cli.digifeeds as digifeeds

app.add_typer(
digifeeds.app,
name="digifeeds",
help="Commands related to the digifeeds process",
)


if should_load("hathifiles"):
import aim.cli.hathifiles as hathifiles

app.add_typer(
hathifiles.app,
name="hathifiles",
help="Commands related to the hathifiles database",
)


if __name__ == "__main__": # pragma: no cover
Expand Down
Empty file added aim/hathifiles/__init__.py
Empty file.
163 changes: 163 additions & 0 deletions aim/hathifiles/poll.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
import requests
import json
import os
from datetime import datetime, timedelta
from typing import Type
from aim.services import S


def filter_for_update_files(hathi_file_list: list) -> list:
    """
    Picks the update files out of a plain hathi_file_list list and returns
    only their file names.

    Args:
        hathi_file_list (list): full list of current hathifiles from hathitrust.org

    Returns:
        list: flat list of update file names
    """
    update_file_names = []
    for entry in hathi_file_list:
        # Entries flagged as "full" are complete dumps, not updates.
        if entry["full"]:
            continue
        update_file_names.append(entry["filename"])
    return update_file_names


def get_latest_update_files():
    """
    Fetches the current list of hathifiles from hathitrust.org and reduces it
    to the names of the update files.

    Returns:
        list: flat list of update file names
    """
    current_file_list = get_hathi_file_list()
    return filter_for_update_files(current_file_list)


def get_hathi_file_list(timeout: float = 30.0) -> list:
    """
    Gets the latest current list of hathifiles from hathitrust.org.

    Args:
        timeout (float, optional): seconds to wait for hathitrust.org before
            giving up. Defaults to 30.0.

    Returns:
        list: list of dictionaries that describe hathifiles

    Raises:
        requests.HTTPError: if hathitrust.org responds with a 4xx or 5xx status
        requests.Timeout: if no response arrives within ``timeout`` seconds
    """
    # Without a timeout requests waits indefinitely, which would hang the
    # poller if hathitrust.org stops responding.
    response = requests.get(
        "https://www.hathitrust.org/files/hathifiles/hathi_file_list.json",
        timeout=timeout,
    )
    # raise_for_status() only raises for error statuses, so it does not need
    # to be guarded by a status code check.
    response.raise_for_status()
    return response.json()


def get_store(store_path: str = S.hathifiles_store_path) -> list:
    """
    Reads the store file, which holds the JSON list of hathifile update files
    that have already been seen.

    Args:
        store_path (str, optional): path to the store file. Defaults to S.hathifiles_store_path.

    Returns:
        list: list of hathifile update files that have been seen before
    """
    with open(store_path) as store_file:
        return json.load(store_file)


def create_store_file(store_path: str = S.hathifiles_store_path) -> None:
    """
    Writes a store file holding the current list of update files from
    hathitrust.org, unless a store file is already present.

    Args:
        store_path (str, optional): path to store file. Defaults to S.hathifiles_store_path.
    """
    # An existing store is never overwritten here.
    if os.path.exists(store_path):
        S.logger.info("HathiFiles store file already exists. Leaving alone.")
        return

    current_update_files = get_latest_update_files()
    with open(store_path, "w") as store_file:
        json.dump(current_update_files, store_file, ensure_ascii=False, indent=4)
    S.logger.info("Created Hathifiles store file")


class NewFileHandler:
    """
    Handles hathifiles update files that have not been seen before: it
    notifies the argo events webhook about them and rewrites the store file.

    Args:
        new_files (list): update file names that are not in the store yet
        store (list): update file names that have been seen before
    """

    def __init__(self, new_files: list, store: list) -> None:
        self.new_files = new_files
        self.store = store

    def notify_webhook(self, timeout: float = 30.0):
        """
        Sends a list of update files that haven't been seen to the argo events
        webhook for hathifiles.

        Args:
            timeout (float, optional): seconds to wait for the webhook before
                giving up. Defaults to 30.0.

        Raises:
            requests.HTTPError: if the webhook responds with a 4xx or 5xx status
            requests.Timeout: if no response arrives within ``timeout`` seconds
        """
        # Without a timeout requests waits indefinitely, which would hang the
        # poller if the webhook stops responding.
        response = requests.post(
            S.hathifiles_webhook_url,
            json={"file_names": self.new_files},
            timeout=timeout,
        )
        # Error statuses raise here; everything that gets past this line is
        # treated as a successful notification and logged as such.
        response.raise_for_status()
        S.logger.info("Notify webhook SUCCESS")

    @staticmethod
    def _file_date(file_name: str) -> datetime:
        """
        Parses the date out of an update file name. The date is read from the
        third underscore-separated part of the name, up to its first ".", and
        must be in YYYYMMDD form.
        """
        end = file_name.split("_")[2]
        return datetime.strptime(end.split(".")[0], "%Y%m%d")

    @property
    def slim_store(self):
        """
        Removes files from the store that are over one year old

        Returns:
            list: list of update files that are newer than one year
        """
        last_year = datetime.today() - timedelta(days=365)
        return [
            file_name
            for file_name in self.store
            if self._file_date(file_name) > last_year
        ]

    def replace_store(self, store_path: str = S.hathifiles_store_path):
        """
        Replaces the store file with the slimmed-down store followed by the
        new update files

        Args:
            store_path (str, optional): path to hathifiles store file. Defaults to S.hathifiles_store_path.
        """
        with open(store_path, "w") as f:
            json.dump(
                (self.slim_store + self.new_files), f, ensure_ascii=False, indent=4
            )

        S.logger.info("Update store SUCCESS")


def check_for_new_update_files(
    latest_update_files: list | None = None,
    store: list | None = None,
    new_file_handler_klass: Type[NewFileHandler] = NewFileHandler,
):
    """
    Compares the latest hathifiles update files from hathitrust.org with the
    ones recorded in the store file. When unseen files turn up, the argo
    events webhook is triggered and the store is updated; otherwise nothing
    beyond a log message happens.

    Args:
        latest_update_files (list | None, optional): list of latest update files. This will call get_latest_update_files() when None is given.
        store (list | None, optional): list of hathifiles update files that have been seen before. This will call get_store() if None is given.
        new_file_handler_klass (Type[NewFileHandler], optional): Class that handles new update files. Defaults to NewFileHandler.
    """
    if latest_update_files is None:  # pragma: no cover
        latest_update_files = get_latest_update_files()

    if store is None:  # pragma: no cover
        store = get_store()

    unseen_files = []
    for candidate in latest_update_files:
        if candidate not in store:
            unseen_files.append(candidate)

    if not unseen_files:
        S.logger.info("No new Hathifiles update files")
        return

    S.logger.info("New Hathifiles update file(s)", file_names=",".join(unseen_files))

    # Notify first, then persist the store.
    new_file_handler = new_file_handler_klass(new_files=unseen_files, store=store)
    new_file_handler.notify_webhook()
    new_file_handler.replace_store()
22 changes: 18 additions & 4 deletions aim/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ class Services:
Global Configuration Services
"""

#: The application name
app_name: str

#: The structured logger
logger: structlog.stdlib.BoundLogger

Expand Down Expand Up @@ -89,15 +92,22 @@ class Services:
#: The name of the rclone remote where reports from digifeeds are sent
digifeeds_reports_rclone_remote: str

#: file path to store of the hathi_file_list update items
hathifiles_store_path: str

#: url to argo events webhook for triggering the update of the hathifiles database
hathifiles_webhook_url: str


S = Services(
app_name=os.getenv("APP_NAME") or "aim",
logger=structlog.get_logger(),
mysql_database=sa.engine.URL.create(
drivername="mysql+mysqldb",
username=os.environ["MARIADB_USER"],
password=os.environ["MARIADB_PASSWORD"],
host=os.environ["DATABASE_HOST"],
database=os.environ["MARIADB_DATABASE"],
username=os.getenv("MARIADB_USER") or "user",
password=os.getenv("MARIADB_PASSWORD") or "password",
host=os.getenv("DATABASE_HOST") or "database",
database=os.getenv("MARIADB_DATABASE") or "database",
),
test_database="sqlite:///:memory:",
ci_on=os.getenv("CI"),
Expand All @@ -121,4 +131,8 @@ class Services:
or "digifeeds_pickup",
digifeeds_reports_rclone_remote=os.getenv("DIGIFEEDS_REPORTS_RCLONE_REMOTE")
or "digifeeds_reports",
hathifiles_store_path=os.getenv("HATHIFILES_STORE_PATH")
or "tmp/hathi_file_list_store.json",
hathifiles_webhook_url=os.getenv("HATHIFILES_WEBHOOK_URL")
or "http://localhost:1200/new_hathifile",
)
7 changes: 7 additions & 0 deletions docs/api/aim.cli.hathifiles.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
aim.cli.hathifiles module
=========================

.. automodule:: aim.cli.hathifiles
:members:
:undoc-members:
:show-inheritance:
1 change: 1 addition & 0 deletions docs/api/aim.cli.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ Submodules
:maxdepth: 4

aim.cli.digifeeds
aim.cli.hathifiles
aim.cli.main
7 changes: 7 additions & 0 deletions docs/api/aim.hathifiles.poll.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
aim.hathifiles.poll module
==========================

.. automodule:: aim.hathifiles.poll
:members:
:undoc-members:
:show-inheritance:
15 changes: 15 additions & 0 deletions docs/api/aim.hathifiles.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
aim.hathifiles package
======================

.. automodule:: aim.hathifiles
:members:
:undoc-members:
:show-inheritance:

Submodules
----------

.. toctree::
:maxdepth: 4

aim.hathifiles.poll
1 change: 1 addition & 0 deletions docs/api/aim.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Subpackages

aim.cli
aim.digifeeds
aim.hathifiles

Submodules
----------
Expand Down
23 changes: 23 additions & 0 deletions tests/cli/test_hathifiles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from typer.testing import CliRunner
from aim.cli.main import app
from aim.hathifiles import poll

runner = CliRunner()


def test_hathifiles_create_store_file(mocker):
    """The create-store-file command calls poll.create_store_file once."""
    poll_create_store_file = mocker.patch.object(poll, "create_store_file")

    outcome = runner.invoke(app, ["hathifiles", "create-store-file"])

    assert outcome.exit_code == 0
    assert poll_create_store_file.call_count == 1


def test_hathifiles_check_for_new_update_files(mocker):
    """The check-for-new-update-files command calls its poll function once."""
    poll_check_for_new_update_files = mocker.patch.object(
        poll, "check_for_new_update_files"
    )

    outcome = runner.invoke(app, ["hathifiles", "check-for-new-update-files"])

    assert outcome.exit_code == 0
    assert poll_check_for_new_update_files.call_count == 1
Loading
Loading