Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 3 additions & 8 deletions muckrock/core/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@
# Third Party
import boto3
from botocore.exceptions import ClientError
from documentcloud import DocumentCloud
from smart_open.smart_open_lib import smart_open

# MuckRock
from muckrock.core.models import HomePage
from muckrock.core.utils import get_dc_client
from muckrock.message.email import TemplateEmail

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -120,13 +120,8 @@ def fetch_and_load_documentcloud_stats():
yesterday = (date.today() - timedelta(days=1)).strftime("%Y-%m-%d")
logger.info("Fetching DocumentCloud stats for %s", yesterday)

# Instantiate client with creds
client = DocumentCloud(
base_uri=f"{settings.DOCCLOUD_API_URL}/api/",
auth_uri=f"{settings.SQUARELET_URL}/api/",
username=settings.DOCUMENTCLOUD_BETA_USERNAME,
password=settings.DOCUMENTCLOUD_BETA_PASSWORD,
)
# Instantiate client
client = get_dc_client()

# Call statistics endpoint
resp = client.get(f"statistics?date={yesterday}")
Expand Down
24 changes: 23 additions & 1 deletion muckrock/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import boto3
import requests
import stripe
from documentcloud import DocumentCloud

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -203,7 +204,11 @@ def get_squarelet_access_token():
settings.SOCIAL_AUTH_SQUARELET_SECRET,
)
data = {"grant_type": "client_credentials"}
headers = {"X-Bypass-Rate-Limit": settings.BYPASS_RATE_LIMIT_SECRET}
existing_ua = requests.utils.default_headers()["User-Agent"]
headers = {
"X-Bypass-Rate-Limit": settings.BYPASS_RATE_LIMIT_SECRET,
"User-Agent": f"{existing_ua} {settings.SERVICE_USER_AGENT}",
}
logger.info(token_url)
resp = requests.post(
token_url,
Expand All @@ -226,9 +231,11 @@ def _squarelet(method, path, **kwargs):
"""Helper function for squarelet requests"""
api_url = "{}{}".format(settings.SQUARELET_URL, path)
access_token = get_squarelet_access_token()
existing_ua = requests.utils.default_headers()["User-Agent"]
headers = {
"Authorization": "Bearer {}".format(access_token),
"X-Bypass-Rate-Limit": settings.BYPASS_RATE_LIMIT_SECRET,
"User-Agent": f"{existing_ua} {settings.SERVICE_USER_AGENT}",
}
return method(api_url, headers=headers, **kwargs)

Expand Down Expand Up @@ -379,3 +386,18 @@ def mailchimp_journey(email, journey):
except (requests.ConnectionError, ValueError):
logger.error("[JOURNEY] Error starting journey", exc_info=sys.exc_info())
return response


def get_dc_client():
"""Get a DocumentCloud client for the MuckRock User Account"""
client = DocumentCloud(
username=settings.DOCUMENTCLOUD_BETA_USERNAME,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should rename these settings at some point. Doesn't have to be now.

password=settings.DOCUMENTCLOUD_BETA_PASSWORD,
base_uri=f"{settings.DOCCLOUD_API_URL}/api/",
auth_uri=f"{settings.SQUARELET_URL}/api/",
)
existing_ua = client.session.headers.get("User-Agent", "")
client.session.headers["User-Agent"] = (
f"{existing_ua} {settings.SERVICE_USER_AGENT}".strip()
)
return client
17 changes: 3 additions & 14 deletions muckrock/crowdsource/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,17 @@

# Django
from celery import shared_task
from django.conf import settings

# Standard Library
import csv
import logging

# Third Party
from documentcloud import DocumentCloud
from documentcloud.exceptions import DocumentCloudError

# MuckRock
from muckrock.core.tasks import AsyncFileDownloadTask
from muckrock.core.utils import get_dc_client
from muckrock.crowdsource.models import Crowdsource

logger = logging.getLogger(__name__)
Expand All @@ -31,12 +30,7 @@ def datum_per_page(crowdsource_pk, doc_id, metadata):
"""Create a crowdsource data item for each page of the document"""

crowdsource = Crowdsource.objects.get(pk=crowdsource_pk)
dc_client = DocumentCloud(
username=settings.DOCUMENTCLOUD_BETA_USERNAME,
password=settings.DOCUMENTCLOUD_BETA_PASSWORD,
base_uri=f"{settings.DOCCLOUD_API_URL}/api/",
auth_uri=f"{settings.SQUARELET_URL}/api/",
)
dc_client = get_dc_client()
document = dc_client.documents.get(doc_id)
for i in range(1, document.pages + 1):
crowdsource.data.create(
Expand All @@ -54,12 +48,7 @@ def import_doccloud_proj(crowdsource_pk, proj_id, metadata, doccloud_each_page):
"""Import documents from a document cloud project"""
crowdsource = Crowdsource.objects.get(pk=crowdsource_pk)

dc_client = DocumentCloud(
username=settings.DOCUMENTCLOUD_BETA_USERNAME,
password=settings.DOCUMENTCLOUD_BETA_PASSWORD,
base_uri=f"{settings.DOCCLOUD_API_URL}/api/",
auth_uri=f"{settings.SQUARELET_URL}/api/",
)
dc_client = get_dc_client()
project = dc_client.projects.get(proj_id)

for document in project.documents:
Expand Down
14 changes: 6 additions & 8 deletions muckrock/foia/signals.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,14 @@
from django.db.models.signals import post_delete, pre_save

# Third Party
from documentcloud import DocumentCloud
from documentcloud.exceptions import DoesNotExistError

# MuckRock
from muckrock.core.utils import clear_cloudfront_cache, get_s3_storage_bucket
from muckrock.core.utils import (
clear_cloudfront_cache,
get_dc_client,
get_s3_storage_bucket,
)
from muckrock.foia.models import FOIAFile, FOIARequest, OutboundRequestAttachment
from muckrock.foia.tasks import upload_document_cloud

Expand Down Expand Up @@ -47,12 +50,7 @@ def foia_file_delete_dc(sender, **kwargs):

foia_file = kwargs["instance"]
if foia_file.doc_id:
dc_client = DocumentCloud(
username=settings.DOCUMENTCLOUD_BETA_USERNAME,
password=settings.DOCUMENTCLOUD_BETA_PASSWORD,
base_uri=f"{settings.DOCCLOUD_API_URL}/api/",
auth_uri=f"{settings.SQUARELET_URL}/api/",
)
dc_client = get_dc_client()
try:
dc_client.documents.delete(foia_file.doc_id)
except DoesNotExistError:
Expand Down
51 changes: 17 additions & 34 deletions muckrock/foia/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
)
from muckrock.core.models import ExtractDay
from muckrock.core.tasks import AsyncFileDownloadTask
from muckrock.core.utils import read_in_chunks, squarelet_get
from muckrock.core.utils import get_dc_client, read_in_chunks, squarelet_get
from muckrock.foia.exceptions import SizeError
from muckrock.foia.models import (
FOIACommunication,
Expand Down Expand Up @@ -99,12 +99,7 @@ def upload_document_cloud(ffile_pk):
# if it has a doc_id already, we are changing it, not creating it
change = bool(ffile.doc_id)

dc_client = DocumentCloud(
username=settings.DOCUMENTCLOUD_BETA_USERNAME,
password=settings.DOCUMENTCLOUD_BETA_PASSWORD,
base_uri=f"{settings.DOCCLOUD_API_URL}/api/",
auth_uri=f"{settings.SQUARELET_URL}/api/",
)
dc_client = get_dc_client()

_upload_documentcloud(
dc_client,
Expand Down Expand Up @@ -149,13 +144,19 @@ def upload_user_document_cloud(ffile_pk, user_pk):
"Error getting token for Add-On: %s", exc, exc_info=sys.exc_info()
)
raise

# Note: this function does not use get_dc_client() from core/utils.py
# because it does not act as the MuckRock service account;
# it uploads to the user's own DocumentCloud account instead.
dc_client = DocumentCloud(
base_uri=f"{settings.DOCCLOUD_API_URL}/api/",
auth_uri=f"{settings.SQUARELET_URL}/api/",
)
existing_ua = dc_client.session.headers.get("User-Agent", "")
dc_client.session.headers.update(
{"Authorization": "Bearer {}".format(resp.json()["access_token"])}
{
"Authorization": f"Bearer {resp.json()['access_token']}",
"User-Agent": f"{existing_ua} {settings.SERVICE_USER_AGENT}".strip(),
}
)
project, _created = dc_client.projects.get_or_create_by_title("MuckRock Imports")
params = {"project": project.id}
Expand Down Expand Up @@ -238,12 +239,7 @@ def set_document_cloud_pages(ffile_pk):
# already has pages set or not a doc cloud, just return
return

dc_client = DocumentCloud(
username=settings.DOCUMENTCLOUD_BETA_USERNAME,
password=settings.DOCUMENTCLOUD_BETA_PASSWORD,
base_uri=f"{settings.DOCCLOUD_API_URL}/api/",
auth_uri=f"{settings.SQUARELET_URL}/api/",
)
dc_client = get_dc_client()
document = dc_client.documents.get(ffile.doc_id)

if document.status == "success":
Expand Down Expand Up @@ -283,12 +279,7 @@ def noindex_documentcloud(foia_pk):
doc_ids = foia.get_files().exclude(doc_id="").values_list("doc_id", flat=True)
# get just the numeric ID
doc_ids = [d.split("-")[0] for d in doc_ids]
dc_client = DocumentCloud(
username=settings.DOCUMENTCLOUD_BETA_USERNAME,
password=settings.DOCUMENTCLOUD_BETA_PASSWORD,
base_uri=f"{settings.DOCCLOUD_API_URL}/api/",
auth_uri=f"{settings.SQUARELET_URL}/api/",
)
dc_client = get_dc_client()
for group in grouper(doc_ids, BULK_LIMIT):
resp = dc_client.patch(
"documents/",
Expand Down Expand Up @@ -360,12 +351,7 @@ def composer_delayed_submit(composer_pk, approve, contact_info, **kwargs):
def get_text_ocr(doc_id):
"""Get the text OCR from document cloud"""

dc_client = DocumentCloud(
username=settings.DOCUMENTCLOUD_BETA_USERNAME,
password=settings.DOCUMENTCLOUD_BETA_PASSWORD,
base_uri=f"{settings.DOCCLOUD_API_URL}/api/",
auth_uri=f"{settings.SQUARELET_URL}/api/",
)
dc_client = get_dc_client()

try:
document = dc_client.documents.get(doc_id)
Expand Down Expand Up @@ -1236,12 +1222,7 @@ def import_doccloud_file(file_pk):
except FOIAFile.DoesNotExist:
return

dc_client = DocumentCloud(
username=settings.DOCUMENTCLOUD_BETA_USERNAME,
password=settings.DOCUMENTCLOUD_BETA_PASSWORD,
base_uri=f"{settings.DOCCLOUD_API_URL}/api/",
auth_uri=f"{settings.SQUARELET_URL}/api/",
)
dc_client = get_dc_client()
document = dc_client.documents.get(ffile.doc_id)

ext = ffile.get_extension()
Expand All @@ -1252,8 +1233,10 @@ def import_doccloud_file(file_pk):
ffile.save()

if document.access == "public":
ua = requests.utils.default_headers()["User-Agent"]
headers = {"User-Agent": f"{ua} {settings.SERVICE_USER_AGENT}"}
with ffile.ffile.open("wb") as out_file, requests.get(
document.pdf_url, stream=True, timeout=10
document.pdf_url, headers=headers, stream=True, timeout=10
) as response:
response.raise_for_status()
for chunk in response.iter_content(chunk_size=10 * 1024 * 1024):
Expand Down
3 changes: 3 additions & 0 deletions muckrock/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -993,3 +993,6 @@ def show_toolbar(request):

# APIv1 Killswitch
ENABLE_API_V1 = boolcheck(os.environ.get("ENABLE_API_V1", True))

# User agent to identify our service
SERVICE_USER_AGENT = os.environ.get("SERVICE_USER_AGENT", "muckrock requests")
Loading