Commit

Merge pull request #579 from catalyst-cooperative/marianneke-cleanup-logging

Clean up logging using the class-defined logger.
e-belfer authored Feb 5, 2025
2 parents c931fc1 + 8a863fc commit 22783e3
Showing 12 changed files with 23 additions and 55 deletions.
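
The change is the same across all twelve archivers: drop the module-level logger (and the now-unused import logging) and call the logger that the shared archiver base class exposes as self.logger. Below is a minimal sketch of the pattern, assuming AbstractDatasetArchiver defines the class logger roughly as shown; the base-class definition itself is not part of this diff, and ExampleArchiver is a hypothetical subclass for illustration only.

import logging


class AbstractDatasetArchiver:
    """Base archiver; assumed to provide a class-defined logger (not shown in this diff)."""

    def __init__(self) -> None:
        # Assumption: one namespaced logger per archiver, under the catalystcoop prefix.
        self.logger = logging.getLogger(f"catalystcoop.{self.__class__.__module__}")


class ExampleArchiver(AbstractDatasetArchiver):
    """Hypothetical subclass showing the before/after of this PR."""

    def get_items(self) -> list[str]:
        # Before this PR: a module-level logger.info(...); after: the inherited class logger.
        self.logger.info("Getting list of items.")
        return []

With this in place, each subclass in the diffs below only swaps logger. for self.logger. and deletes its own logging setup.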
17 changes: 8 additions & 9 deletions src/pudl_archiver/archivers/eia/eia176.py
@@ -6,7 +6,6 @@
 """
 
 import asyncio
-import logging
 import random
 import zipfile
 
@@ -20,8 +19,6 @@
 from pudl_archiver.frictionless import ZipLayout
 from pudl_archiver.utils import add_to_archive_stable_hash
 
-logger = logging.getLogger(f"catalystcoop.{__name__}")
-
 USER_AGENTS = [
     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0",
@@ -39,7 +36,7 @@ class Eia176Archiver(EiaNGQVArchiver):
 
     async def get_items(self, url: str = items_url) -> list[str]:
         """Get list of item codes from EIA NQGV portal."""
-        logger.info("Getting list of items in Form 176.")
+        self.logger.info("Getting list of items in Form 176.")
         items_response = await self.get_json(url)
         items_list = [item["item"] for item in items_response]
         return items_list
@@ -78,7 +75,7 @@ async def get_year_resource(self, year: str, items_list: list[str]) -> ResourceI
 
         for i in range(0, len(items_list), 20):
             rand = random.randint(0, 2)  # noqa: S311
-            logger.debug(f"Getting items {i}-{i + 20} of data for {year}")
+            self.logger.debug(f"Getting items {i}-{i + 20} of data for {year}")
             # Chunk items list into 20 to avoid error message
             download_url = self.data_url + f"{year}/{year}/ICA/Name/"
             items = items_list[i : i + 20]
@@ -100,14 +97,16 @@
             )
             await asyncio.sleep(5)  # Add sleep to prevent user-agent blocks
 
-        logger.info(f"Compiling data for {year}")
+        self.logger.info(f"Compiling data for {year}")
         dataframe = pd.concat(dataframes)
 
         # Rename columns. Instead of using year for value column, rename "value"
         column_dict = {
-            item["field"]: str(item["headerName"]).lower()
-            if str(item["headerName"]).lower() != year
-            else "value"
+            item["field"]: (
+                str(item["headerName"]).lower()
+                if str(item["headerName"]).lower() != year
+                else "value"
+            )
             for item in json_response["columns"]
         }
         dataframe = dataframe.rename(columns=column_dict).sort_values(
5 changes: 1 addition & 4 deletions src/pudl_archiver/archivers/eia/eia930.py
@@ -1,6 +1,5 @@
 """Download EIA-930 data."""
 
-import logging
 from pathlib import Path
 
 import pandas as pd
@@ -15,8 +14,6 @@
 BASE_URL = "https://www.eia.gov/electricity/gridmonitor/sixMonthFiles/"
 FILE_LIST_URL = "https://www.eia.gov/electricity/gridmonitor/sixMonthFiles/EIA930_File_List_Meta.csv"
 
-logger = logging.getLogger(f"catalystcoop.{__name__}")
-
 
 class Eia930Archiver(AbstractDatasetArchiver):
     """EIA 930 archiver."""
@@ -47,7 +44,7 @@ async def get_year_resource(
         self, file_list: pd.DataFrame, year=int, half_year=int
     ) -> tuple[Path, dict]:
         """Download zip file of all files in year."""
-        logger.debug(f"Downloading data for {year}half{half_year}.")
+        self.logger.debug(f"Downloading data for {year}half{half_year}.")
         zip_path = self.download_directory / f"eia930-{year}half{half_year}.zip"
        data_paths_in_archive = set()
         period_files = file_list[
6 changes: 2 additions & 4 deletions src/pudl_archiver/archivers/eia/eiamecs.py
@@ -1,6 +1,5 @@
 """Archive EIA Manufacturing Energy Consumption Survey (MECS)."""
 
-import logging
 import re
 
 from pudl_archiver.archivers.classes import (
@@ -11,7 +10,6 @@
 from pudl_archiver.frictionless import ZipLayout
 
 BASE_URL = "https://www.eia.gov/consumption/manufacturing/data"
-logger = logging.getLogger(f"catalystcoop.{__name__}")
 
 TABLE_LINK_PATTERNS: dict[str | int, str] = {
     "recent": r"(RSE|)[Tt]able(\d{1,2}|\d{1.1})_(\d{1,2})(.xlsx|.xls)",
@@ -60,7 +58,7 @@ async def get_resources(self) -> ArchiveAwaitable:
 
     async def get_year_resources(self, year: int) -> list[ResourceInfo]:
         """Download all excel tables for a year."""
-        logger.info(f"Attempting to find resources for: {year}")
+        self.logger.info(f"Attempting to find resources for: {year}")
         data_paths_in_archive = set()
         year_url = f"{BASE_URL}/{year}"
         zip_path = self.download_directory / f"eiamecs-{year}.zip"
@@ -75,7 +73,7 @@ async def get_year_resources(self, year: int) -> list[ResourceInfo]:
         # Loop through all download links for tables
         for table_link in await self.get_hyperlinks(year_url, table_link_pattern):
             table_link = f"{year_url}/{table_link}"
-            logger.info(f"Fetching {table_link}")
+            self.logger.info(f"Fetching {table_link}")
             # We are going to rename the files in a standard format by extracting
             # patterns from the table_link_pattern
             # From 1998 and before there are a bunch of letters in the file names
3 changes: 0 additions & 3 deletions src/pudl_archiver/archivers/eia/eianems.py
@@ -1,6 +1,5 @@
 """Download EIA NEMS Github respository."""
 
-import logging
 from pathlib import Path
 
 from pudl_archiver.archivers.classes import (
@@ -9,8 +8,6 @@
     ResourceInfo,
 )
 
-logger = logging.getLogger(f"catalystcoop.{__name__}")
-
 
 class EiaNEMSArchiver(AbstractDatasetArchiver):
     """EIA NEMS archiver."""
2 changes: 0 additions & 2 deletions src/pudl_archiver/archivers/eia/eiawater.py
@@ -1,6 +1,5 @@
 """Download EIA Thermal Cooling Water data."""
 
-import logging
 import re
 from pathlib import Path
 
@@ -10,7 +9,6 @@
     ResourceInfo,
 )
 
-logger = logging.getLogger(f"catalystcoop.{__name__}")
 BASE_URL = "https://www.eia.gov/electricity/data/water"
 
 
5 changes: 1 addition & 4 deletions src/pudl_archiver/archivers/eia/naturalgas.py
@@ -1,6 +1,5 @@
 """Shared methods for data from EIA Natural Gas Quarterly Viewer (NGQV)."""
 
-import logging
 import zipfile
 from collections.abc import Iterable
 
@@ -16,8 +15,6 @@
 from pudl_archiver.frictionless import ZipLayout
 from pudl_archiver.utils import add_to_archive_stable_hash
 
-logger = logging.getLogger(f"catalystcoop.{__name__}")
-
 
 class EIANaturalGasData(BaseModel):
     """Data transfer object from EIA NGQV."""
@@ -87,7 +84,7 @@ async def get_year_resource(
 
         download_url = self.base_url + f"/{dataset.code}/data/{year}/{year}/ICA/Name"
 
-        logger.info(f"Retrieving data for {year}")
+        self.logger.info(f"Retrieving data for {year}")
         json_response = await self.get_json(download_url)
         dataframe = pd.DataFrame.from_dict(json_response["data"], orient="columns")
 
9 changes: 3 additions & 6 deletions src/pudl_archiver/archivers/epa/epacems.py
@@ -2,7 +2,6 @@
 
 import datetime
 import json
-import logging
 import os
 from collections.abc import Iterable
 from itertools import groupby
@@ -18,8 +17,6 @@
 )
 from pudl_archiver.frictionless import ZipLayout
 
-logger = logging.getLogger(f"catalystcoop.{__name__}")
-
 
 class BulkFile(BaseModel):
     """Data transfer object from EPA.
@@ -95,8 +92,8 @@ async def get_resources(self) -> ArchiveAwaitable:
             and (file.metadata.quarter in {1, 2, 3, 4})
             and self.valid_year(file.metadata.year)
         ]
-        logger.info(f"Downloading {len(quarterly_emissions_files)} total files.")
-        logger.debug(f"File info: {quarterly_emissions_files}")
+        self.logger.info(f"Downloading {len(quarterly_emissions_files)} total files.")
+        self.logger.debug(f"File info: {quarterly_emissions_files}")
         files_by_year = groupby(
             sorted(quarterly_emissions_files, key=lambda bf: bf.metadata.year),
             lambda bf: bf.metadata.year,
@@ -120,7 +117,7 @@ async def get_year_resource(
             quarter = file.metadata.quarter
 
             # Useful to debug at download time-outs.
-            logger.info(f"Downloading {year} Q{quarter} EPACEMS data from {url}.")
+            self.logger.info(f"Downloading {year} Q{quarter} EPACEMS data from {url}.")
 
             filename = f"epacems-{year}q{quarter}.csv"
             file_path = self.download_directory / filename
9 changes: 3 additions & 6 deletions src/pudl_archiver/archivers/epa/epamats.py
@@ -1,7 +1,6 @@
 """Download EPAMATS data."""
 
 import json
-import logging
 import os
 from collections.abc import Iterable
 from itertools import groupby
@@ -17,8 +16,6 @@
 from pudl_archiver.archivers.epa.epacems import BulkFile
 from pudl_archiver.frictionless import ZipLayout
 
-logger = logging.getLogger(f"catalystcoop.{__name__}")
-
 
 class EpaMatsArchiver(AbstractDatasetArchiver):
     """EPA MATS archiver."""
@@ -62,8 +59,8 @@ async def get_resources(self) -> ArchiveAwaitable:
             and (file.metadata.quarter in {1, 2, 3, 4})
             and self.valid_year(file.metadata.year)
         ]
-        logger.info(f"Downloading {len(quarterly_emissions_files)} total files.")
-        logger.debug(f"File info: {quarterly_emissions_files}")
+        self.logger.info(f"Downloading {len(quarterly_emissions_files)} total files.")
+        self.logger.debug(f"File info: {quarterly_emissions_files}")
         files_by_year = groupby(
             sorted(quarterly_emissions_files, key=lambda bf: bf.metadata.year),
             lambda bf: bf.metadata.year,
@@ -87,7 +84,7 @@ async def get_year_resource(
             quarter = file.metadata.quarter
 
             # Useful to debug at download time-outs.
-            logger.info(f"Downloading {year} Q{quarter} EPA MATS data from {url}.")
+            self.logger.info(f"Downloading {year} Q{quarter} EPA MATS data from {url}.")
 
             filename = f"epamats-{year}q{quarter}.csv"
             file_path = self.download_directory / filename
7 changes: 2 additions & 5 deletions src/pudl_archiver/archivers/gridpathratoolkit.py
@@ -4,7 +4,6 @@
 It is archived from files stored in the private sources.catalyst.coop bucket.
 """
 
-import logging
 import zipfile
 from pathlib import Path
 
@@ -18,8 +17,6 @@
 from pudl_archiver.frictionless import ZipLayout
 from pudl_archiver.utils import add_to_archive_stable_hash
 
-logger = logging.getLogger(f"catalystcoop.{__name__}")
-
 
 class GridPathRAToolkitArchiver(AbstractDatasetArchiver):
     """GridPath RA Toolkit renewable generation profiles archiver."""
@@ -84,7 +81,7 @@ async def get_gcs_resource(
         # Download blob to local file
         # We download the entire zipfile to avoid having to authenticate using
        # a second GCS library, since GCS doesn't support fsspec file paths.
-        logger.info(f"Downloading {blob.name} to {path_to_file}")
+        self.logger.info(f"Downloading {blob.name} to {path_to_file}")
 
         blob.download_to_filename(path_to_file)
 
@@ -121,7 +118,7 @@ async def get_and_zip_resources(
                 continue
 
             # Download all files locally
-            logger.info(f"Downloading {blob.name} to {final_zipfile_name}")
+            self.logger.info(f"Downloading {blob.name} to {final_zipfile_name}")
             string = blob.download_as_string()
             add_to_archive_stable_hash(
                 archive=archive, filename=Path(blob.name).name, data=string
5 changes: 1 addition & 4 deletions src/pudl_archiver/archivers/mshamines.py
@@ -1,6 +1,5 @@
 """Download MSHA data."""
 
-import logging
 import re
 from pathlib import Path
 
@@ -10,8 +9,6 @@
     ResourceInfo,
 )
 
-logger = logging.getLogger(f"catalystcoop.{__name__}")
-
 URL_BASE = "https://arlweb.msha.gov/OpenGovernmentData/"
 EXT_BASE = "OGIMSHA.asp"
 
@@ -75,7 +72,7 @@ async def get_resources(self) -> ArchiveAwaitable:
         links = [link.split("/")[-1] for link in links]
         full_links = [URL_BASE + "DataSets/" + link for link in links]
 
-        logger.debug(full_links)
+        self.logger.debug(full_links)
 
         if any(item not in list(set(MSHA_DATASETS.values())) for item in full_links):
             # If a link to a new dataset is found, raise error.
5 changes: 1 addition & 4 deletions src/pudl_archiver/archivers/phmsagas.py
@@ -1,6 +1,5 @@
 """Download PHMSHA data."""
 
-import logging
 import re
 import typing
 from pathlib import Path
@@ -12,8 +11,6 @@
     ResourceInfo,
 )
 
-logger = logging.getLogger(f"catalystcoop.{__name__}")
-
 BASE_URL = "https://www.phmsa.dot.gov/data-and-statistics/pipeline/gas-distribution-gas-gathering-gas-transmission-hazardous-liquids"
 
 PHMSA_FORMS = [
@@ -75,7 +72,7 @@ async def get_zip_resource(
         form = "_".join(filename.lower().split("_")[0:-2])
 
         if form not in PHMSA_FORMS:
-            logger.warning(f"New form type found: {form}.")
+            self.logger.warning(f"New form type found: {form}.")
 
         download_path = self.download_directory / f"{self.name}_{filename}.zip"
         await self.download_zipfile(url, download_path)
5 changes: 1 addition & 4 deletions src/pudl_archiver/archivers/vcerare.py
@@ -5,7 +5,6 @@
 from files stored in the private sources.catalyst.coop bucket.
 """
 
-import logging
 import re
 from pathlib import Path
 
@@ -17,8 +16,6 @@
     ResourceInfo,
 )
 
-logger = logging.getLogger(f"catalystcoop.{__name__}")
-
 
 class VCERAREArchiver(AbstractDatasetArchiver):
     """VCE RARE data archiver."""
@@ -46,7 +43,7 @@ async def get_gcs_resource(self, blob: storage.Blob) -> tuple[Path, dict]:
         file_name = blob.name.replace(f"{self.name}/", "")
         path_to_file = self.download_directory / file_name
         # Download blob to local file
-        logger.info(f"Downloading {blob.name} to {path_to_file}")
+        self.logger.info(f"Downloading {blob.name} to {path_to_file}")
         blob.download_to_filename(path_to_file)
 
         # Set up partitions:
