From 9fa4307624f82cbf109efbc9c11dbe630dbfd415 Mon Sep 17 00:00:00 2001
From: Ross Blair
Date: Tue, 10 Jun 2025 12:45:30 -0500
Subject: [PATCH 01/14] Add simple generate summaries and totals functions that
 group by directory name instead of by dandiset.

---
 .../_command_line_interface/_cli.py           |  17 +-
 .../extractors/_s3_log_access_extractor.py    |   4 +-
 src/s3_log_extraction/summarize/__init__.py   |  13 +-
 .../_generate_all_dataset_summaries.py        | 166 ++++++++++++++++++
 .../summarize/_generate_all_dataset_totals.py |  57 ++++++
 5 files changed, 242 insertions(+), 15 deletions(-)
 create mode 100644 src/s3_log_extraction/summarize/_generate_all_dataset_summaries.py
 create mode 100644 src/s3_log_extraction/summarize/_generate_all_dataset_totals.py

diff --git a/src/s3_log_extraction/_command_line_interface/_cli.py b/src/s3_log_extraction/_command_line_interface/_cli.py
index 231c5e72..008ccb4c 100644
--- a/src/s3_log_extraction/_command_line_interface/_cli.py
+++ b/src/s3_log_extraction/_command_line_interface/_cli.py
@@ -4,12 +4,14 @@
 
 import click
 
-from ..config import reset_extraction, set_cache_directory
+from ..config import get_cache_directory, get_summary_directory, reset_extraction, set_cache_directory
 from ..extractors import DandiS3LogAccessExtractor, S3LogAccessExtractor, stop_extraction
 from ..ip_utils import index_ips, update_index_to_region_codes, update_region_code_coordinates
 from ..summarize import (
     generate_all_dandiset_summaries,
     generate_all_dandiset_totals,
+    generate_all_dataset_summaries,
+    generate_all_dataset_totals,
     generate_archive_summaries,
     generate_archive_totals,
 )
@@ -170,6 +172,7 @@ def _update_ip_coordinates_cli() -> None:
     "--mode",
     help=(
         "Generate condensed summaries of activity across the extracted data per object key. "
+        "Defaults to grouping summaries by top-level prefix. "
         "Mode 'dandi' will map asset hashes to Dandisets and their content filenames. "
         "Mode 'archive' aggregates over all dataset summaries."
     ),
@@ -180,17 +183,14 @@ def _update_summaries_cli(mode: typing.Literal["dandi", "archive"] | None = None) -> None:
     """
     Generate condensed summaries of activity.
-
-    TODO
     """
     match mode:
         case "dandi":
             generate_all_dandiset_summaries()
         case "archive":
-            generate_archive_summaries()
+            generate_archive_summaries(get_summary_directory())
         case _:
-            message = "The generic mode is not yet implemented - please raise an issue to discuss."
-            click.echo(message=message, err=True)
+            generate_all_dataset_summaries()
 
 
 # s3logextraction update totals
@@ -211,7 +211,6 @@ def _update_totals_cli(mode: typing.Literal["dandi", "archive"] | None = None) -> None:
         case "dandi":
             generate_all_dandiset_totals()
         case "archive":
-            generate_archive_totals()
+            generate_archive_totals(get_summary_directory())
         case _:
-            message = "The generic mode is not yet implemented - please raise an issue to discuss."
-            click.echo(message=message, err=True)
+            generate_all_dataset_totals()
diff --git a/src/s3_log_extraction/extractors/_s3_log_access_extractor.py b/src/s3_log_extraction/extractors/_s3_log_access_extractor.py
index cde07230..dc248e85 100644
--- a/src/s3_log_extraction/extractors/_s3_log_access_extractor.py
+++ b/src/s3_log_extraction/extractors/_s3_log_access_extractor.py
@@ -128,7 +128,9 @@ def extract_directory(self, *, directory: str | pathlib.Path, limit: int | None
         directory = pathlib.Path(directory)
 
         all_log_files = {
-            str(file_path.absolute()) for file_path in natsort.natsorted(seq=directory.rglob(pattern="*.log"))
+            str(file_path.absolute())
+            for file_path in natsort.natsorted(seq=directory.rglob(pattern="*"))
+            if file_path.is_file()
         }
         unextracted_files = all_log_files - set(self.file_processing_end_record.keys())
diff --git a/src/s3_log_extraction/summarize/__init__.py b/src/s3_log_extraction/summarize/__init__.py
index 205b6b9b..dd77d628 100644
--- a/src/s3_log_extraction/summarize/__init__.py
+++ b/src/s3_log_extraction/summarize/__init__.py
@@ -1,12 +1,15 @@
-from ._generate_archive_totals import generate_archive_totals
-from ._generate_archive_summaries import generate_archive_summaries
-from ._generate_all_dandiset_totals import generate_all_dandiset_totals
-
 from ._generate_all_dandiset_summaries import generate_all_dandiset_summaries
+from ._generate_all_dandiset_totals import generate_all_dandiset_totals
+from ._generate_all_dataset_summaries import generate_all_dataset_summaries
+from ._generate_all_dataset_totals import generate_all_dataset_totals
+from ._generate_archive_summaries import generate_archive_summaries
+from ._generate_archive_totals import generate_archive_totals
 
 __all__ = [
     "generate_all_dandiset_summaries",
     "generate_all_dandiset_totals",
-    "generate_archive_totals",
+    "generate_all_dataset_summaries",
+    "generate_all_dataset_totals",
     "generate_archive_summaries",
+    "generate_archive_totals",
 ]
diff --git a/src/s3_log_extraction/summarize/_generate_all_dataset_summaries.py b/src/s3_log_extraction/summarize/_generate_all_dataset_summaries.py
new file mode 100644
index 00000000..75504bf3
--- /dev/null
+++ b/src/s3_log_extraction/summarize/_generate_all_dataset_summaries.py
@@ -0,0 +1,166 @@
+import collections
+import datetime
+import pathlib
+
+import pandas
+import tqdm
+import yaml
+
+from ..config import get_cache_directory, get_extraction_directory, get_summary_directory
+from ..ip_utils import load_ip_cache
+
+
+def generate_all_dataset_summaries() -> None:
+    extraction_directory = get_extraction_directory()
+
+    datasets = [item for item in extraction_directory.iterdir() if item.is_dir()]
+
+    summary_directory = get_summary_directory()
+    index_to_region = load_ip_cache(cache_type="index_to_region")
+
+    for dataset in tqdm.tqdm(
+        iterable=datasets,
+        total=len(datasets),
+        desc="Summarizing Datasets",
+        position=0,
+        leave=True,
+        mininterval=5.0,
+        smoothing=0,
+        unit="dataset",
+    ):
+        dataset_id = dataset.name
+        asset_directories = [file_path for file_path in dataset.rglob(pattern="*") if file_path.is_dir()]
+        _summarize_dataset(
+            dataset_id=dataset_id,
+            asset_directories=asset_directories,
+            summary_directory=summary_directory,
+            index_to_region=index_to_region,
+        )
+
+
+def _summarize_dataset(
+    *,
+    dataset_id: str,
+    asset_directories: list[pathlib.Path],
+    summary_directory: pathlib.Path,
+    index_to_region: dict[int, str],
+) -> None:
+    _summarize_dataset_by_day(
+        asset_directories=asset_directories, summary_file_path=summary_directory / dataset_id / "dandiset_summary_by_day.tsv"
+    )
+    _summarize_dataset_by_asset(
+        asset_directories=asset_directories,
+        summary_file_path=summary_directory / dataset_id / "dandiset_summary_by_asset.tsv",
+    )
+    _summarize_dataset_by_region(
+        asset_directories=asset_directories,
+        summary_file_path=summary_directory / dataset_id / "dandiset_summary_by_region.tsv",
+        index_to_region=index_to_region,
+    )
+
+
+def _summarize_dataset_by_day(*, asset_directories: list[pathlib.Path], summary_file_path: pathlib.Path) -> None:
+    all_dates = []
+    all_bytes_sent = []
+    for asset_directory in asset_directories:
+        # TODO: Could add a step here to track which object IDs have been processed, and if encountered again
+        # Just copy the file over instead of reprocessing
+
+        timestamps_file_path = asset_directory / "timestamps.txt"
+
+        if not timestamps_file_path.exists():
+            continue
+
+        dates = [
+            datetime.datetime.strptime(str(timestamp.strip()), "%y%m%d%H%M%S").strftime(format="%Y-%m-%d")
+            for timestamp in timestamps_file_path.read_text().splitlines()
+        ]
+        all_dates.extend(dates)
+
+        bytes_sent_file_path = asset_directory / "bytes_sent.txt"
+        bytes_sent = [int(value.strip()) for value in bytes_sent_file_path.read_text().splitlines()]
+        all_bytes_sent.extend(bytes_sent)
+
+    summarized_activity_by_day = collections.defaultdict(int)
+    for date, bytes_sent in zip(all_dates, all_bytes_sent):
+        summarized_activity_by_day[date] += bytes_sent
+
+    if len(summarized_activity_by_day) == 0:
+        return
+
+    summary_file_path.parent.mkdir(parents=True, exist_ok=True)
+    summary_table = pandas.DataFrame(
+        data={
+            "date": list(summarized_activity_by_day.keys()),
+            "bytes_sent": list(summarized_activity_by_day.values()),
+        }
+    )
+    summary_table.sort_values(by="date", inplace=True)
+    summary_table.index = range(len(summary_table))
+    summary_table.to_csv(path_or_buf=summary_file_path, mode="w", sep="\t", header=True, index=True)
+
+
+def _summarize_dataset_by_asset(*, asset_directories: list[pathlib.Path], summary_file_path: pathlib.Path) -> None:
+    summarized_activity_by_asset = collections.defaultdict(int)
+    for asset_directory in asset_directories:
+        # TODO: Could add a step here to track which object IDs have been processed, and if encountered again
+        # Just copy the file over instead of reprocessing
+        bytes_sent_file_path = asset_directory / "bytes_sent.txt"
+
+        if not bytes_sent_file_path.exists():
+            continue
+
+        bytes_sent = [int(value.strip()) for value in bytes_sent_file_path.read_text().splitlines()]
+
+        asset_path = str(asset_directory)
+        summarized_activity_by_asset[asset_path] += sum(bytes_sent)
+
+    if len(summarized_activity_by_asset) == 0:
+        return
+
+    summary_file_path.parent.mkdir(parents=True, exist_ok=True)
+    summary_table = pandas.DataFrame(
+        data={
+            "asset_path": list(summarized_activity_by_asset.keys()),
+            "bytes_sent": list(summarized_activity_by_asset.values()),
+        }
+    )
+    summary_table.to_csv(path_or_buf=summary_file_path, mode="w", sep="\t", header=True, index=True)
+
+
+def _summarize_dataset_by_region(
+    *, asset_directories: list[pathlib.Path], summary_file_path: pathlib.Path, index_to_region: dict[int, str]
+) -> None:
+    all_regions = []
+    all_bytes_sent = []
+    for asset_directory in asset_directories:
+        # TODO: Could add a step here to track which object IDs have been processed, and if encountered again
+        # Just copy the file over instead of reprocessing
+        indexed_ips_file_path = asset_directory / "indexed_ips.txt"
+
+        if not indexed_ips_file_path.exists():
+            continue
+
+        indexed_ips = [ip_index.strip() for ip_index in indexed_ips_file_path.read_text().splitlines()]
+        regions = [index_to_region.get(ip_index.strip(), "unknown") for ip_index in indexed_ips]
+        all_regions.extend(regions)
+
+        bytes_sent_file_path = asset_directory / "bytes_sent.txt"
+        bytes_sent = [int(value.strip()) for value in bytes_sent_file_path.read_text().splitlines()]
+        all_bytes_sent.extend(bytes_sent)
+
+    summarized_activity_by_region = collections.defaultdict(int)
+    for region, bytes_sent in zip(all_regions, all_bytes_sent):
+        summarized_activity_by_region[region] += bytes_sent
+
+    if len(summarized_activity_by_region) == 0:
+        return
+
+    summary_file_path.parent.mkdir(parents=True, exist_ok=True)
+    summary_table = pandas.DataFrame(
+        data={
+            "region": list(summarized_activity_by_region.keys()),
+            "bytes_sent": list(summarized_activity_by_region.values()),
+        }
+    )
+    summary_table.to_csv(path_or_buf=summary_file_path, mode="w", sep="\t", header=True, index=True)
diff --git a/src/s3_log_extraction/summarize/_generate_all_dataset_totals.py b/src/s3_log_extraction/summarize/_generate_all_dataset_totals.py
new file mode 100644
index 00000000..8f727013
--- /dev/null
+++ b/src/s3_log_extraction/summarize/_generate_all_dataset_totals.py
@@ -0,0 +1,57 @@
+import json
+import pathlib
+
+import pandas
+
+from ..config import get_summary_directory
+
+
+def generate_all_dataset_totals(
+    summary_directory: str | pathlib.Path | None = None,
+) -> None:
+    """
+    Generate top-level totals of summarized access activity for all datasets.
+
+    Parameters
+    ----------
+    summary_directory : pathlib.Path
+        Path to the folder containing all dataset summaries of the S3 access logs.
+    """
+    if summary_directory:
+        summary_directory = pathlib.Path(summary_directory)
+    else:
+        summary_directory = get_summary_directory()
+
+    # TODO: record progress over
+
+    all_dandiset_totals = {}
+    for dandiset_id_folder_path in summary_directory.iterdir():
+        if not dandiset_id_folder_path.is_dir():
+            continue  # TODO: use better structure for separating mapped activity from summaries
+        dandiset_id = dandiset_id_folder_path.name
+
+        summary_file_path = summary_directory / dandiset_id / "dandiset_summary_by_region.tsv"
+        summary = pandas.read_table(filepath_or_buffer=summary_file_path)
+
+        unique_countries = {}
+        for region in summary["region"]:
+            if region in ["VPN", "GitHub", "unknown"]:
+                continue
+
+            country_code, region_name = region.split("/")
+            if "AWS" in country_code:
+                country_code = region_name.split("-")[0].upper()
+
+            unique_countries[country_code] = True
+
+        number_of_unique_regions = len(summary["region"])
+        number_of_unique_countries = len(unique_countries)
+        all_dandiset_totals[dandiset_id] = {
+            "total_bytes_sent": int(summary["bytes_sent"].sum()),
+            "number_of_unique_regions": number_of_unique_regions,
+            "number_of_unique_countries": number_of_unique_countries,
+        }
+
+    top_level_summary_file_path = summary_directory / "all_dandiset_totals.json"
+    with top_level_summary_file_path.open(mode="w") as io:
+        json.dump(obj=all_dandiset_totals, fp=io)

From a3e1f7132ccef1407092c102d81742a6fbffa851 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 10 Jun 2025 18:00:37 +0000
Subject: [PATCH 02/14] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/s3_log_extraction/_command_line_interface/_cli.py | 2 +-
 .../summarize/_generate_all_dataset_summaries.py      | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/s3_log_extraction/_command_line_interface/_cli.py b/src/s3_log_extraction/_command_line_interface/_cli.py
index 008ccb4c..afad0046 100644
--- a/src/s3_log_extraction/_command_line_interface/_cli.py
+++ b/src/s3_log_extraction/_command_line_interface/_cli.py
@@ -4,7 +4,7 @@
 
 import click
 
-from ..config import get_cache_directory, get_summary_directory, reset_extraction, set_cache_directory
+from ..config import get_summary_directory, reset_extraction, set_cache_directory
 from ..extractors import DandiS3LogAccessExtractor, S3LogAccessExtractor, stop_extraction
 from ..ip_utils import index_ips, update_index_to_region_codes, update_region_code_coordinates
 from ..summarize import (
diff --git a/src/s3_log_extraction/summarize/_generate_all_dataset_summaries.py b/src/s3_log_extraction/summarize/_generate_all_dataset_summaries.py
index 75504bf3..3e3e1493 100644
--- a/src/s3_log_extraction/summarize/_generate_all_dataset_summaries.py
+++ b/src/s3_log_extraction/summarize/_generate_all_dataset_summaries.py
@@ -4,9 +4,8 @@
 
 import pandas
 import tqdm
-import yaml
 
-from ..config import get_cache_directory, get_extraction_directory, get_summary_directory
+from ..config import get_extraction_directory, get_summary_directory
 from ..ip_utils import load_ip_cache
@@ -46,7 +45,8 @@ def _summarize_dataset(
     index_to_region: dict[int, str],
 ) -> None:
     _summarize_dataset_by_day(
-        asset_directories=asset_directories, summary_file_path=summary_directory / dataset_id / "dandiset_summary_by_day.tsv"
+        asset_directories=asset_directories,
+        summary_file_path=summary_directory / dataset_id / "dandiset_summary_by_day.tsv",
     )
     _summarize_dataset_by_asset(
         asset_directories=asset_directories,

From e4566a466667477b16bbd2b6e398e01bce10764a Mon Sep 17 00:00:00 2001
From: Ross Blair
Date: Wed, 18 Jun 2025 14:50:55 -0500
Subject: [PATCH 03/14] Update
 src/s3_log_extraction/summarize/_generate_all_dataset_summaries.py

Co-authored-by: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com>
---
 .../summarize/_generate_all_dataset_summaries.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/s3_log_extraction/summarize/_generate_all_dataset_summaries.py b/src/s3_log_extraction/summarize/_generate_all_dataset_summaries.py
index 3e3e1493..63ffd587 100644
--- a/src/s3_log_extraction/summarize/_generate_all_dataset_summaries.py
+++ b/src/s3_log_extraction/summarize/_generate_all_dataset_summaries.py
@@ -9,7 +9,7 @@
 from ..ip_utils import load_ip_cache
 
 
-def generate_all_dataset_summaries() -> None:
+def generate_summaries(level: int = 0) -> None:
     extraction_directory = get_extraction_directory()
 
     datasets = [item for item in extraction_directory.iterdir() if item.is_dir()]

From 91c663f9a45e852e276bdc703ac9a5f46ce8753c Mon Sep 17 00:00:00 2001
From: Ross Blair
Date: Wed, 18 Jun 2025 15:00:09 -0500
Subject: [PATCH 04/14] pass dtype into rng.integers on new_index collision to
 match original rng.integers call

---
 src/s3_log_extraction/ip_utils/_index_ips.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/s3_log_extraction/ip_utils/_index_ips.py b/src/s3_log_extraction/ip_utils/_index_ips.py
index 2f8bb015..18ebb6f4 100644
--- a/src/s3_log_extraction/ip_utils/_index_ips.py
+++ b/src/s3_log_extraction/ip_utils/_index_ips.py
@@ -40,7 +40,7 @@ def index_ips(*, seed: int = 0) -> None:
 
         redraw = 0
         while index_to_ip.get(new_index, None) is not None and redraw < max_redraws:
-            new_index = int(rng.integers(low=0, high=high))
+            new_index = int(rng.integers(low=0, high=high, size=1, dtype=dtype))
             redraw += 1
 
         if redraw >= max_redraws:

From 94381d73ee5da2dff80c1df1e65f87a78e17132b Mon Sep 17 00:00:00 2001
From: Ross Blair
Date: Wed, 18 Jun 2025 15:03:06 -0500
Subject: [PATCH 05/14] make regex for log file name pattern a property of the
 extractor class

---
 .../extractors/_dandi_s3_log_access_extractor.py             | 2 ++
 src/s3_log_extraction/extractors/_s3_log_access_extractor.py | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/s3_log_extraction/extractors/_dandi_s3_log_access_extractor.py b/src/s3_log_extraction/extractors/_dandi_s3_log_access_extractor.py
index 12a63c06..6c88564a 100644
--- a/src/s3_log_extraction/extractors/_dandi_s3_log_access_extractor.py
+++ b/src/s3_log_extraction/extractors/_dandi_s3_log_access_extractor.py
@@ -38,3 +38,5 @@ def __init__(self, cache_directory: pathlib.Path | None = None) -> None:
 
         ips_to_skip_regex = decrypt_bytes(encrypted_data=DROGON_IP_REGEX_ENCRYPTED)
         self._awk_env["IPS_TO_SKIP_REGEX"] = ips_to_skip_regex.decode("utf-8")
+
+        self.log_pattern = "*.log"
diff --git a/src/s3_log_extraction/extractors/_s3_log_access_extractor.py b/src/s3_log_extraction/extractors/_s3_log_access_extractor.py
index dc248e85..9a5ca703 100644
--- a/src/s3_log_extraction/extractors/_s3_log_access_extractor.py
+++ b/src/s3_log_extraction/extractors/_s3_log_access_extractor.py
@@ -55,6 +55,8 @@ def __init__(self, *, cache_directory: pathlib.Path | None = None) -> None:
         file_processing_end_record_file_name = f"{class_name}_file-processing-end.txt"
         self.file_processing_end_record_file_path = self.records_directory / file_processing_end_record_file_name
 
+        self.log_pattern = "*-*-*-*-*-*-*"
+
         # TODO: does this hold after bundling?
         awk_filename = "_generic_extraction.awk" if sys.platform != "win32" else "_generic_extraction_windows.awk"
         self._relative_script_path = pathlib.Path(__file__).parent / awk_filename
@@ -129,7 +131,7 @@ def extract_directory(self, *, directory: str | pathlib.Path, limit: int | None
 
         all_log_files = {
             str(file_path.absolute())
-            for file_path in natsort.natsorted(seq=directory.rglob(pattern="*"))
+            for file_path in natsort.natsorted(seq=directory.rglob(pattern=self.log_pattern))
             if file_path.is_file()
         }
         unextracted_files = all_log_files - set(self.file_processing_end_record.keys())

From 0e719e670f7dc0d121f61f332b74c411014a6a86 Mon Sep 17 00:00:00 2001
From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com>
Date: Fri, 20 Jun 2025 16:48:40 -0400
Subject: [PATCH 06/14] Update
 src/s3_log_extraction/extractors/_dandi_s3_log_access_extractor.py

---
 .../extractors/_dandi_s3_log_access_extractor.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/s3_log_extraction/extractors/_dandi_s3_log_access_extractor.py b/src/s3_log_extraction/extractors/_dandi_s3_log_access_extractor.py
index 7c5c1d60..970b2c58 100644
--- a/src/s3_log_extraction/extractors/_dandi_s3_log_access_extractor.py
+++ b/src/s3_log_extraction/extractors/_dandi_s3_log_access_extractor.py
@@ -38,5 +38,3 @@ def __init__(self, cache_directory: pathlib.Path | None = None) -> None:
 
         ips_to_skip_regex = decrypt_bytes(encrypted_data=DROGON_IP_REGEX_ENCRYPTED)
         self._awk_env["IPS_TO_SKIP_REGEX"] = ips_to_skip_regex.decode("utf-8")
-
-        self.log_pattern = "*.log"

From c06645a6f128fd9f69269b605747dfb3bda7c4b4 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 23 Jun 2025 23:34:57 +0000
Subject: [PATCH 07/14] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/s3_log_extraction/_command_line_interface/_cli.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/s3_log_extraction/_command_line_interface/_cli.py b/src/s3_log_extraction/_command_line_interface/_cli.py
index d2befc06..82de83db 100644
--- a/src/s3_log_extraction/_command_line_interface/_cli.py
+++ b/src/s3_log_extraction/_command_line_interface/_cli.py
@@ -5,7 +5,6 @@
 
 import click
 
-
 from ..config import get_summary_directory, reset_extraction, set_cache_directory
 from ..extractors import DandiS3LogAccessExtractor, RemoteS3LogAccessExtractor, S3LogAccessExtractor, stop_extraction
 from ..ip_utils import index_ips, update_index_to_region_codes, update_region_code_coordinates

From 51f94da58183fc437d99de80812a8d9c7b4c3814 Mon Sep 17 00:00:00 2001
From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com>
Date: Thu, 26 Jun 2025 12:33:59 -0400
Subject: [PATCH 08/14] resolve conflict

---
 src/s3_log_extraction/_command_line_interface/_cli.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/s3_log_extraction/_command_line_interface/_cli.py b/src/s3_log_extraction/_command_line_interface/_cli.py
index ed8b18ff..58d72ef7 100644
--- a/src/s3_log_extraction/_command_line_interface/_cli.py
+++ b/src/s3_log_extraction/_command_line_interface/_cli.py
@@ -7,9 +7,6 @@
 
 <<<<<<< enh/directory_based_totals
 from ..config import get_summary_directory, reset_extraction, set_cache_directory
-from ..extractors import DandiS3LogAccessExtractor, RemoteS3LogAccessExtractor, S3LogAccessExtractor, stop_extraction
-=======
-from ..config import reset_extraction, set_cache_directory
 from ..extractors import (
     DandiRemoteS3LogAccessExtractor,
     DandiS3LogAccessExtractor,

From d5aa76fa6106f916b71a78b7c281ae221d400f8d Mon Sep 17 00:00:00 2001
From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com>
Date: Thu, 26 Jun 2025 12:37:08 -0400
Subject: [PATCH 09/14] chore: resolve conflict

---
 src/s3_log_extraction/_command_line_interface/_cli.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/s3_log_extraction/_command_line_interface/_cli.py b/src/s3_log_extraction/_command_line_interface/_cli.py
index 58d72ef7..596df59f 100644
--- a/src/s3_log_extraction/_command_line_interface/_cli.py
+++ b/src/s3_log_extraction/_command_line_interface/_cli.py
@@ -5,7 +5,6 @@
 
 import click
 
-<<<<<<< enh/directory_based_totals
 from ..config import get_summary_directory, reset_extraction, set_cache_directory
 from ..extractors import (
     DandiRemoteS3LogAccessExtractor,
@@ -14,7 +13,6 @@
     S3LogAccessExtractor,
     stop_extraction,
 )
->>>>>>> main
 from ..ip_utils import index_ips, update_index_to_region_codes, update_region_code_coordinates
 from ..summarize import (
     generate_all_dandiset_summaries,

From 89f79a45e58eb692711678075a8cd1609f325afc Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 30 Jun 2025 10:08:12 +0000
Subject: [PATCH 10/14] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/s3_log_extraction/extractors/_s3_log_access_extractor.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/s3_log_extraction/extractors/_s3_log_access_extractor.py b/src/s3_log_extraction/extractors/_s3_log_access_extractor.py
index 9aeb03db..4ca1c537 100644
--- a/src/s3_log_extraction/extractors/_s3_log_access_extractor.py
+++ b/src/s3_log_extraction/extractors/_s3_log_access_extractor.py
@@ -193,7 +193,6 @@ def extract_file(
         with self.file_processing_end_record_file_path.open(mode="a") as file_stream:
             file_stream.write(content)
 
-
     def _run_extraction(self, *, file_path: pathlib.Path, extraction_directory: pathlib.Path | None = None) -> None:
         if extraction_directory is not None:
             self._awk_env["EXTRACTION_DIRECTORY"] = str(extraction_directory)

From 9ae6c73e7cc949eb97a3a7f55c1a524effa88119 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 10 Jul 2025 05:28:36 +0000
Subject: [PATCH 11/14] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/s3_log_extraction/_command_line_interface/_cli.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/s3_log_extraction/_command_line_interface/_cli.py b/src/s3_log_extraction/_command_line_interface/_cli.py
index 0b8167a0..20128a44 100644
--- a/src/s3_log_extraction/_command_line_interface/_cli.py
+++ b/src/s3_log_extraction/_command_line_interface/_cli.py
@@ -16,7 +16,6 @@
 )
 from ..ip_utils import index_ips, update_index_to_region_codes, update_region_code_coordinates
 from ..summarize import (
-    generate__dataset_summaries,
     generate_all_dataset_totals,
     generate_archive_summaries,
     generate_archive_totals,

From a892f1dbb9106b7e1df5c90624722e6daee6cbce Mon Sep 17 00:00:00 2001
From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com>
Date: Thu, 10 Jul 2025 01:32:57 -0400
Subject: [PATCH 12/14] Update __init__.py

---
 src/s3_log_extraction/summarize/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/s3_log_extraction/summarize/__init__.py b/src/s3_log_extraction/summarize/__init__.py
index 19bc11d9..c4c5860f 100644
--- a/src/s3_log_extraction/summarize/__init__.py
+++ b/src/s3_log_extraction/summarize/__init__.py
@@ -1,6 +1,6 @@
 from ._generate_all_dandiset_summaries import generate_all_dandiset_summaries
 from ._generate_all_dandiset_totals import generate_all_dandiset_totals
-from ._generate_all_dataset_summaries import generate_all_dataset_summaries
+from ._generate_all_dataset_summaries import generate_summaries
 from ._generate_all_dataset_totals import generate_all_dataset_totals
 from ._generate_archive_summaries import generate_archive_summaries
 from ._generate_archive_totals import generate_archive_totals
@@ -10,10 +10,10 @@
 from ._generate_dandiset_summaries import generate_dandiset_summaries
 
 __all__ = [
-    "generate_dandiset_summaries",
+    "generate_summaries",
     "generate_all_dandiset_totals",
+    "generate_dandiset_summaries",
     "generate_dandiset_totals",
-    "generate_archive_totals",
     "generate_archive_summaries",
     "generate_archive_totals",
 ]

From a7dd8a370adfa67d065303033f1e6c7ab1e2165a Mon Sep 17 00:00:00 2001
From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com>
Date: Thu, 10 Jul 2025 01:42:38 -0400
Subject: [PATCH 13/14] Update _cli.py

---
 src/s3_log_extraction/_command_line_interface/_cli.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/s3_log_extraction/_command_line_interface/_cli.py b/src/s3_log_extraction/_command_line_interface/_cli.py
index 20128a44..c33b6a84 100644
--- a/src/s3_log_extraction/_command_line_interface/_cli.py
+++ b/src/s3_log_extraction/_command_line_interface/_cli.py
@@ -16,6 +16,7 @@
 )
 from ..ip_utils import index_ips, update_index_to_region_codes, update_region_code_coordinates
 from ..summarize import (
+    generate_summaries,
     generate_all_dataset_totals,
     generate_archive_summaries,
     generate_archive_totals,
@@ -257,7 +258,7 @@ def _update_summaries_cli(
         case "archive":
             generate_archive_summaries(get_summary_directory())
         case _:
-            generate_all_dataset_summaries()
+            generate_summaries()
 
 
 # s3logextraction update totals

From f4f6066f15022d9a15e20f687982a775e119b221 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 10 Jul 2025 05:42:45 +0000
Subject: [PATCH 14/14] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/s3_log_extraction/_command_line_interface/_cli.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/s3_log_extraction/_command_line_interface/_cli.py b/src/s3_log_extraction/_command_line_interface/_cli.py
index c33b6a84..6305e437 100644
--- a/src/s3_log_extraction/_command_line_interface/_cli.py
+++ b/src/s3_log_extraction/_command_line_interface/_cli.py
@@ -16,12 +16,12 @@
 )
 from ..ip_utils import index_ips, update_index_to_region_codes, update_region_code_coordinates
 from ..summarize import (
-    generate_summaries,
     generate_all_dataset_totals,
     generate_archive_summaries,
    generate_archive_totals,
     generate_dandiset_summaries,
     generate_dandiset_totals,
+    generate_summaries,
 )
 from ..testing import generate_benchmark
 from ..validate import (
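
Taken together, the series routes the generic (no --mode) paths of "s3logextraction update summaries" and "s3logextraction update totals" to the new directory-grouped functions, with generate_summaries as the exported name after patches 03 and 12. A minimal usage sketch follows, assuming the top-level s3_log_extraction.config and s3_log_extraction.summarize import paths mirror the relative imports in the diffs; the cache path is a hypothetical placeholder, and none of this is verified against a released version.

    # Sketch of the post-series workflow; import locations are inferred from
    # the relative imports in the diffs above, not verified against a release.
    from s3_log_extraction.config import set_cache_directory
    from s3_log_extraction.summarize import generate_all_dataset_totals, generate_summaries

    # Point the package at the cache holding the extracted per-object activity.
    set_cache_directory("/path/to/cache")  # hypothetical location

    # Group extracted activity by top-level directory name: one summary folder
    # per directory, each holding by-day, by-asset, and by-region TSV tables.
    generate_summaries()

    # Roll the per-directory region summaries up into all_dandiset_totals.json
    # at the root of the summary directory.
    generate_all_dataset_totals()

The same pair runs from the command line as "s3logextraction update summaries" followed by "s3logextraction update totals", exercising the case _: branches wired up in patches 01 and 13.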