Changes from all commits
37 commits
9fa4307
Add simple generate summaries and totals functions that group by dire…
rwblair Jun 10, 2025
a3e1f71
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 10, 2025
948b171
Merge branch 'main' into enh/directory_based_totals
CodyCBakerPhD Jun 11, 2025
2b71944
Merge branch 'main' into enh/directory_based_totals
CodyCBakerPhD Jun 15, 2025
e4566a4
Update src/s3_log_extraction/summarize/_generate_all_dataset_summarie…
rwblair Jun 18, 2025
91c663f
pass dtype into rng.integers on new_index collision to match original…
rwblair Jun 18, 2025
94381d7
make regex for log file name pattern a property of the extractor class
rwblair Jun 18, 2025
1ff2b9c
Merge branch 'enh/directory_based_totals' of github.com:rwblair/s3-lo…
rwblair Jun 18, 2025
385d211
Merge branch 'main' into enh/directory_based_totals
CodyCBakerPhD Jun 20, 2025
0e719e6
Update src/s3_log_extraction/extractors/_dandi_s3_log_access_extracto…
CodyCBakerPhD Jun 20, 2025
aeca81f
Merge branch 'main' into enh/directory_based_totals
CodyCBakerPhD Jun 23, 2025
15ce9bf
Merge branch 'main' into enh/directory_based_totals
CodyCBakerPhD Jun 23, 2025
c06645a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 23, 2025
ad3ce93
Merge branch 'main' into enh/directory_based_totals
CodyCBakerPhD Jun 26, 2025
51f94da
resolve conflict
CodyCBakerPhD Jun 26, 2025
d5aa76f
chore: resolve conflict
CodyCBakerPhD Jun 26, 2025
bb39862
Merge branch 'main' into enh/directory_based_totals
CodyCBakerPhD Jun 30, 2025
89f79a4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 30, 2025
345f852
Merge branch 'main' into enh/directory_based_totals
CodyCBakerPhD Jun 30, 2025
0edea33
Merge branch 'main' into enh/directory_based_totals
CodyCBakerPhD Jul 10, 2025
9ae6c73
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 10, 2025
a892f1d
Update __init__.py
CodyCBakerPhD Jul 10, 2025
a7dd8a3
Update _cli.py
CodyCBakerPhD Jul 10, 2025
f4f6066
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jul 10, 2025
62a7cdc
Merge branch 'main' into enh/directory_based_totals
CodyCBakerPhD Jul 18, 2025
f8d9ac1
Merge branch 'main' into enh/directory_based_totals
CodyCBakerPhD Jul 21, 2025
4d98bee
Merge branch 'main' into enh/directory_based_totals
CodyCBakerPhD Jul 21, 2025
0834b64
Merge branch 'main' into enh/directory_based_totals
CodyCBakerPhD Jul 22, 2025
fefb9a9
Merge branch 'main' into enh/directory_based_totals
CodyCBakerPhD Jul 28, 2025
f29b0cb
Merge branch 'main' into enh/directory_based_totals
CodyCBakerPhD Jul 28, 2025
b38db38
Merge branch 'main' into enh/directory_based_totals
CodyCBakerPhD Jul 28, 2025
95836e5
Merge branch 'main' into enh/directory_based_totals
CodyCBakerPhD Aug 5, 2025
4c47bcf
Merge branch 'main' into enh/directory_based_totals
CodyCBakerPhD Aug 20, 2025
880549f
Merge branch 'main' into enh/directory_based_totals
CodyCBakerPhD Aug 21, 2025
c0582f8
Merge branch 'main' into enh/directory_based_totals
CodyCBakerPhD Aug 25, 2025
2fc79dc
Merge branch 'main' into enh/directory_based_totals
CodyCBakerPhD Sep 1, 2025
f0fb37c
Merge branch 'main' into enh/directory_based_totals
CodyCBakerPhD Sep 8, 2025
15 changes: 8 additions & 7 deletions src/s3_log_extraction/_command_line_interface/_cli.py
@@ -6,7 +6,7 @@
import click
import pydantic

from ..config import reset_extraction, set_cache_directory
from ..config import get_summary_directory, reset_extraction, set_cache_directory
from ..database import bundle_database
from ..extractors import (
DandiRemoteS3LogAccessExtractor,
@@ -17,10 +17,12 @@
)
from ..ip_utils import index_ips, update_index_to_region_codes, update_region_code_coordinates
from ..summarize import (
generate_all_dataset_totals,
generate_archive_summaries,
generate_archive_totals,
generate_dandiset_summaries,
generate_dandiset_totals,
generate_summaries,
)
from ..testing import generate_benchmark
from ..validate import (
@@ -223,6 +225,7 @@ def _update_ip_coordinates_cli() -> None:
"--mode",
help=(
"Generate condensed summaries of activity across the extracted data per object key. "
"Defaults to grouping summaries by top level prefix."
"Mode 'dandi' will map asset hashes to Dandisets and their content filenames. "
"Mode 'archive' aggregates over all dataset summaries."
),
@@ -268,10 +271,9 @@ def _update_summaries_cli(
skip_as_list = skip.split(",") if skip is not None else None
generate_dandiset_summaries(pick=pick_as_list, skip=skip_as_list, workers=workers)
case "archive":
generate_archive_summaries()
generate_archive_summaries(get_summary_directory())
case _:
message = "The generic mode is not yet implemented - please raise an issue to discuss."
click.echo(message=message, err=True)
generate_summaries()


# s3logextraction update database
@@ -299,10 +301,9 @@ def _update_totals_cli(mode: typing.Literal["dandi", "archive"] | None = None) -
case "dandi":
generate_dandiset_totals()
case "archive":
generate_archive_totals()
generate_archive_totals(get_summary_directory())
case _:
message = "The generic mode is not yet implemented - please raise an issue to discuss."
click.echo(message=message, err=True)
generate_all_dataset_totals()


# s3logextraction testing
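With this change, running the summaries and totals update commands without --mode falls through to the new directory-based helpers instead of printing a "not yet implemented" message. A minimal sketch of the equivalent Python calls, assuming the package is installed and the cache directory has already been configured via set_cache_directory:

from s3_log_extraction.summarize import generate_all_dataset_totals, generate_summaries

# Default behavior of the summaries update: write per-directory TSV summaries (by day, asset, and region).
generate_summaries()

# Default behavior of the totals update: aggregate those summaries into a single top-level JSON file.
generate_all_dataset_totals()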
10 changes: 9 additions & 1 deletion src/s3_log_extraction/summarize/__init__.py
@@ -1,11 +1,19 @@
from ._generate_all_dandiset_summaries import generate_all_dandiset_summaries
from ._generate_all_dandiset_totals import generate_all_dandiset_totals
from ._generate_all_dataset_summaries import generate_summaries
from ._generate_all_dataset_totals import generate_all_dataset_totals
from ._generate_archive_summaries import generate_archive_summaries
from ._generate_archive_totals import generate_archive_totals
from ._generate_archive_totals import generate_archive_totals
from ._generate_archive_summaries import generate_archive_summaries
from ._generate_dandiset_totals import generate_dandiset_totals
from ._generate_dandiset_summaries import generate_dandiset_summaries

__all__ = [
"generate_summaries",
"generate_all_dandiset_totals",
"generate_dandiset_summaries",
"generate_dandiset_totals",
"generate_archive_totals",
"generate_archive_summaries",
"generate_archive_totals",
]
166 changes: 166 additions & 0 deletions src/s3_log_extraction/summarize/_generate_all_dataset_summaries.py
@@ -0,0 +1,166 @@
import collections
import datetime
import pathlib

import pandas
import tqdm

from ..config import get_extraction_directory, get_summary_directory
from ..ip_utils import load_ip_cache


def generate_summaries(level: int = 0) -> None:
extraction_directory = get_extraction_directory()

datasets = [item for item in extraction_directory.iterdir() if item.is_dir()]

summary_directory = get_summary_directory()
index_to_region = load_ip_cache(cache_type="index_to_region")

for dataset in tqdm.tqdm(
iterable=datasets,
total=len(datasets),
desc="Summarizing Datasets",
position=0,
leave=True,
mininterval=5.0,
smoothing=0,
unit="dataset",
):
dataset_id = dataset.name
asset_directories = [file_path for file_path in dataset.rglob(pattern="*") if file_path.is_dir()]
_summarize_dataset(
dataset_id=dataset_id,
asset_directories=asset_directories,
summary_directory=summary_directory,
index_to_region=index_to_region,
)


def _summarize_dataset(
*,
dataset_id: str,
asset_directories: list[pathlib.Path],
summary_directory: pathlib.Path,
index_to_region: dict[int, str],
) -> None:
_summarize_dataset_by_day(
asset_directories=asset_directories,
summary_file_path=summary_directory / dataset_id / "dandiset_summary_by_day.tsv",
)
_summarize_dataset_by_asset(
asset_directories=asset_directories,
summary_file_path=summary_directory / dataset_id / "dandiset_summary_by_asset.tsv",
)
_summarize_dataset_by_region(
asset_directories=asset_directories,
summary_file_path=summary_directory / dataset_id / "dandiset_summary_by_region.tsv",
index_to_region=index_to_region,
)


def _summarize_dataset_by_day(*, asset_directories: list[pathlib.Path], summary_file_path: pathlib.Path) -> None:
all_dates = []
all_bytes_sent = []
for asset_directory in asset_directories:
# TODO: Could add a step here to track which object IDs have been processed, and if encountered again
# Just copy the file over instead of reprocessing

timestamps_file_path = asset_directory / "timestamps.txt"

if not timestamps_file_path.exists():
continue

dates = [
datetime.datetime.strptime(str(timestamp.strip()), "%y%m%d%H%M%S").strftime(format="%Y-%m-%d")
for timestamp in timestamps_file_path.read_text().splitlines()
]
all_dates.extend(dates)

bytes_sent_file_path = asset_directory / "bytes_sent.txt"
bytes_sent = [int(value.strip()) for value in bytes_sent_file_path.read_text().splitlines()]
all_bytes_sent.extend(bytes_sent)

summarized_activity_by_day = collections.defaultdict(int)
for date, bytes_sent in zip(all_dates, all_bytes_sent):
summarized_activity_by_day[date] += bytes_sent

if len(summarized_activity_by_day) == 0:
return

summary_file_path.parent.mkdir(parents=True, exist_ok=True)
summary_table = pandas.DataFrame(
data={
"date": list(summarized_activity_by_day.keys()),
"bytes_sent": list(summarized_activity_by_day.values()),
}
)
summary_table.sort_values(by="date", inplace=True)
summary_table.index = range(len(summary_table))
summary_table.to_csv(path_or_buf=summary_file_path, mode="w", sep="\t", header=True, index=True)


def _summarize_dataset_by_asset(*, asset_directories: list[pathlib.Path], summary_file_path: pathlib.Path) -> None:
summarized_activity_by_asset = collections.defaultdict(int)
for asset_directory in asset_directories:
# TODO: Could add a step here to track which object IDs have been processed, and if encountered again
# Just copy the file over instead of reprocessing
bytes_sent_file_path = asset_directory / "bytes_sent.txt"

if not bytes_sent_file_path.exists():
continue

bytes_sent = [int(value.strip()) for value in bytes_sent_file_path.read_text().splitlines()]

asset_path = str(asset_directory)
summarized_activity_by_asset[asset_path] += sum(bytes_sent)

if len(summarized_activity_by_asset) == 0:
return

summary_file_path.parent.mkdir(parents=True, exist_ok=True)
summary_table = pandas.DataFrame(
data={
"asset_path": list(summarized_activity_by_asset.keys()),
"bytes_sent": list(summarized_activity_by_asset.values()),
}
)
summary_table.to_csv(path_or_buf=summary_file_path, mode="w", sep="\t", header=True, index=True)


def _summarize_dataset_by_region(
*, asset_directories: list[pathlib.Path], summary_file_path: pathlib.Path, index_to_region: dict[int, str]
) -> None:
all_regions = []
all_bytes_sent = []
for asset_directory in asset_directories:
# TODO: Could add a step here to track which object IDs have been processed, and if encountered again
# Just copy the file over instead of reprocessing
indexed_ips_file_path = asset_directory / "indexed_ips.txt"

if not indexed_ips_file_path.exists():
continue

indexed_ips = [ip_index.strip() for ip_index in indexed_ips_file_path.read_text().splitlines()]
regions = [index_to_region.get(ip_index.strip(), "unknown") for ip_index in indexed_ips]
all_regions.extend(regions)

bytes_sent_file_path = asset_directory / "bytes_sent.txt"
bytes_sent = [int(value.strip()) for value in bytes_sent_file_path.read_text().splitlines()]
all_bytes_sent.extend(bytes_sent)

summarized_activity_by_region = collections.defaultdict(int)
for region, bytes_sent in zip(all_regions, all_bytes_sent):
summarized_activity_by_region[region] += bytes_sent

if len(summarized_activity_by_region) == 0:
return

summary_file_path.parent.mkdir(parents=True, exist_ok=True)
summary_table = pandas.DataFrame(
data={
"region": list(summarized_activity_by_region.keys()),
"bytes_sent": list(summarized_activity_by_region.values()),
}
)
summary_table.to_csv(path_or_buf=summary_file_path, mode="w", sep="\t", header=True, index=True)
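For reference, a hedged usage sketch of reading back one of the per-dataset TSVs written above. The dataset ID "000003" is a hypothetical example; the path assumes the configured summary directory:

import pandas

from s3_log_extraction.config import get_summary_directory

# "000003" is a hypothetical dataset ID; each dataset folder holds the three TSVs written above.
by_day_file_path = get_summary_directory() / "000003" / "dandiset_summary_by_day.tsv"
by_day = pandas.read_table(filepath_or_buffer=by_day_file_path, index_col=0)
print(by_day["bytes_sent"].sum())  # total bytes sent for this dataset across all days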
57 changes: 57 additions & 0 deletions src/s3_log_extraction/summarize/_generate_all_dataset_totals.py
@@ -0,0 +1,57 @@
import json
import pathlib

import pandas

from ..config import get_summary_directory


def generate_all_dataset_totals(
summary_directory: str | pathlib.Path | None = None,
) -> None:
"""
Generate top-level totals of summarized access activity for all dandisets.

Parameters
----------
summary_directory : str | pathlib.Path, optional
Path to the folder containing all Dandiset summaries of the S3 access logs.
Defaults to the configured summary directory.
"""
if summary_directory:
summary_directory = pathlib.Path(summary_directory)
else:
summary_directory = get_summary_directory()

# TODO: record progress over

all_dandiset_totals = {}
for dandiset_id_folder_path in summary_directory.iterdir():
if not dandiset_id_folder_path.is_dir():
continue # TODO: use better structure for separating mapped activity from summaries
dandiset_id = dandiset_id_folder_path.name

summary_file_path = summary_directory / dandiset_id / "dandiset_summary_by_region.tsv"
summary = pandas.read_table(filepath_or_buffer=summary_file_path)

unique_countries = {}
for region in summary["region"]:
if region in ["VPN", "GitHub", "unknown"]:
continue

country_code, region_name = region.split("/")
if "AWS" in country_code:
country_code = region_name.split("-")[0].upper()

unique_countries[country_code] = True

number_of_unique_regions = len(summary["region"])
number_of_unique_countries = len(unique_countries)
all_dandiset_totals[dandiset_id] = {
"total_bytes_sent": int(summary["bytes_sent"].sum()),
"number_of_unique_regions": number_of_unique_regions,
"number_of_unique_countries": number_of_unique_countries,
}

top_level_summary_file_path = summary_directory / "all_dandiset_totals.json"
with top_level_summary_file_path.open(mode="w") as io:
json.dump(obj=all_dandiset_totals, fp=io)
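A short hedged sketch of consuming the JSON written by generate_all_dataset_totals; the file name and fields follow directly from the code above, and the path assumes the configured summary directory:

import json

from s3_log_extraction.config import get_summary_directory

totals_file_path = get_summary_directory() / "all_dandiset_totals.json"
with totals_file_path.open(mode="r") as io:
    all_dataset_totals = json.load(fp=io)

# Each entry maps a dataset ID to its total_bytes_sent, number_of_unique_regions,
# and number_of_unique_countries.
for dataset_id, totals in all_dataset_totals.items():
    print(dataset_id, totals["total_bytes_sent"])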