Commit 1ef3266

Merge pull request #151 from dreadnode/users/raja/update-export-runs-to-paginated
feat: Update export_runs to use paginated API with disk-based exports
2 parents 7f81c4e + 8e3c652 commit 1ef3266

File tree

6 files changed: +353 -92 lines changed

docs/sdk/api.mdx

Lines changed: 59 additions & 23 deletions
````diff
@@ -170,7 +170,6 @@ def export_metrics(
     Returns:
         A DataFrame containing the exported metric data.
     """
-    import pandas as pd

     response = self.request(
         "GET",
@@ -265,7 +264,6 @@ def export_parameters(
     Returns:
         A DataFrame containing the exported parameter data.
     """
-    import pandas as pd

     response = self.request(
         "GET",
@@ -294,36 +292,48 @@ export_runs(
     filter: str | None = None,
     status: StatusFilter = "completed",
     aggregations: list[MetricAggregationType] | None = None,
-) -> pd.DataFrame
+    format: ExportFormat = "parquet",
+    base_dir: str | None = None,
+) -> str
 ```

-Exports run data for a specific project.
+Export runs using pagination - always writes to disk.

 **Parameters:**

 * **`project`**
   (`str`)
-  –The project identifier.
+  –The project identifier
 * **`filter`**
   (`str | None`, default:
   `None`
   )
-  –A filter to apply to the exported data. Defaults to None.
+  –A filter to apply to the exported data
 * **`status`**
   (`StatusFilter`, default:
   `'completed'`
   )
-  –The status of runs to include. Defaults to "completed".
+  –The status of runs to include
 * **`aggregations`**
   (`list[MetricAggregationType] | None`, default:
   `None`
   )
-  –A list of aggregation types to apply. Defaults to None.
+  –A list of aggregation types to apply
+* **`format`**
+  (`ExportFormat`, default:
+  `'parquet'`
+  )
+  –Output format - "parquet", "csv", "json", "jsonl"
+* **`base_dir`**
+  (`str | None`, default:
+  `None`
+  )
+  –Base directory for export (defaults to "./strikes-data")

 **Returns:**

-* `DataFrame`
-  –A DataFrame containing the exported run data.
+* **`str`** (`str`)
+  –Path to the export directory

 <Accordion title="Source code in dreadnode/api/client.py" icon="code">
 ```python
@@ -332,35 +342,61 @@ def export_runs(
     project: str,
     *,
     filter: str | None = None,
-    # format: ExportFormat = "parquet",
     status: StatusFilter = "completed",
     aggregations: list[MetricAggregationType] | None = None,
-) -> "pd.DataFrame":
+    format: ExportFormat = "parquet",
+    base_dir: str | None = None,
+) -> str:
     """
-    Exports run data for a specific project.
+    Export runs using pagination - always writes to disk.

     Args:
-        project: The project identifier.
-        filter: A filter to apply to the exported data. Defaults to None.
-        status: The status of runs to include. Defaults to "completed".
-        aggregations: A list of aggregation types to apply. Defaults to None.
+        project: The project identifier
+        filter: A filter to apply to the exported data
+        status: The status of runs to include
+        aggregations: A list of aggregation types to apply
+        format: Output format - "parquet", "csv", "json", "jsonl"
+        base_dir: Base directory for export (defaults to "./strikes-data")

     Returns:
-        A DataFrame containing the exported run data.
+        str: Path to the export directory
     """
-    import pandas as pd

-    response = self.request(
+    logger.info(f"Starting paginated export for project '{project}', format='{format}'")
+
+    page = 1
+    first_response = self.request(
         "GET",
-        f"/strikes/projects/{project!s}/export",
+        f"/strikes/projects/{project!s}/export/paginated",
         params={
-            "format": "parquet",
+            "page": page,
             "status": status,
             **({"filter": filter} if filter else {}),
             **({"aggregations": aggregations} if aggregations else {}),
         },
     )
-    return pd.read_parquet(io.BytesIO(response.content))
+
+    if not first_response.content:
+        logger.info("No data found")
+
+    first_chunk = pd.read_parquet(io.BytesIO(first_response.content))
+
+    total_runs = int(first_response.headers.get("x-total", "0"))
+    has_more = first_response.headers.get("x-has-more", "false") == "true"
+
+    logger.info(f"Total runs: {total_runs}, Has more: {has_more}")
+
+    logger.info(f"Writing {total_runs} runs to disk")
+    return self._export_to_disk(
+        project,
+        first_chunk,
+        dict(first_response.headers),
+        filter,
+        status,
+        aggregations,
+        format,
+        str(base_dir) if base_dir else None,
+    )
 ```
````
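The practical effect of this change: `export_runs` no longer returns an in-memory DataFrame but writes the export to disk and returns the directory path. A minimal usage sketch, assuming `client` is an already-constructed instance of the API client from `dreadnode/api/client.py` (construction details are not part of this diff) and that a parquet export leaves one or more `.parquet` files under the returned directory:

```python
from pathlib import Path

import pandas as pd

# `client` is assumed to be a constructed dreadnode API client;
# how it is built is outside the scope of this diff.
export_dir = client.export_runs(
    "my-project",
    status="completed",
    format="parquet",           # or "csv", "json", "jsonl"
    base_dir="./strikes-data",  # the documented default when None
)

# Hypothetical post-processing: load the exported parquet files back
# into a single DataFrame, recovering the old return value by hand.
frames = [pd.read_parquet(p) for p in Path(export_dir).rglob("*.parquet")]
runs = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
```

The method itself only fetches page 1 and hands off to `_export_to_disk`, which is not shown in this diff. Going by the `x-total` and `x-has-more` headers it reads, the remaining pages are presumably fetched with a loop along these lines (a sketch, assuming the endpoint accepts an incrementing `page` parameter):

```python
import io

import pandas as pd

project = "my-project"
chunks = []
page = 1
while True:
    response = client.request(
        "GET",
        f"/strikes/projects/{project}/export/paginated",
        params={"page": page, "status": "completed"},
    )
    # An empty body means there is nothing (more) to export.
    if not response.content:
        break
    chunks.append(pd.read_parquet(io.BytesIO(response.content)))
    # The server signals remaining pages via the x-has-more header.
    if response.headers.get("x-has-more", "false") != "true":
        break
    page += 1
```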

docs/sdk/data_types.mdx

Lines changed: 1 addition & 1 deletion
````diff
@@ -643,7 +643,7 @@ def to_serializable(self) -> tuple[bytes, dict[str, t.Any]]:
     Returns:
         A tuple of (video_bytes, metadata_dict)
     """
-    import numpy as np  # type: ignore[import,unused-ignore]
+    import numpy as np  # type: ignore[import,unused-ignore]  # noqa: PLC0415

     try:
         from moviepy.video.VideoClip import (  # type: ignore[import,unused-ignore,import-untyped]
````
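This and the remaining files mostly add `# noqa: PLC0415` markers. PLC0415 is Ruff's import-outside-top-level rule; the SDK deliberately defers heavy or optional imports (numpy, moviepy, transformers, and so on) into function bodies so the package imports quickly without them, and the suppressions make that choice explicit. A minimal sketch of the pattern, with a hypothetical function name:

```python
import typing as t


def to_serializable_sketch(data: t.Any) -> bytes:
    # Deferred import: numpy is only needed when this function runs,
    # so installs without it can still import the module. Ruff's
    # PLC0415 flags in-function imports, hence the suppression.
    import numpy as np  # noqa: PLC0415

    return np.asarray(data).tobytes()
```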

docs/sdk/main.mdx

Lines changed: 1 addition & 1 deletion
````diff
@@ -972,7 +972,7 @@ with dreadnode.run("my_run"):
 def log_metric(
     self,
     name: str,
-    value: float | bool | Metric,
+    value: float | bool | Metric,  # noqa: FBT001
     *,
     step: int = 0,
     origin: t.Any | None = None,
````
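FBT001 is Ruff's boolean-trap rule: it flags boolean-typed positional parameters because bare `True`/`False` arguments can be unreadable at call sites. Here `bool` is an intentional arm of the `float | bool | Metric` union, so the rule is suppressed rather than making `value` keyword-only. Illustrative call sites (hypothetical, though `log_metric` and `dreadnode.run` both appear in the surrounding docs):

```python
import dreadnode

with dreadnode.run("my_run"):
    dreadnode.log_metric("accuracy", 0.92)  # the common float case
    # A positional bool reads fine here because the metric name carries
    # the meaning; how bool values are stored is not shown in this diff.
    dreadnode.log_metric("passed", True)
```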

docs/sdk/scorers.mdx

Lines changed: 5 additions & 5 deletions
````diff
@@ -1941,7 +1941,7 @@ def zero_shot_classification(
     )

     try:
-        from transformers import (  # type: ignore [attr-defined,import-not-found,unused-ignore]
+        from transformers import (  # type: ignore [attr-defined,import-not-found,unused-ignore] # noqa: PLC0415
             pipeline,
         )
     except ImportError:
@@ -2661,7 +2661,7 @@ def detect_harm_with_openai(
         model: The moderation model to use.
         name: Name of the scorer.
     """
-    import openai
+    import openai  # noqa: PLC0415

     async def evaluate(
         data: t.Any, *, api_key: str | None = Config(api_key), model: str = Config(model)
@@ -3373,7 +3373,7 @@ def detect_pii_with_presidio(
     )

     try:
-        import presidio_analyzer  # type: ignore[import-not-found,unused-ignore]  # noqa: F401
+        import presidio_analyzer  # type: ignore[import-not-found,unused-ignore]  # noqa: F401, PLC0415
     except ImportError:
         warn_at_user_stacklevel(presidio_import_error_msg, UserWarning)
@@ -3589,7 +3589,7 @@ def wrap_chat(
     """

     async def evaluate(chat: "Chat") -> Metric:
-        from rigging.chat import Chat
+        from rigging.chat import Chat  # noqa: PLC0415

         # Fall through to the inner scorer if chat is not a Chat instance
         if not isinstance(chat, Chat):
@@ -4215,7 +4215,7 @@ def similarity_with_litellm(
         or self-hosted models.
         name: Name of the scorer.
     """
-    import litellm
+    import litellm  # noqa: PLC0415

     async def evaluate(
         data: t.Any,
````
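Several of these scorers pair the deferred import with a graceful `ImportError` path that warns instead of failing when an optional dependency is missing. A sketch of that shape with a hypothetical scorer function; the diff's `warn_at_user_stacklevel` helper is approximated here with plain `warnings.warn`:

```python
import typing as t
import warnings

presidio_import_error_msg = (
    "presidio_analyzer is not installed; install it to use this scorer."
)


def detect_pii_sketch(data: t.Any) -> list[str]:
    try:
        # Deferred, optional import (noqa: F401, PLC0415 in the real code).
        import presidio_analyzer  # noqa: F401, PLC0415
    except ImportError:
        # Degrade to a warning so the module stays usable without presidio.
        warnings.warn(presidio_import_error_msg, UserWarning, stacklevel=2)
        return []

    # Minimal presidio usage: analyze the text and return the entity types.
    analyzer = presidio_analyzer.AnalyzerEngine()
    results = analyzer.analyze(text=str(data), language="en")
    return [r.entity_type for r in results]
```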
