Commit 1ef3266

Merge pull request #151 from dreadnode/users/raja/update-export-runs-to-paginated
feat: Update export_runs to use paginated API with disk-based exports
2 parents 7f81c4e + 8e3c652 commit 1ef3266

File tree

6 files changed: +353 -92 lines changed

docs/sdk/api.mdx

Lines changed: 59 additions & 23 deletions
````diff
@@ -170,7 +170,6 @@ def export_metrics(
     Returns:
         A DataFrame containing the exported metric data.
     """
-    import pandas as pd

     response = self.request(
         "GET",
@@ -265,7 +264,6 @@ def export_parameters(
     Returns:
         A DataFrame containing the exported parameter data.
     """
-    import pandas as pd

     response = self.request(
         "GET",
@@ -294,36 +292,48 @@ export_runs(
     filter: str | None = None,
     status: StatusFilter = "completed",
     aggregations: list[MetricAggregationType] | None = None,
-) -> pd.DataFrame
+    format: ExportFormat = "parquet",
+    base_dir: str | None = None,
+) -> str
 ```

-Exports run data for a specific project.
+Export runs using pagination - always writes to disk.

 **Parameters:**

 * **`project`**
   (`str`)
-  –The project identifier.
+  –The project identifier
 * **`filter`**
   (`str | None`, default:
   `None`
   )
-  –A filter to apply to the exported data. Defaults to None.
+  –A filter to apply to the exported data
 * **`status`**
   (`StatusFilter`, default:
   `'completed'`
   )
-  –The status of runs to include. Defaults to "completed".
+  –The status of runs to include
 * **`aggregations`**
   (`list[MetricAggregationType] | None`, default:
   `None`
   )
-  –A list of aggregation types to apply. Defaults to None.
+  –A list of aggregation types to apply
+* **`format`**
+  (`ExportFormat`, default:
+  `'parquet'`
+  )
+  –Output format - "parquet", "csv", "json", "jsonl"
+* **`base_dir`**
+  (`str | None`, default:
+  `None`
+  )
+  –Base directory for export (defaults to "./strikes-data")

 **Returns:**

-* `DataFrame`
-  –A DataFrame containing the exported run data.
+* **`str`** (`str`)
+  –Path to the export directory

 <Accordion title="Source code in dreadnode/api/client.py" icon="code">
 ```python
@@ -332,35 +342,61 @@ def export_runs(
     project: str,
     *,
     filter: str | None = None,
-    # format: ExportFormat = "parquet",
     status: StatusFilter = "completed",
     aggregations: list[MetricAggregationType] | None = None,
-) -> "pd.DataFrame":
+    format: ExportFormat = "parquet",
+    base_dir: str | None = None,
+) -> str:
     """
-    Exports run data for a specific project.
+    Export runs using pagination - always writes to disk.

     Args:
-        project: The project identifier.
-        filter: A filter to apply to the exported data. Defaults to None.
-        status: The status of runs to include. Defaults to "completed".
-        aggregations: A list of aggregation types to apply. Defaults to None.
+        project: The project identifier
+        filter: A filter to apply to the exported data
+        status: The status of runs to include
+        aggregations: A list of aggregation types to apply
+        format: Output format - "parquet", "csv", "json", "jsonl"
+        base_dir: Base directory for export (defaults to "./strikes-data")

     Returns:
-        A DataFrame containing the exported run data.
+        str: Path to the export directory
     """
-    import pandas as pd

-    response = self.request(
+    logger.info(f"Starting paginated export for project '{project}', format='{format}'")
+
+    page = 1
+    first_response = self.request(
         "GET",
-        f"/strikes/projects/{project!s}/export",
+        f"/strikes/projects/{project!s}/export/paginated",
         params={
-            "format": "parquet",
+            "page": page,
             "status": status,
             **({"filter": filter} if filter else {}),
             **({"aggregations": aggregations} if aggregations else {}),
         },
     )
-    return pd.read_parquet(io.BytesIO(response.content))
+
+    if not first_response.content:
+        logger.info("No data found")
+
+    first_chunk = pd.read_parquet(io.BytesIO(first_response.content))
+
+    total_runs = int(first_response.headers.get("x-total", "0"))
+    has_more = first_response.headers.get("x-has-more", "false") == "true"
+
+    logger.info(f"Total runs: {total_runs}, Has more: {has_more}")
+
+    logger.info(f"Writing {total_runs} runs to disk")
+    return self._export_to_disk(
+        project,
+        first_chunk,
+        dict(first_response.headers),
+        filter,
+        status,
+        aggregations,
+        format,
+        str(base_dir) if base_dir else None,
+    )
 ```
````
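The practical effect of this change: `export_runs` no longer returns an in-memory DataFrame but writes the export to disk and returns the directory path. A minimal usage sketch, assuming `client` is an already-constructed instance of the API client from `dreadnode/api/client.py` (construction details are not part of this diff) and that a parquet export leaves one or more `.parquet` files under the returned directory:

```python
from pathlib import Path

import pandas as pd

# `client` is assumed to be a constructed dreadnode API client;
# how it is built is outside the scope of this diff.
export_dir = client.export_runs(
    "my-project",
    status="completed",
    format="parquet",           # or "csv", "json", "jsonl"
    base_dir="./strikes-data",  # the documented default when None
)

# Hypothetical post-processing: load the exported parquet files back
# into a single DataFrame, recovering the old return value by hand.
frames = [pd.read_parquet(p) for p in Path(export_dir).rglob("*.parquet")]
runs = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
```

The method itself only fetches page 1 and hands off to `_export_to_disk`, which is not shown in this diff. Going by the `x-total` and `x-has-more` headers it reads, the remaining pages are presumably fetched with a loop along these lines (a sketch, assuming the endpoint accepts an incrementing `page` parameter):

```python
import io

import pandas as pd

project = "my-project"
chunks = []
page = 1
while True:
    response = client.request(
        "GET",
        f"/strikes/projects/{project}/export/paginated",
        params={"page": page, "status": "completed"},
    )
    # An empty body means there is nothing (more) to export.
    if not response.content:
        break
    chunks.append(pd.read_parquet(io.BytesIO(response.content)))
    # The server signals remaining pages via the x-has-more header.
    if response.headers.get("x-has-more", "false") != "true":
        break
    page += 1
```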

docs/sdk/data_types.mdx

Lines changed: 1 addition & 1 deletion
````diff
@@ -643,7 +643,7 @@ def to_serializable(self) -> tuple[bytes, dict[str, t.Any]]:
     Returns:
         A tuple of (video_bytes, metadata_dict)
     """
-    import numpy as np  # type: ignore[import,unused-ignore]
+    import numpy as np  # type: ignore[import,unused-ignore]  # noqa: PLC0415

     try:
         from moviepy.video.VideoClip import (  # type: ignore[import,unused-ignore,import-untyped]
````
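This and the remaining files mostly add `# noqa: PLC0415` markers. PLC0415 is Ruff's import-outside-top-level rule; the SDK deliberately defers heavy or optional imports (numpy, moviepy, transformers, and so on) into function bodies so the package imports quickly without them, and the suppressions make that choice explicit. A minimal sketch of the pattern, with a hypothetical function name:

```python
import typing as t


def to_serializable_sketch(data: t.Any) -> bytes:
    # Deferred import: numpy is only needed when this function runs,
    # so installs without it can still import the module. Ruff's
    # PLC0415 flags in-function imports, hence the suppression.
    import numpy as np  # noqa: PLC0415

    return np.asarray(data).tobytes()
```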

docs/sdk/main.mdx

Lines changed: 1 addition & 1 deletion
````diff
@@ -972,7 +972,7 @@ with dreadnode.run("my_run"):
 def log_metric(
     self,
     name: str,
-    value: float | bool | Metric,
+    value: float | bool | Metric,  # noqa: FBT001
     *,
     step: int = 0,
     origin: t.Any | None = None,
````
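FBT001 is Ruff's boolean-trap rule: it flags boolean-typed positional parameters because bare `True`/`False` arguments can be unreadable at call sites. Here `bool` is an intentional arm of the `float | bool | Metric` union, so the rule is suppressed rather than making `value` keyword-only. Illustrative call sites (hypothetical, though `log_metric` and `dreadnode.run` both appear in the surrounding docs):

```python
import dreadnode

with dreadnode.run("my_run"):
    dreadnode.log_metric("accuracy", 0.92)  # the common float case
    # A positional bool reads fine here because the metric name carries
    # the meaning; how bool values are stored is not shown in this diff.
    dreadnode.log_metric("passed", True)
```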

docs/sdk/scorers.mdx

Lines changed: 5 additions & 5 deletions
````diff
@@ -1941,7 +1941,7 @@ def zero_shot_classification(
     )

     try:
-        from transformers import (  # type: ignore [attr-defined,import-not-found,unused-ignore]
+        from transformers import (  # type: ignore [attr-defined,import-not-found,unused-ignore] # noqa: PLC0415
             pipeline,
         )
     except ImportError:
@@ -2661,7 +2661,7 @@ def detect_harm_with_openai(
         model: The moderation model to use.
         name: Name of the scorer.
     """
-    import openai
+    import openai  # noqa: PLC0415

     async def evaluate(
         data: t.Any, *, api_key: str | None = Config(api_key), model: str = Config(model)
@@ -3373,7 +3373,7 @@ def detect_pii_with_presidio(
     )

     try:
-        import presidio_analyzer  # type: ignore[import-not-found,unused-ignore]  # noqa: F401
+        import presidio_analyzer  # type: ignore[import-not-found,unused-ignore]  # noqa: F401, PLC0415
     except ImportError:
         warn_at_user_stacklevel(presidio_import_error_msg, UserWarning)
@@ -3589,7 +3589,7 @@ def wrap_chat(
     """

     async def evaluate(chat: "Chat") -> Metric:
-        from rigging.chat import Chat
+        from rigging.chat import Chat  # noqa: PLC0415

         # Fall through to the inner scorer if chat is not a Chat instance
         if not isinstance(chat, Chat):
@@ -4215,7 +4215,7 @@ def similarity_with_litellm(
         or self-hosted models.
         name: Name of the scorer.
     """
-    import litellm
+    import litellm  # noqa: PLC0415

     async def evaluate(
         data: t.Any,
````
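Several of these scorers pair the deferred import with a graceful `ImportError` path that warns instead of failing when an optional dependency is missing. A sketch of that shape with a hypothetical scorer function; the diff's `warn_at_user_stacklevel` helper is approximated here with plain `warnings.warn`:

```python
import typing as t
import warnings

presidio_import_error_msg = (
    "presidio_analyzer is not installed; install it to use this scorer."
)


def detect_pii_sketch(data: t.Any) -> list[str]:
    try:
        # Deferred, optional import (noqa: F401, PLC0415 in the real code).
        import presidio_analyzer  # noqa: F401, PLC0415
    except ImportError:
        # Degrade to a warning so the module stays usable without presidio.
        warnings.warn(presidio_import_error_msg, UserWarning, stacklevel=2)
        return []

    # Minimal presidio usage: analyze the text and return the entity types.
    analyzer = presidio_analyzer.AnalyzerEngine()
    results = analyzer.analyze(text=str(data), language="en")
    return [r.entity_type for r in results]
```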
