Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 15 additions & 5 deletions .github/workflows/python-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,28 @@ jobs:
with:
python-version: "3.12"

- name: Install dependencies
- name: Install uv
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install pytest
curl -LsSf https://astral.sh/uv/install.sh | sh
echo "$HOME/.local/bin" >> $GITHUB_PATH

- name: Cache uv environment
uses: actions/cache@v3
with:
path: .venv
key: ${{ runner.os }}-uv-${{ hashFiles('pyproject.toml') }}
restore-keys: |
${{ runner.os }}-uv-

- name: Sync dependencies
run: uv sync

- name: Create .env file for testing
run: |
echo "API_KEY=${{ secrets.SYNTHEX_API_KEY }}" > .env

- name: Run tests
run: pytest
run: uv run pytest

publish:
name: Build and Publish to PyPI
Expand Down
13 changes: 12 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
## Release v0.4.1 - September 4, 2025

### Added

- Added a `job_id` field to the response of the `generate_data` method.

### Modified

- Updated the `status` method in `JobsAPI` to accept a `job_id` parameter, allowing users to check the status of any job by its ID.
- Changed the `generate_data` method to initiate data fetching in a separate thread, enabling asynchronous job processing.

## Release v0.4.0 - September 3, 2025

Firt release.
First release.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
<p align="center">
<a href="https://github.com/tanaos/synthex">
<img src="assets/banner.png" width="600px" alt="Synthex - Generate high quality, synthetic datasets">
<img src="https://raw.githubusercontent.com/tanaos/synthex/master/assets/banner.png" width="600px" alt="Synthex - Generate high quality, synthetic datasets">
</a>
</p>

Expand Down
62 changes: 18 additions & 44 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,47 +1,21 @@
aiohappyeyeballs==2.6.1
aiohttp==3.12.4
aiosignal==1.3.2
annotated-types==0.7.0
attrs==25.3.0
build==1.2.2.post1
certifi==2025.1.31
charset-normalizer==3.4.1
dill==0.3.8
dnspython==2.7.0
email_validator==2.2.0
filelock==3.18.0
frozenlist==1.6.0
fsspec==2025.3.0
hf-xet==1.1.2
huggingface-hub==0.32.2
anyio==4.10.0
certifi==2025.8.3
charset-normalizer==3.4.3
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
idna==3.10
iniconfig==2.1.0
multidict==6.4.4
multiprocess==0.70.16
numpy==2.2.6
packaging==24.2
pandas==2.2.3
platformdirs==4.3.8
pluggy==1.5.0
propcache==0.3.1
pyarrow==20.0.0
pydantic==2.11.2
pydantic-settings==2.9.1
pydantic_core==2.33.1
pyproject_hooks==1.2.0
pytest==8.3.5
pytest-mock==3.14.1
python-dateutil==2.9.0.post0
python-dotenv==1.1.0
pytz==2025.2
PyYAML==6.0.2
requests==2.32.3
responses==0.25.7
six==1.17.0
tqdm==4.67.1
typing-inspection==0.4.0
typing_extensions==4.13.1
tzdata==2025.2
urllib3==2.3.0
xxhash==3.5.0
yarl==1.20.0
pydantic==2.11.7
pydantic-core==2.33.2
pydantic-settings==2.10.1
python-dotenv==1.1.1
pyyaml==6.0.2
requests==2.32.4
responses==0.25.8
sniffio==1.3.1
-e file:///home/riccardo/Desktop/tanaos/synthex
typing-extensions==4.14.1
typing-inspection==0.4.1
urllib3==2.5.0
44 changes: 19 additions & 25 deletions synthex/jobs_api.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from .api_client import APIClient
from typing import Any, List, Optional
from typing import Any, List
import csv
import os
import time
import threading
import logging

from .models import ListJobsResponseModel, JobOutputType, JobOutputSchemaDefinition, ActionResult, \
from .models import ListJobsResponseModel, JobOutputType, JobOutputSchemaDefinition, GenerateDataResponse, \
JobStatusResponseModel, SuccessResponse
from .endpoints import LIST_JOBS_ENDPOINT, CREATE_JOB_WITH_SAMPLES_ENDPOINT
from .decorators import auto_validate_methods
Expand All @@ -16,7 +17,6 @@

logger = logging.getLogger(__name__)


@auto_validate_methods
class JobsAPI:
"""
Expand All @@ -25,8 +25,6 @@ class JobsAPI:

def __init__(self, client: APIClient):
self._client: APIClient = client
# TODO: add support for multiple concurrent jobs; this should require launching each job in a separate thread.
self._current_job_id: Optional[str] = None

def list(self, limit: int = 10, offset: int = 0) -> ListJobsResponseModel:
"""
Expand Down Expand Up @@ -107,9 +105,6 @@ def _create_job(

response = self._client.post(f"{CREATE_JOB_WITH_SAMPLES_ENDPOINT}", data=data)

# Store id of the current job.
self._current_job_id = response.data

if response.data is None:
raise ValidationError("Response data is None, expected a valid job ID.")
return response.data
Expand Down Expand Up @@ -144,7 +139,6 @@ def _get_data_and_write_to_file(self, job_id: str, output_path: str, output_type
# Keep fetching data as long as the status code is 206, meaning that there is more data;
# When the status code changes to 200, that means there is no more data to fetch.
keep_polling = True
# TODO: a timeout is needed here: if a job has not received any data within x seconds, error out
while keep_polling:
# Poll every JOB_DATA_POLLING_INTERVAL seconds
time.sleep(config.JOB_DATA_POLLING_INTERVAL)
Expand All @@ -167,7 +161,7 @@ def _get_data_and_write_to_file(self, job_id: str, output_path: str, output_type
writer.writeheader()
# Append each dict as a row.
writer.writerows(data)

logger.info(f"{len(data)} datapoints written for job {job_id}")

logger.info(f"All datapoints for job {job_id} have been written")
Expand All @@ -182,7 +176,7 @@ def generate_data(
output_path: str,
number_of_samples: int,
output_type: JobOutputType = "csv",
) -> ActionResult:
) -> GenerateDataResponse:
"""
Generates data based on the provided schema definition, examples, and requirements.
Args:
Expand All @@ -197,7 +191,7 @@ def generate_data(
- "csv": Saves the data to a CSV file.
output_path (str): The file path where the generated data should be saved.
Returns:
ActionResult: An object indicating that the job was started successfully.
GenerateDataResponse: An object indicating that the job was started successfully.
"""

# Sanitize the output path
Expand All @@ -214,25 +208,25 @@ def generate_data(
# Create the output directory if it doesn't exist
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Fetch data and write to file
self._get_data_and_write_to_file(job_id, output_path, output_type)
# Start fetching data in a separate thread
th = threading.Thread(target=self._get_data_and_write_to_file, args=(job_id, output_path, output_type))
th.start()

return ActionResult(
return GenerateDataResponse(
success=True,
message=f"Job started successfully. Output will be saved to '{output_path}' upon completion.",
message=f"Job was started successfully. Output will be saved to '{output_path}'.",
job_id=job_id
)

def status(self) -> JobStatusResponseModel:
def status(self, job_id: str) -> JobStatusResponseModel:
"""
Check the status of the job that is currently running.
Check the status of a job.
Args:
job_id (str): The ID of the job to check the status of.
Returns:
JobStatusResponseModel: A model containing the status of the job.
"""

# No job has been started yet.
if not self._current_job_id:
raise ValidationError("No job is currently running.")

response = self._client.get(GET_JOB_STATUS_ENDPOINT(self._current_job_id))


response = self._client.get(GET_JOB_STATUS_ENDPOINT(job_id))

return JobStatusResponseModel.model_validate(response.data)
5 changes: 3 additions & 2 deletions synthex/models/responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class SuccessResponse(BaseModel, Generic[T]):
data: Optional[T] = None


class ActionResult(BaseModel):
class GenerateDataResponse(BaseModel):
success: bool
message: Optional[str] = None
message: str
job_id: str
4 changes: 1 addition & 3 deletions tests/unit/jobs/test_create_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,7 @@ def test_create_job_success(synthex: Synthex, generate_data_params: dict[Any, An
number_of_samples=generate_data_params["number_of_samples"],
)

assert job_id == "abc123"
assert synthex.jobs._current_job_id == "abc123" # type: ignore

assert job_id == "abc123"

@pytest.mark.unit
@responses.activate
Expand Down
11 changes: 4 additions & 7 deletions tests/unit/jobs/test_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,27 +39,24 @@ def test_status_success(
status_code=200
)

# Artificially set the current job ID to "job_id" for testing purposes.
short_lived_synthex.jobs._current_job_id = job_id # type: ignore

status = short_lived_synthex.jobs.status()
status = short_lived_synthex.jobs.status(job_id)
assert isinstance(status, JobStatusResponseModel), "Status should be of type JobStatusResponseModel"
assert status.status == test_status, f"Status should be {test_status}"
assert status.progress == test_progress, f"Progress should be {test_progress}"


@responses.activate
@pytest.mark.unit
def test_status_no_running_job_failure(
def test_status_no_job_id_failure(
short_lived_synthex: Synthex,
):
"""
This test verifies that a `ValidationError` is raised when attempting to
check the status of jobs through `JobsAPI.status` while no job is currently running. If the exception
check the status of jobs through `JobsAPI.status` without passing a job ID. If the exception
is not raised, the test will fail with an appropriate error message.
Args:
synthex (Synthex): An instance of the `Synthex` class to be tested.
"""

with pytest.raises(ValidationError):
short_lived_synthex.jobs.status()
short_lived_synthex.jobs.status() # type: ignore