diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index 695bf310562..5581d04f57b 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -5282,6 +5282,7 @@ def to_sql(
         name: str,
         con: Union[str, "sqlalchemy.engine.Connection", "sqlalchemy.engine.Engine", "sqlite3.Connection"],
         batch_size: Optional[int] = None,
+        num_proc: Optional[int] = None,
         **sql_writer_kwargs,
     ) -> int:
         """Exports the dataset to a SQL database.
@@ -5294,6 +5295,11 @@ def to_sql(
             batch_size (`int`, *optional*):
                 Size of the batch to load in memory and write at once.
                 Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
+            num_proc (`int`, *optional*):
+                Number of processes for multiprocessing. By default, it doesn't
+                use multiprocessing. In that case, `batch_size` defaults to
+                `datasets.config.DEFAULT_MAX_BATCH_SIZE`, but feel free to make it 5x or 10x the
+                default value if you have sufficient compute power.
             **sql_writer_kwargs (additional keyword arguments):
                 Parameters to pass to pandas's [`pandas.DataFrame.to_sql`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html).

@@ -5324,7 +5330,7 @@ def to_sql(
         # Dynamic import to avoid circular dependency
         from .io.sql import SqlDatasetWriter

-        return SqlDatasetWriter(self, name, con, batch_size=batch_size, **sql_writer_kwargs).write()
+        return SqlDatasetWriter(self, name, con, batch_size=batch_size, num_proc=num_proc, **sql_writer_kwargs).write()

     def _estimate_nbytes(self) -> int:
         dataset_nbytes = self.data.nbytes
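
A minimal usage sketch of the new parameter, for context. The dataset contents, table name, database URI, and the chosen batch_size/num_proc values below are illustrative assumptions, not part of the patch; only the to_sql signature comes from the change above.

    # Illustrative example only: names and values are assumptions.
    from datasets import Dataset

    # A small synthetic dataset standing in for real data.
    ds = Dataset.from_dict({"id": list(range(10_000)), "text": ["example"] * 10_000})

    # Export to a SQL table. A URI string for `con` (here a hypothetical SQLite
    # database, which requires SQLAlchemy) stays picklable, so it works with
    # multiprocessing. With num_proc=4 the batches are written by 4 worker
    # processes; omitting num_proc keeps the previous single-process behavior.
    num_rows_written = ds.to_sql(
        "my_table",
        "sqlite:///my_database.db",
        batch_size=2048,
        num_proc=4,
    )
    print(num_rows_written)  # to_sql returns the number of records written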