Skip to content

Commit cc28b1d

Browse files
authored
Hotfix/arff (#1388)
* Allow skipping the parquet download through an environment variable
* Allow skipping the parquet file, and fix a bug when no parquet file is returned
* Declare the environment variable name in config.py
1 parent a4fb848 commit cc28b1d

File tree

3 files changed

+15
-6
lines changed

3 files changed

+15
-6
lines changed

openml/config.py

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
file_handler: logging.handlers.RotatingFileHandler | None = None
2424

2525
OPENML_CACHE_DIR_ENV_VAR = "OPENML_CACHE_DIR"
26+
OPENML_SKIP_PARQUET_ENV_VAR = "OPENML_SKIP_PARQUET"
2627

2728

2829
class _Config(TypedDict):

openml/datasets/dataset.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
import gzip
55
import logging
6+
import os
67
import pickle
78
import re
89
import warnings
@@ -17,6 +18,7 @@
1718
import xmltodict
1819

1920
from openml.base import OpenMLBase
21+
from openml.config import OPENML_SKIP_PARQUET_ENV_VAR
2022
from openml.exceptions import PyOpenMLError
2123

2224
from .data_feature import OpenMLDataFeature
@@ -358,8 +360,10 @@ def _download_data(self) -> None:
358360
# import required here to avoid circular import.
359361
from .functions import _get_dataset_arff, _get_dataset_parquet
360362

361-
if self._parquet_url is not None:
362-
self.parquet_file = str(_get_dataset_parquet(self))
363+
skip_parquet = os.environ.get(OPENML_SKIP_PARQUET_ENV_VAR, "false").casefold() == "true"
364+
if self._parquet_url is not None and not skip_parquet:
365+
parquet_file = _get_dataset_parquet(self)
366+
self.parquet_file = None if parquet_file is None else str(parquet_file)
363367
if self.parquet_file is None:
364368
self.data_file = str(_get_dataset_arff(self))
365369

openml/datasets/functions.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from __future__ import annotations
44

55
import logging
6+
import os
67
import warnings
78
from collections import OrderedDict
89
from pathlib import Path
@@ -20,6 +21,7 @@
2021

2122
import openml._api_calls
2223
import openml.utils
24+
from openml.config import OPENML_SKIP_PARQUET_ENV_VAR
2325
from openml.exceptions import (
2426
OpenMLHashException,
2527
OpenMLPrivateDatasetError,
@@ -560,20 +562,22 @@ def get_dataset( # noqa: C901, PLR0912
560562
if download_qualities:
561563
qualities_file = _get_dataset_qualities_file(did_cache_dir, dataset_id)
562564

563-
if "oml:parquet_url" in description and download_data:
565+
parquet_file = None
566+
skip_parquet = os.environ.get(OPENML_SKIP_PARQUET_ENV_VAR, "false").casefold() == "true"
567+
download_parquet = "oml:parquet_url" in description and not skip_parquet
568+
if download_parquet and (download_data or download_all_files):
564569
try:
565570
parquet_file = _get_dataset_parquet(
566571
description,
567572
download_all_files=download_all_files,
568573
)
569574
except urllib3.exceptions.MaxRetryError:
570575
parquet_file = None
571-
else:
572-
parquet_file = None
573576

574577
arff_file = None
575578
if parquet_file is None and download_data:
576-
logger.warning("Failed to download parquet, fallback on ARFF.")
579+
if download_parquet:
580+
logger.warning("Failed to download parquet, fallback on ARFF.")
577581
arff_file = _get_dataset_arff(description)
578582

579583
remove_dataset_cache = False

0 commit comments

Comments
 (0)