File tree 3 files changed +15
-6
lines changed
3 files changed +15
-6
lines changed Original file line number Diff line number Diff line change 23
23
file_handler : logging .handlers .RotatingFileHandler | None = None
24
24
25
25
OPENML_CACHE_DIR_ENV_VAR = "OPENML_CACHE_DIR"
26
+ OPENML_SKIP_PARQUET_ENV_VAR = "OPENML_SKIP_PARQUET"
26
27
27
28
28
29
class _Config (TypedDict ):
Original file line number Diff line number Diff line change 3
3
4
4
import gzip
5
5
import logging
6
+ import os
6
7
import pickle
7
8
import re
8
9
import warnings
17
18
import xmltodict
18
19
19
20
from openml .base import OpenMLBase
21
+ from openml .config import OPENML_SKIP_PARQUET_ENV_VAR
20
22
from openml .exceptions import PyOpenMLError
21
23
22
24
from .data_feature import OpenMLDataFeature
@@ -358,8 +360,10 @@ def _download_data(self) -> None:
358
360
# import required here to avoid circular import.
359
361
from .functions import _get_dataset_arff , _get_dataset_parquet
360
362
361
- if self ._parquet_url is not None :
362
- self .parquet_file = str (_get_dataset_parquet (self ))
363
+ skip_parquet = os .environ .get (OPENML_SKIP_PARQUET_ENV_VAR , "false" ).casefold () == "true"
364
+ if self ._parquet_url is not None and not skip_parquet :
365
+ parquet_file = _get_dataset_parquet (self )
366
+ self .parquet_file = None if parquet_file is None else str (parquet_file )
363
367
if self .parquet_file is None :
364
368
self .data_file = str (_get_dataset_arff (self ))
365
369
Original file line number Diff line number Diff line change 3
3
from __future__ import annotations
4
4
5
5
import logging
6
+ import os
6
7
import warnings
7
8
from collections import OrderedDict
8
9
from pathlib import Path
20
21
21
22
import openml ._api_calls
22
23
import openml .utils
24
+ from openml .config import OPENML_SKIP_PARQUET_ENV_VAR
23
25
from openml .exceptions import (
24
26
OpenMLHashException ,
25
27
OpenMLPrivateDatasetError ,
@@ -560,20 +562,22 @@ def get_dataset( # noqa: C901, PLR0912
560
562
if download_qualities :
561
563
qualities_file = _get_dataset_qualities_file (did_cache_dir , dataset_id )
562
564
563
- if "oml:parquet_url" in description and download_data :
565
+ parquet_file = None
566
+ skip_parquet = os .environ .get (OPENML_SKIP_PARQUET_ENV_VAR , "false" ).casefold () == "true"
567
+ download_parquet = "oml:parquet_url" in description and not skip_parquet
568
+ if download_parquet and (download_data or download_all_files ):
564
569
try :
565
570
parquet_file = _get_dataset_parquet (
566
571
description ,
567
572
download_all_files = download_all_files ,
568
573
)
569
574
except urllib3 .exceptions .MaxRetryError :
570
575
parquet_file = None
571
- else :
572
- parquet_file = None
573
576
574
577
arff_file = None
575
578
if parquet_file is None and download_data :
576
- logger .warning ("Failed to download parquet, fallback on ARFF." )
579
+ if download_parquet :
580
+ logger .warning ("Failed to download parquet, fallback on ARFF." )
577
581
arff_file = _get_dataset_arff (description )
578
582
579
583
remove_dataset_cache = False
You can’t perform that action at this time.
0 commit comments