import boto3
import pandas as pd
import pyarrow as pa
- import pyarrow.lib
import pyarrow.parquet

from awswrangler import _data_types, _utils, exceptions
@@ -32,9 +31,21 @@
_logger: logging.Logger = logging.getLogger(__name__)


+ def _pyarrow_parquet_file_wrapper(
+     source: Any, read_dictionary: Optional[List[str]] = None
+ ) -> Optional[pyarrow.parquet.ParquetFile]:
+     try:
+         return pyarrow.parquet.ParquetFile(source=source, read_dictionary=read_dictionary)
+     except pyarrow.ArrowInvalid as ex:
+         if str(ex) == "Parquet file size is 0 bytes":
+             _logger.warning("Ignoring empty file...")
+             return None
+         raise
+
+
def _read_parquet_metadata_file(
    path: str, boto3_session: boto3.Session, s3_additional_kwargs: Optional[Dict[str, str]], use_threads: bool
- ) -> Dict[str, str]:
+ ) -> Optional[Dict[str, str]]:
    with open_s3_object(
        path=path,
        mode="rb",
@@ -43,7 +54,9 @@ def _read_parquet_metadata_file(
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=boto3_session,
    ) as f:
-         pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f)
+         pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(source=f)
+         if pq_file is None:
+             return None
        return _data_types.athena_types_from_pyarrow_schema(schema=pq_file.schema.to_arrow_schema(), partitions=None)[0]

@@ -55,7 +68,7 @@ def _read_schemas_from_files(
    s3_additional_kwargs: Optional[Dict[str, str]],
) -> Tuple[Dict[str, str], ...]:
    paths = _utils.list_sampling(lst=paths, sampling=sampling)
-     schemas: Tuple[Dict[str, str], ...] = tuple()
+     schemas: Tuple[Optional[Dict[str, str]], ...] = tuple()
    n_paths: int = len(paths)
    if use_threads is False or n_paths == 1:
        schemas = tuple(
@@ -76,6 +89,7 @@ def _read_schemas_from_files(
                    itertools.repeat(use_threads),
                )
            )
+     schemas = cast(Tuple[Dict[str, str], ...], tuple(x for x in schemas if x is not None))
    _logger.debug("schemas: %s", schemas)
    return schemas

@@ -125,6 +139,7 @@ def _read_parquet_metadata(
    path: Union[str, List[str]],
    path_suffix: Optional[str],
    path_ignore_suffix: Optional[str],
+     ignore_empty: bool,
    dtype: Optional[Dict[str, str]],
    sampling: float,
    dataset: bool,
@@ -139,6 +154,7 @@ def _read_parquet_metadata(
        boto3_session=boto3_session,
        suffix=path_suffix,
        ignore_suffix=_get_path_ignore_suffix(path_ignore_suffix=path_ignore_suffix),
+         ignore_empty=ignore_empty,
    )

    # Files
@@ -279,7 +295,11 @@ def _read_parquet_chunked(
            s3_additional_kwargs=s3_additional_kwargs,
            boto3_session=boto3_session,
        ) as f:
-             pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories)
+             pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
+                 source=f, read_dictionary=categories
+             )
+             if pq_file is None:
+                 continue
            schema: Dict[str, str] = _data_types.athena_types_from_pyarrow_schema(
                schema=pq_file.schema.to_arrow_schema(), partitions=None
            )[0]
@@ -342,7 +362,11 @@ def _read_parquet_file(
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=boto3_session,
    ) as f:
-         pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories)
+         pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
+             source=f, read_dictionary=categories
+         )
+         if pq_file is None:
+             raise exceptions.InvalidFile(f"Invalid Parquet file: {path}")
        return pq_file.read(columns=columns, use_threads=False, use_pandas_metadata=False)

@@ -362,7 +386,11 @@ def _count_row_groups(
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=boto3_session,
    ) as f:
-         pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories)
+         pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
+             source=f, read_dictionary=categories
+         )
+         if pq_file is None:
+             return 0
        n: int = cast(int, pq_file.num_row_groups)
        _logger.debug("Row groups count: %d", n)
        return n
@@ -401,6 +429,7 @@ def read_parquet(
    path: Union[str, List[str]],
    path_suffix: Union[str, List[str], None] = None,
    path_ignore_suffix: Union[str, List[str], None] = None,
+     ignore_empty: bool = True,
    partition_filter: Optional[Callable[[Dict[str, str]], bool]] = None,
    columns: Optional[List[str]] = None,
    validate_schema: bool = False,
@@ -453,9 +482,13 @@ def read_parquet(
        S3 prefix (accepts Unix shell-style wildcards)
        (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
    path_suffix: Union[str, List[str], None]
-         Suffix or List of suffixes for filtering S3 keys.
+         Suffix or List of suffixes to be read (e.g. [".gz.parquet", ".snappy.parquet"]).
+         If None, all files will be read. (default)
    path_ignore_suffix: Union[str, List[str], None]
-         Suffix or List of suffixes for S3 keys to be ignored.
+         Suffix or List of suffixes for S3 keys to be ignored (e.g. [".csv", "_SUCCESS"]).
+         If None, all files will be read. (default)
+     ignore_empty: bool
+         Ignore files with 0 bytes.
    partition_filter: Optional[Callable[[Dict[str, str]], bool]]
        Callback Function filters to apply on PARTITION columns (PUSH-DOWN filter).
        This function MUST receive a single argument (Dict[str, str]) where keys are partitions
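For illustration, a minimal usage sketch of the new flag on the public API (bucket and prefix below are hypothetical, and the usual `import awswrangler as wr` entry point is assumed):

    import awswrangler as wr

    # Read a Parquet dataset under a prefix, skipping 0-byte objects
    # (e.g. empty placeholder files) instead of failing on them.
    df = wr.s3.read_parquet(
        path="s3://bucket/prefix/",
        path_ignore_suffix="_SUCCESS",
        ignore_empty=True,
    )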
@@ -543,6 +576,7 @@ def read_parquet(
        ignore_suffix=_get_path_ignore_suffix(path_ignore_suffix=path_ignore_suffix),
        last_modified_begin=last_modified_begin,
        last_modified_end=last_modified_end,
+         ignore_empty=ignore_empty,
    )
    path_root: Optional[str] = _get_path_root(path=path, dataset=dataset)
    if path_root is not None:
@@ -727,6 +761,7 @@ def read_parquet_metadata(
    path: Union[str, List[str]],
    path_suffix: Optional[str] = None,
    path_ignore_suffix: Optional[str] = None,
+     ignore_empty: bool = True,
    dtype: Optional[Dict[str, str]] = None,
    sampling: float = 1.0,
    dataset: bool = False,
@@ -754,9 +789,13 @@ def read_parquet_metadata(
        S3 prefix (accepts Unix shell-style wildcards)
        (e.g. s3://bucket/prefix) or list of S3 objects paths (e.g. [s3://bucket/key0, s3://bucket/key1]).
    path_suffix: Union[str, List[str], None]
-         Suffix or List of suffixes for filtering S3 keys.
+         Suffix or List of suffixes to be read (e.g. [".gz.parquet", ".snappy.parquet"]).
+         If None, all files will be read. (default)
    path_ignore_suffix: Union[str, List[str], None]
-         Suffix or List of suffixes for S3 keys to be ignored.
+         Suffix or List of suffixes for S3 keys to be ignored (e.g. [".csv", "_SUCCESS"]).
+         If None, all files will be read. (default)
+     ignore_empty: bool
+         Ignore files with 0 bytes.
    dtype : Dict[str, str], optional
        Dictionary of column names and Athena/Glue types to be cast.
        Useful when you have columns with undetermined data types as partition columns.
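Similarly, a short sketch for the metadata call (again with a hypothetical prefix; the two-element unpacking assumes the usual column-types/partition-types return of this function):

    import awswrangler as wr

    # Collect Athena/Glue types for the dataset, ignoring 0-byte objects.
    columns_types, partitions_types = wr.s3.read_parquet_metadata(
        path="s3://bucket/prefix/",
        dataset=True,
        ignore_empty=True,
    )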
@@ -804,6 +843,7 @@ def read_parquet_metadata(
        path=path,
        path_suffix=path_suffix,
        path_ignore_suffix=path_ignore_suffix,
+         ignore_empty=ignore_empty,
        dtype=dtype,
        sampling=sampling,
        dataset=dataset,