10
10
# TODO update to C impl when fixed: https://github.com/Marco-Sulla/python-frozendict/issues/26
11
11
from frozendict .core import frozendict
12
12
from pathlib import Path
13
- from typing import TextIO , Optional as O , Union , Any
13
+ from typing import Optional as O , Union , Any
14
14
15
15
from staging_service .import_specifications .file_parser import (
16
16
PRIMITIVE_TYPE ,
35
35
_HEADER_REGEX = re .compile (f"{ _DATA_TYPE } (\\ w+){ _HEADER_SEP } "
36
36
+ f"{ _COLUMN_STR } (\\ d+){ _HEADER_SEP } { _VERSION_STR } (\\ d+)" )
37
37
38
- _MAGIC_TEXT_FILES = {"text/plain" , "inode/x-empty" }
38
+ _MAGIC_TEXT_FILES = {"text/plain" , "inode/x-empty" , "application/csv" , "text/csv" }
39
39
40
40
41
41
class _ParseException (Exception ):
@@ -63,26 +63,18 @@ def _parse_header(header: str, spec_source: SpecificationSource, maximum_version
63
63
return match [1 ], int (match [2 ])
64
64
65
65
66
- def _required_next (
67
- input_ : Union [TextIO , Any ], # Any really means a csv reader object
68
- spec_source : SpecificationSource ,
69
- error : str
70
- ) -> Union [str , list [str ]]:
71
- # returns a string for a TextIO input or a list for a Reader input
72
- try :
73
- return next (input_ )
74
- except StopIteration :
75
- raise _ParseException (Error (ErrorType .PARSE_FAIL , error , spec_source ))
76
-
77
66
def _csv_next (
78
- input_ : Union [ TextIO , Any ] , # Any really means a csv reader object
67
+ input_ : Any , # Any really means a csv reader object
79
68
line_number : int ,
80
- expected_line_count : int ,
69
+ expected_line_count : Union [ None , int ], # None = skip columns check
81
70
spec_source : SpecificationSource ,
82
71
error : str
83
72
) -> list [str ]:
84
- line = _required_next (input_ , spec_source , error )
85
- if len (line ) != expected_line_count :
73
+ try :
74
+ line = next (input_ )
75
+ except StopIteration :
76
+ raise _ParseException (Error (ErrorType .PARSE_FAIL , error , spec_source ))
77
+ if expected_line_count and len (line ) != expected_line_count :
86
78
raise _ParseException (Error (
87
79
ErrorType .INCORRECT_COLUMN_COUNT ,
88
80
f"Incorrect number of items in line { line_number } , "
@@ -91,15 +83,6 @@ def _csv_next(
91
83
return line
92
84
93
85
94
- def _get_datatype (input_ : TextIO , spec_source : SpecificationSource , maximum_version : int
95
- ) -> tuple [str , int ]:
96
- # return is (data type, column count)
97
- return _parse_header (
98
- _required_next (input_ , spec_source , "Missing data type / version header" ).strip (),
99
- spec_source ,
100
- maximum_version )
101
-
102
-
103
86
def _error(error: Error) -> ParseResults:
    """Wrap a single error in a ParseResults whose ``errors`` is a 1-tuple."""
    return ParseResults(errors=(error,))
105
88
@@ -155,11 +138,13 @@ def _normalize_headers(
155
138
def _parse_xsv (path : Path , sep : str ) -> ParseResults :
156
139
spcsrc = SpecificationSource (path )
157
140
try :
158
- if magic .from_file (str (path ), mime = True ) not in _MAGIC_TEXT_FILES :
159
- return _error (Error (ErrorType .PARSE_FAIL , "Not a text file" , spcsrc ))
141
+ filetype = magic .from_file (str (path ), mime = True )
142
+ if filetype not in _MAGIC_TEXT_FILES :
143
+ return _error (Error (ErrorType .PARSE_FAIL , "Not a text file: " + filetype , spcsrc ))
160
144
with open (path , newline = '' ) as input_ :
161
- datatype , columns = _get_datatype (input_ , spcsrc , _VERSION )
162
145
rdr = csv .reader (input_ , delimiter = sep ) # let parser handle quoting
146
+ dthd = _csv_next (rdr , 1 , None , spcsrc , "Missing data type / version header" )
147
+ datatype , columns = _parse_header (dthd [0 ], spcsrc , _VERSION )
163
148
hd1 = _csv_next (rdr , 2 , columns , spcsrc , "Missing 2nd header line" )
164
149
param_ids = _normalize_headers (hd1 , 2 , spcsrc )
165
150
_csv_next (rdr , 3 , columns , spcsrc , "Missing 3rd header line" )
0 commit comments