Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions gokart/file_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from abc import abstractmethod
from io import BytesIO
from logging import getLogger
from typing import Optional
from typing import Literal, Optional

import dill
import luigi
Expand Down Expand Up @@ -156,16 +156,20 @@ def dump(self, obj, file):
file.write(str(obj).encode())


JsonOrient = Literal['split', 'records', 'index', 'table', 'columns', 'values']


class JsonFileProcessor(FileProcessor):
def __init__(self, orient: Optional[str] = None):
def __init__(self, orient: Optional[JsonOrient] = None, lines: bool = False):
self._orient = orient
self._lines = lines

def format(self):
return luigi.format.Nop

def load(self, file):
try:
return pd.read_json(file, orient=self._orient, lines=True if self._orient == 'records' else False)
return pd.read_json(file, orient=self._orient, lines=self._lines)
except pd.errors.EmptyDataError:
return pd.DataFrame()

Expand All @@ -175,7 +179,7 @@ def dump(self, obj, file):
)
if isinstance(obj, dict):
obj = pd.DataFrame.from_dict(obj)
obj.to_json(file, orient=self._orient, lines=True if self._orient == 'records' else False)
obj.to_json(file, orient=self._orient, lines=self._lines)


class XmlFileProcessor(FileProcessor):
Expand Down Expand Up @@ -289,7 +293,7 @@ def make_file_processor(file_path: str, store_index_in_feather: bool) -> FilePro
'.pkl': PickleFileProcessor(),
'.gz': GzipFileProcessor(),
'.json': JsonFileProcessor(),
'.ndjson': JsonFileProcessor(orient='records'),
'.ndjson': JsonFileProcessor(orient='records', lines=True),
'.xml': XmlFileProcessor(),
'.npz': NpzFileProcessor(),
'.parquet': ParquetFileProcessor(compression='gzip'),
Expand Down
16 changes: 9 additions & 7 deletions test/test_file_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,28 +85,30 @@ def test_load_csv_with_cp932(self):

class TestJsonFileProcessor:
@pytest.mark.parametrize(
'orient,input_data,expected_json',
'orient,lines,input_data,expected_json',
[
pytest.param(
None,
False,
pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}),
'{"A":{"0":1,"1":2,"2":3},"B":{"0":4,"1":5,"2":6}}',
id='With Default Orient for DataFrame',
),
pytest.param(
'records',
True,
pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}),
'{"A":1,"B":4}\n{"A":2,"B":5}\n{"A":3,"B":6}\n',
id='With Records Orient for DataFrame',
),
pytest.param(None, {'A': [1, 2, 3], 'B': [4, 5, 6]}, '{"A":{"0":1,"1":2,"2":3},"B":{"0":4,"1":5,"2":6}}', id='With Default Orient for Dict'),
pytest.param('records', {'A': [1, 2, 3], 'B': [4, 5, 6]}, '{"A":1,"B":4}\n{"A":2,"B":5}\n{"A":3,"B":6}\n', id='With Records Orient for Dict'),
pytest.param(None, {}, '{}', id='With Default Orient for Empty Dict'),
pytest.param('records', {}, '\n', id='With Records Orient for Empty Dict'),
pytest.param(None, False, {'A': [1, 2, 3], 'B': [4, 5, 6]}, '{"A":{"0":1,"1":2,"2":3},"B":{"0":4,"1":5,"2":6}}', id='With Default Orient for Dict'),
pytest.param('records', True, {'A': [1, 2, 3], 'B': [4, 5, 6]}, '{"A":1,"B":4}\n{"A":2,"B":5}\n{"A":3,"B":6}\n', id='With Records Orient for Dict'),
pytest.param(None, False, {}, '{}', id='With Default Orient for Empty Dict'),
pytest.param('records', True, {}, '\n', id='With Records Orient for Empty Dict'),
],
)
def test_dump_and_load_json(self, orient, input_data, expected_json):
processor = JsonFileProcessor(orient=orient)
def test_dump_and_load_json(self, orient, lines, input_data, expected_json):
processor = JsonFileProcessor(orient=orient, lines=lines)

with tempfile.TemporaryDirectory() as temp_dir:
temp_path = f'{temp_dir}/temp.json'
Expand Down