From 7f77f5642ae362b6cb14f904e3e40550bbebb0b9 Mon Sep 17 00:00:00 2001 From: Ryo Kitagawa Date: Fri, 14 Mar 2025 11:56:22 +0900 Subject: [PATCH] fix: add lines parameter to JsonFileProcessor --- gokart/file_processor.py | 14 +++++++++----- test/test_file_processor.py | 16 +++++++++------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/gokart/file_processor.py b/gokart/file_processor.py index b7b7d2d7..735d9c53 100644 --- a/gokart/file_processor.py +++ b/gokart/file_processor.py @@ -3,7 +3,7 @@ from abc import abstractmethod from io import BytesIO from logging import getLogger -from typing import Optional +from typing import Literal, Optional import dill import luigi @@ -156,16 +156,20 @@ def dump(self, obj, file): file.write(str(obj).encode()) +JsonOrient = Literal['split', 'records', 'index', 'table', 'columns', 'values'] + + class JsonFileProcessor(FileProcessor): - def __init__(self, orient: Optional[str] = None): + def __init__(self, orient: Optional[JsonOrient] = None, lines: bool = False): self._orient = orient + self._lines = lines def format(self): return luigi.format.Nop def load(self, file): try: - return pd.read_json(file, orient=self._orient, lines=True if self._orient == 'records' else False) + return pd.read_json(file, orient=self._orient, lines=self._lines) except pd.errors.EmptyDataError: return pd.DataFrame() @@ -175,7 +179,7 @@ def dump(self, obj, file): ) if isinstance(obj, dict): obj = pd.DataFrame.from_dict(obj) - obj.to_json(file, orient=self._orient, lines=True if self._orient == 'records' else False) + obj.to_json(file, orient=self._orient, lines=self._lines) class XmlFileProcessor(FileProcessor): @@ -289,7 +293,7 @@ def make_file_processor(file_path: str, store_index_in_feather: bool) -> FilePro '.pkl': PickleFileProcessor(), '.gz': GzipFileProcessor(), '.json': JsonFileProcessor(), - '.ndjson': JsonFileProcessor(orient='records'), + '.ndjson': JsonFileProcessor(orient='records', lines=True), '.xml': XmlFileProcessor(), '.npz': NpzFileProcessor(), '.parquet': ParquetFileProcessor(compression='gzip'), diff --git a/test/test_file_processor.py b/test/test_file_processor.py index 38545a3f..d35935bf 100644 --- a/test/test_file_processor.py +++ b/test/test_file_processor.py @@ -85,28 +85,30 @@ def test_load_csv_with_cp932(self): class TestJsonFileProcessor: @pytest.mark.parametrize( - 'orient,input_data,expected_json', + 'orient,lines,input_data,expected_json', [ pytest.param( None, + False, pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}), '{"A":{"0":1,"1":2,"2":3},"B":{"0":4,"1":5,"2":6}}', id='With Default Orient for DataFrame', ), pytest.param( 'records', + True, pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}), '{"A":1,"B":4}\n{"A":2,"B":5}\n{"A":3,"B":6}\n', id='With Records Orient for DataFrame', ), - pytest.param(None, {'A': [1, 2, 3], 'B': [4, 5, 6]}, '{"A":{"0":1,"1":2,"2":3},"B":{"0":4,"1":5,"2":6}}', id='With Default Orient for Dict'), - pytest.param('records', {'A': [1, 2, 3], 'B': [4, 5, 6]}, '{"A":1,"B":4}\n{"A":2,"B":5}\n{"A":3,"B":6}\n', id='With Records Orient for Dict'), - pytest.param(None, {}, '{}', id='With Default Orient for Empty Dict'), - pytest.param('records', {}, '\n', id='With Records Orient for Empty Dict'), + pytest.param(None, False, {'A': [1, 2, 3], 'B': [4, 5, 6]}, '{"A":{"0":1,"1":2,"2":3},"B":{"0":4,"1":5,"2":6}}', id='With Default Orient for Dict'), + pytest.param('records', True, {'A': [1, 2, 3], 'B': [4, 5, 6]}, '{"A":1,"B":4}\n{"A":2,"B":5}\n{"A":3,"B":6}\n', id='With Records Orient for Dict'), + pytest.param(None, False, {}, '{}', id='With Default Orient for Empty Dict'), + pytest.param('records', True, {}, '\n', id='With Records Orient for Empty Dict'), ], ) - def test_dump_and_load_json(self, orient, input_data, expected_json): - processor = JsonFileProcessor(orient=orient) + def test_dump_and_load_json(self, orient, lines, input_data, expected_json): + processor = JsonFileProcessor(orient=orient, lines=lines) with tempfile.TemporaryDirectory() as temp_dir: temp_path = f'{temp_dir}/temp.json'