Skip to content

Commit 6a3dc7e

Browse files
authored
Add check for unneeded jsonl columns
2 parents 49b8c28 + 784fd4f commit 6a3dc7e

File tree

3 files changed

+26
-2
lines changed

3 files changed

+26
-2
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ build-backend = "poetry.masonry.api"
1212

1313
[tool.poetry]
1414
name = "together"
15-
version = "1.3.6"
15+
version = "1.3.7"
1616
authors = [
1717
"Together AI <support@together.ai>"
1818
]

src/together/utils/files.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,8 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
120120
raise InvalidFileFormatError(
121121
message=(
122122
f"Error parsing file. Invalid format on line {idx + 1} of the input file. "
123-
'Example of valid json: {"text": "my sample string"}. '
123+
"Datasets must follow text, conversational, or instruction format. For more "
124+
"information, see https://docs.together.ai/docs/fine-tuning-data-preparation"
124125
),
125126
line_number=idx + 1,
126127
error_source="line_type",
@@ -142,6 +143,18 @@ def _check_jsonl(file: Path) -> Dict[str, Any]:
142143
error_source="format",
143144
)
144145

146+
# Check that there are no extra columns
147+
for column in json_line:
148+
if (
149+
column
150+
not in JSONL_REQUIRED_COLUMNS_MAP[possible_format]
151+
):
152+
raise InvalidFileFormatError(
153+
message=f'Found extra column "{column}" in the line {idx + 1}.',
154+
line_number=idx + 1,
155+
error_source="format",
156+
)
157+
145158
if current_format is None:
146159
raise InvalidFileFormatError(
147160
message=(

tests/unit/test_files_checks.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,3 +279,14 @@ def test_check_jsonl_wrong_turn_type(tmp_path: Path):
279279
"Invalid format on line 1 of the input file. Expected a dictionary"
280280
in report["message"]
281281
)
282+
283+
284+
def test_check_jsonl_extra_column(tmp_path: Path):
285+
file = tmp_path / "extra_column.jsonl"
286+
content = [{"text": "Hello, world!", "extra_column": "extra"}]
287+
with file.open("w") as f:
288+
f.write("\n".join(json.dumps(item) for item in content))
289+
290+
report = check_file(file)
291+
assert not report["is_check_passed"]
292+
assert "Found extra column" in report["message"]

0 commit comments

Comments
 (0)