Skip to content

Commit 81f238b

Browse files
authored
ENG-1594 : Job failed due to bad user input (#69)
* ENG-1594: Job failed due to bad user input. This PR adds a check to verify that uploaded files are UTF-8 encoded. * Open the file in read mode
1 parent 055a916 commit 81f238b

File tree

1 file changed

+14
-0
lines changed

1 file changed

+14
-0
lines changed

src/together/files.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,7 @@ def check_json(
259259
if not os.path.isfile(file):
260260
report_dict["file_present"] = f"File not found at given file path {file}"
261261
report_dict["is_check_passed"] = False
262+
return report_dict
262263
else:
263264
report_dict["file_present"] = "File found"
264265

@@ -272,9 +273,22 @@ def check_json(
272273
elif file_size == 0:
273274
report_dict["file_size"] = "File is empty"
274275
report_dict["is_check_passed"] = False
276+
return report_dict
275277
else:
276278
report_dict["file_size"] = f"File size {round(file_size / (2**30) ,3)} GB"
277279

280+
# Check that the file is UTF-8 encoded. If not, report where the error occurs.
281+
try:
282+
with open(file, "r", encoding="utf-8") as f:
283+
f.read()
284+
except UnicodeDecodeError as e:
285+
report_dict["utf8"] = (
286+
f"File is not UTF-8 encoded. Error raised: {e}."
287+
f"See https://docs.together.ai/docs/fine-tuning for more information."
288+
)
289+
report_dict["is_check_passed"] = False
290+
return report_dict
291+
278292
with open(file) as f:
279293
# idx must be instantiated so decode errors (e.g. file is a tar) or empty files are caught
280294
idx = -1

0 commit comments

Comments
 (0)