Skip to content

Commit d8e2dfb

Browse files
authored
Merge pull request #867 from sillsdev/collect_verses
Add small and missing file reporting
2 parents aa98c59 + 8512550 commit d8e2dfb

File tree

1 file changed: 31 additions and 6 deletions

1 file changed

+31
-6
lines changed

silnlp/common/collect_verse_counts.py

Lines changed: 31 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -82,9 +82,14 @@ def collect_verse_counts(
8282
output_path = output_folder if isinstance(output_folder, Path) else Path(output_folder)
8383

8484
extract_files = set()
85+
missing_files = []
8586
for file_pattern in file_patterns.split(";"):
8687
file_pattern = file_pattern.strip()
87-
extract_files.update(input_path.glob(file_pattern))
88+
matched = list(input_path.glob(file_pattern))
89+
if not matched:
90+
missing_files.append(input_path / file_pattern)
91+
extract_files.update(matched)
92+
8893
project_names = [f.stem for f in extract_files]
8994
projects_to_process = project_names
9095

@@ -117,13 +122,18 @@ def collect_verse_counts(
117122

118123
# Get counts for unprocessed files
119124
complete_verse_counts = get_complete_verse_counts()
125+
small_files = []
120126
partially_complete_projects = []
121-
for extract_file_name in tqdm(extract_files):
127+
for extract_file_name in tqdm(sorted(extract_files)):
122128
project_name = extract_file_name.stem
129+
# LOGGER.info(f"Processing {project_name}")
130+
if extract_file_name.stat().st_size < 41_000:
131+
small_files.append(extract_file_name)
132+
LOGGER.info(f"Small file found for {extract_file_name}")
133+
continue
123134
if project_name not in projects_to_process:
124-
LOGGER.info(f"Found verse counts for {project_name}")
135+
LOGGER.info(f"Found verse counts for {extract_file_name}")
125136
continue
126-
LOGGER.info(f"Processing {project_name}")
127137

128138
verse_counts = defaultdict(list)
129139
with (
@@ -236,13 +246,27 @@ def collect_verse_counts(
236246
partial_books_out_path.unlink(missing_ok=True)
237247
else:
238248
df.to_csv(partial_books_out_path)
249+
print("\n")
250+
if small_files:
251+
LOGGER.warning(f"Skipped {len(small_files)} files smaller than 41KB:")
252+
for f in small_files:
253+
LOGGER.warning(f" {f.name}")
254+
else:
255+
LOGGER.info("No files smaller than 41KB were found.")
256+
if missing_files:
257+
LOGGER.warning(f"Skipped {len(missing_files)} missing files:")
258+
for f in missing_files:
259+
LOGGER.warning(f" {f.name}")
260+
else:
261+
LOGGER.info("All files were found.")
239262

240263

241264
def main() -> None:
242265
parser = argparse.ArgumentParser(description="Collect various counts from a corpus of Bible extracts")
243266
parser.add_argument(
244267
"folder",
245-
help="An experiment folder (typically in MT/experiments) that contains a config.yml file. The results will be saved in this folder.",
268+
help="An experiment folder (typically in MT/experiments) that contains a config.yml file."
269+
" The results will be saved in this folder.",
246270
)
247271
parser.add_argument(
248272
"--input-folder", default=SIL_NLP_ENV.mt_scripture_dir, help="Folder with corpus of Bible extract files."
@@ -273,7 +297,8 @@ def main() -> None:
273297
if file_patterns == "":
274298
if not folder.exists():
275299
LOGGER.error(
276-
f"Folder {folder} does not exist. Please provide an experiment folder, typically in MT/experiments, containing a config.yml file or a list of files with the --files argument."
300+
f"Folder {folder} does not exist. Please provide an experiment folder, typically in MT/experiments,"
301+
" containing a config.yml file or a list of files with the --files argument."
277302
)
278303
return
279304
else:

0 commit comments

Comments
 (0)