Skip to content
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion silnlp/common/check_books.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ def parse_book(project_dir: str, book: str):

settings = FileParatextProjectSettingsParser(project_dir).parse()
book_path = Path(project_dir) / settings.get_book_file_name(book)

LOGGER.info(f"Attempting to parse {book} from {book_path}.")

if not book_path.is_file():
raise RuntimeError(f"Can't find file {book_path} for book {book}")

Expand Down
116 changes: 116 additions & 0 deletions silnlp/common/combine_scores_save.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import argparse
import csv
import sys
from collections import defaultdict
from pathlib import Path

import pandas as pd

from ..common.environment import SIL_NLP_ENV


def check_for_lock_file(folder: Path, filename: str, file_type: str) -> None:
    """Exit with a message if an office lock file exists for the intended output file.

    Args:
        folder: Directory that will contain the output file.
        filename: Output filename without its extension.
        file_type: Extension, with or without a leading dot ("csv" or "xlsx").
    """
    # Normalize ".csv" -> "csv" so callers may pass either form.
    if file_type[0] == ".":
        file_type = file_type[1:]

    if file_type.lower() == "csv":
        # LibreOffice lock-file naming convention.
        lockfile = folder / f".~lock.{filename}.{file_type}#"
    elif file_type.lower() == "xlsx":
        # Microsoft Excel lock-file naming convention.
        lockfile = folder / f"~${filename}.{file_type}"
    else:
        # Unknown type: nothing to check. (Previously `lockfile` was left
        # unbound here, raising NameError below.)
        return

    if lockfile.is_file():
        print(f"Found lock file: {lockfile}")
        print(f"Please close {filename}.{file_type} in folder {folder} OR delete the lock file and try again.")
        sys.exit()


def aggregate_csv(folder_path):
    """Collect rows from all scores-*.csv files under folder_path, grouped by header.

    Expects files laid out as <...>/<series>/<experiment>/scores-<steps>.csv.
    Returns a dict mapping each distinct header tuple to a list of rows: the
    first row is the augmented header ("Series", "Experiment", "Steps" + the
    original columns) and each data row is prefixed with the series,
    experiment, and steps extracted from the file's path and name.
    """
    # Rows grouped by header type, so differently-shaped score files
    # don't get mixed into one table.
    data_by_header = defaultdict(list)

    # Iterate over all matching CSV files in the folder and its subfolders.
    for csv_file in folder_path.rglob("*/scores-*.csv"):
        series = csv_file.parts[-3]  # series folder name
        experiment = csv_file.parts[-2]  # experiment folder name
        steps = csv_file.stem.split("-")[-1]  # steps from the file name

        with open(csv_file, "r") as f:
            rows = list(csv.reader(f))

        # Skip empty files: rows[0] below would raise IndexError.
        if not rows:
            continue

        header = tuple(rows[0])  # tuple so the header can be a dict key

        # Emit the augmented header once per header type.
        if header not in data_by_header:
            data_by_header[header].append(["Series", "Experiment", "Steps"] + list(header))
        for row in rows[1:]:
            data_by_header[header].append([series, experiment, steps] + row)

    return data_by_header


def write_to_csv(data_by_header, folder, output_filename):
    """Write every header group to <folder>/<output_filename>.csv, blank-row separated,
    with the source folder path recorded as the final line."""
    output_file = folder / f"{output_filename}.csv"
    with open(output_file, "w", newline="") as handle:
        csv_writer = csv.writer(handle)
        for table_rows in data_by_header.values():
            csv_writer.writerows(table_rows)
            # A blank row separates tables that have different headers.
            csv_writer.writerow([])
        # Record which folder these scores came from.
        csv_writer.writerow([folder])
    print(f"Wrote scores to {output_file}")


def write_to_excel(data_by_header, folder, output_filename):
    """Write each header group to its own sheet of <folder>/<output_filename>.xlsx."""
    output_file = folder / f"{output_filename}.xlsx"
    with pd.ExcelWriter(output_file) as writer:
        for i, rows in enumerate(data_by_header.values()):
            # First row of each group is the header produced by aggregate_csv.
            df = pd.DataFrame(rows[1:], columns=rows[0])
            # Convert numeric-looking columns; leave the rest as strings.
            # (df.apply(pd.to_numeric, errors="ignore") is deprecated in pandas 2.x.)
            for column in df.columns:
                try:
                    df[column] = pd.to_numeric(df[column])
                except (ValueError, TypeError):
                    pass
            # One sheet per header type, named Table_1, Table_2, ...
            df.to_excel(writer, sheet_name=f"Table_{i + 1}", index=False)
    print(f"Wrote scores to {output_file}")


def main():
    """Aggregate all scores-*.csv files under a folder into one CSV and one Excel file."""
    parser = argparse.ArgumentParser(description="Aggregate CSV files in a folder.")
    parser.add_argument("folder", type=Path, help="Path to the folder containing CSV files.")
    parser.add_argument(
        "--output_filename",
        type=str,
        default="scores",
        help="Filename suffix without the '.csv' or '.xlsx'. \
        The folder name is added as a prefix to make it easier to distinguish scores files in search results.",
    )
    args = parser.parse_args()

    folder = Path(args.folder)

    # Resolve the folder before building the output filenames so the prefix
    # reflects the folder actually used.
    if not folder.is_dir():
        folder = Path(SIL_NLP_ENV.mt_experiments_dir) / args.folder

    # Use only the final path component as the prefix; embedding the full
    # path (f"{folder}_...") would put path separators into the filename.
    csv_filename = f"{folder.name}_{args.output_filename}"
    excel_filename = f"{folder.name}_{args.output_filename}"

    # Check for lock files and ask the user to close them.
    check_for_lock_file(folder, csv_filename, "csv")
    check_for_lock_file(folder, excel_filename, "xlsx")

    data = aggregate_csv(folder)

    # Write the aggregated data to a new CSV file
    write_to_csv(data, folder, csv_filename)

    # Write the aggregated data to an Excel file
    write_to_excel(data, folder, excel_filename)


if __name__ == "__main__":
    main()
34 changes: 27 additions & 7 deletions silnlp/common/find_by_iso.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
import json
import logging
from pathlib import Path
from typing import Dict, List, Set, Tuple, Union
from typing import Dict, List, Set, Tuple
import regex as re
import sys


from .environment import SIL_NLP_ENV
from .iso_info import NLLB_ISO_SET, ALT_ISO

Expand Down Expand Up @@ -54,8 +56,6 @@ def find_related_isocodes(
for iso_code in iso_codes:
if iso_code in language_data:
lang_info = language_data[iso_code]
# logger.info(f"{iso_code}: {lang_info['Name']}, {lang_info['Country']}, {lang_info['Family']}")

iso_set.update(country_data.get(lang_info["Country"], []))
iso_set.update(family_data.get(lang_info["Family"], []))

Expand Down Expand Up @@ -85,6 +85,21 @@ def split_files_by_projects(files: List[Path], projects_dir: Path) -> Tuple[Dict
def get_equivalent_isocodes(iso_codes: List[str]) -> Set[str]:
    """Return the given ISO codes together with their known alternative forms."""
    equivalents: Set[str] = set()
    for iso_code in iso_codes:
        for candidate in (iso_code, ALT_ISO.get_alternative(iso_code)):
            # get_alternative may yield a falsy value when no alternative exists.
            if candidate:
                equivalents.add(candidate)
    return equivalents

def filter_files(files: List[Path], excluded_patterns: List[str]) -> List[Path]:
    """Return the files that look like usable scripture corpora.

    Keeps files whose stem has the form "<iso>-<name>" with a 2- or 3-letter
    ISO code, whose name contains no date stamp and none of the excluded
    patterns (case-insensitive), and which are not small existing files
    (< 100 KB).
    """
    # Matches _YYYY_M_D or _M_D_YYYY date stamps in file names.
    date_pattern = re.compile(r"_\d{4}_\d{1,2}_\d{1,2}|_\d{1,2}_\d{1,2}_\d{4}")
    # Lower-case the patterns once instead of once per file per pattern.
    lowered_patterns = [pattern.lower() for pattern in excluded_patterns]

    filtered = []
    for file in files:
        parts = file.stem.split("-", 1)
        if len(parts) != 2:
            continue
        iso, name = parts
        if date_pattern.search(name):
            continue
        if len(iso) not in (2, 3):
            continue
        name_lower = name.lower()
        if any(pattern in name_lower for pattern in lowered_patterns):
            continue
        # Existing files smaller than ~100 KB are unlikely to be full corpora.
        if file.is_file() and file.stat().st_size < 100_000:
            continue
        filtered.append(file)
    return filtered

def main():
parser = argparse.ArgumentParser(description="Find related ISO language codes.")
parser.add_argument("iso_codes", nargs="+", help="ISO codes to find related languages for")
Expand All @@ -97,7 +112,6 @@ def main():

# Create a custom logger
logger = logging.getLogger(__name__)
#logger.basicConfig()

# Set the global logging level
logger.setLevel(logging.INFO)
Expand Down Expand Up @@ -151,7 +165,13 @@ def main():

# Find files matching the codes
files = get_files_by_iso(all_possible_codes, scripture_dir)
existing_projects, missing_projects = split_files_by_projects(files, projects_dir)

# Filter out AI and XRI files, and others.
excluded_patterns = ['XRI', '600M', '3.3B', '1.3B', 'words', 'name', 'clean', 'transcription','matthew', 'mark', 'mrk','luk']
filtered_files = filter_files(files, excluded_patterns)
print(f"There are {len(files)} files and {len(files)-len(filtered_files)} were filtered out.")

existing_projects, missing_projects = split_files_by_projects(filtered_files, projects_dir)

# Display results
if existing_projects:
Expand All @@ -163,8 +183,8 @@ def main():
logger.info(f"\nThese {len(missing_projects)} files don't have a corresponding project folder:")
for file, _ in missing_projects.items():
logger.info(f"{file.stem}")
logger.info(f"\nAll the files:")
for file in files:
logger.info(f"\nFiltered files:")
for file in filtered_files:
logger.info(f" - {file.stem}")

if not files:
Expand Down
66 changes: 0 additions & 66 deletions silnlp/common/usfm_utils.py

This file was deleted.