From a08ad6092e27ba9ce54cfb95b5b283bf140985da Mon Sep 17 00:00:00 2001 From: Amitayush Thakur Date: Fri, 14 Nov 2025 06:09:18 +0000 Subject: [PATCH 01/25] Add proof extraction functionality and example tests - Introduced ProofExtractor for extracting proofs from theorem, lemma, and example declarations. - Updated DeclInfo structure to include an optional proof field. - Added example theorems in complex.lean and simple.lean files. - Created test_user_example.lean to demonstrate dependency analysis and proof extraction. - Updated lakefile.toml to include TacticParser.Example library. --- .../tools/tactic_parser/TacticParser.lean | 1 + .../TacticParser/DependencyParser.lean | 6 +- .../TacticParser/Example/complex.lean | 25 +++ .../TacticParser/Example/simple.lean | 13 ++ .../TacticParser/ProofExtractor.lean | 151 ++++++++++++++++++ .../tactic_parser/TacticParser/Types.lean | 4 +- .../tools/tactic_parser/lakefile.toml | 3 + .../tactic_parser/test_user_example.lean | 26 +++ 8 files changed, 227 insertions(+), 2 deletions(-) create mode 100644 src/itp_interface/tools/tactic_parser/TacticParser/Example/complex.lean create mode 100644 src/itp_interface/tools/tactic_parser/TacticParser/Example/simple.lean create mode 100644 src/itp_interface/tools/tactic_parser/TacticParser/ProofExtractor.lean create mode 100644 src/itp_interface/tools/tactic_parser/test_user_example.lean diff --git a/src/itp_interface/tools/tactic_parser/TacticParser.lean b/src/itp_interface/tools/tactic_parser/TacticParser.lean index da652e7..b32baf9 100644 --- a/src/itp_interface/tools/tactic_parser/TacticParser.lean +++ b/src/itp_interface/tools/tactic_parser/TacticParser.lean @@ -2,3 +2,4 @@ import TacticParser.Base64 import TacticParser.Types import TacticParser.SyntaxWalker import TacticParser.Main +import TacticParser.Example.simple diff --git a/src/itp_interface/tools/tactic_parser/TacticParser/DependencyParser.lean b/src/itp_interface/tools/tactic_parser/TacticParser/DependencyParser.lean index 4e9cd0a..984aea6 100644 --- a/src/itp_interface/tools/tactic_parser/TacticParser/DependencyParser.lean +++ b/src/itp_interface/tools/tactic_parser/TacticParser/DependencyParser.lean @@ -3,6 +3,7 @@ import Lean.Data.Json import Lean.Elab.Frontend import TacticParser.Types import TacticParser.LineParser +import TacticParser.ProofExtractor namespace TacticParser @@ -261,6 +262,9 @@ unsafe def analyzeFileDependencies (filepath : System.FilePath) : IO FileDepende -- Parse all declarations using LineParser let decls ← parseFile filepath + -- Extract proofs from theorem/lemma/example declarations + let declsWithProofs ← extractProofsFromDecls decls content + -- Load environment with all imports and elaborate the file Lean.initSearchPath (← Lean.findSysroot) Lean.enableInitializersExecution @@ -282,7 +286,7 @@ unsafe def analyzeFileDependencies (filepath : System.FilePath) : IO FileDepende -- Analyze each declaration let mut declsWithDeps : Array DeclWithDependencies := #[] - for declInfo in decls do + for declInfo in declsWithProofs do -- Construct the fully qualified name for this declaration let declName := match declInfo.namespc with | some ns => diff --git a/src/itp_interface/tools/tactic_parser/TacticParser/Example/complex.lean b/src/itp_interface/tools/tactic_parser/TacticParser/Example/complex.lean new file mode 100644 index 0000000..f05e8e1 --- /dev/null +++ b/src/itp_interface/tools/tactic_parser/TacticParser/Example/complex.lean @@ -0,0 +1,25 @@ +import TacticParser.Example.simple + +namespace TacticParser.Example + +theorem 
additive: +∀ {a b: Nat}, addNat a b = a + b := by + intro a b + induction a generalizing b + simp [addNat] + rename_i a ih + simp [addNat, *] + grind + +theorem additive_identity1 : +∀ {a : Nat}, addNat 0 a = a := by + simp [addNat] + +theorem additive_identity2 : +∀ {a : Nat}, addNat a 0 = a := by + simp [additive] + +theorem additive_comm: +∀ {a b : Nat}, addNat a b = addNat b a := by + simp [additive] + grind diff --git a/src/itp_interface/tools/tactic_parser/TacticParser/Example/simple.lean b/src/itp_interface/tools/tactic_parser/TacticParser/Example/simple.lean new file mode 100644 index 0000000..d7b1690 --- /dev/null +++ b/src/itp_interface/tools/tactic_parser/TacticParser/Example/simple.lean @@ -0,0 +1,13 @@ +namespace TacticParser.Example + +theorem test : ∀ {a : Nat}, a + 0 = a := by grind + +theorem test1 : ∀ {a : Nat}, a + 0 = a + | 0 => by simp + | n + 1 => by simp + +def addNat : Nat → Nat → Nat + | 0, m => m + | n + 1, m => addNat n (m + 1) + +end TacticParser.Example diff --git a/src/itp_interface/tools/tactic_parser/TacticParser/ProofExtractor.lean b/src/itp_interface/tools/tactic_parser/TacticParser/ProofExtractor.lean new file mode 100644 index 0000000..e363bfe --- /dev/null +++ b/src/itp_interface/tools/tactic_parser/TacticParser/ProofExtractor.lean @@ -0,0 +1,151 @@ +/- +Proof extractor for theorem, lemma, and example declarations. +Uses a validation-based approach: tests candidate delimiters by replacing +the proof with `sorry` and checking if it parses successfully. +-/ +import Lean +import Lean.Parser +import TacticParser.Types +import TacticParser.SyntaxWalker +import Lean.Elab.Frontend + +namespace TacticParser + +open Lean Parser Elab + +/-- Represents a candidate delimiter position and type -/ +structure DelimiterCandidate where + position : Nat -- Byte position in the text + delimiterType : String -- ":=", "where", or "|" + deriving Repr, BEq + +instance : Ord DelimiterCandidate where + compare a b := compare a.position b.position + +/-- Try to parse a text snippet and return true if it parses without errors -/ +unsafe def tryParseSuccessfully (text : String) (cmdState? : Option Command.State): IO Bool := do + -- IO.println s!"Trying to parse text:\n{text}\n---" + -- IO.println s!"With cmdState?: {cmdState?.isSome}" + let (_, _, messages, _) ← try + IO.processInput text cmdState? 
Options.empty + catch e => + -- IO.println s!"Parsing exception: {e}" + return false + + let mut errorInfos : Array ErrorInfo := #[] + for msg in messages do + if msg.severity == .error then + let msgPos := Position.mk msg.pos.line msg.pos.column + let errorInfo := ErrorInfo.mk (← msg.data.toString) msgPos + errorInfos := errorInfos.push errorInfo + -- IO.println s!"Parsing error: {← msg.data.toString} at {msg.pos}" + if errorInfos.size > 0 then + return false + else + return true + +/-- Check if a substring starts with a given substring at a specific position -/ +def substringStartsWith (text : String) (startPos : Nat) (substr : String) : Bool := + if startPos + substr.length > text.length then + false + else + let extracted := text.drop startPos + extracted.startsWith substr + +/-- Find all occurrences of a substring in a text -/ +def findSubstrOccurences (text : String) (substr : String) : List Nat := + if substr.length > text.length then + [] + else + let all_pos := List.range (text.length - substr.length + 1) + let all_occurences := all_pos.filter (fun pos => substringStartsWith text pos substr) + all_occurences + +/-- Find all candidate delimiter positions -/ +def findCandidateDelimiters (text : String) : List DelimiterCandidate := + let assignPositions := findSubstrOccurences text ":=" + let wherePositions := findSubstrOccurences text "where" + let pipePositions := findSubstrOccurences text "|" + + let assignCandidates := assignPositions.map fun pos => + { position := pos, delimiterType := ":=" } + let whereCandidates := wherePositions.map fun pos => + { position := pos, delimiterType := "where" } + let pipeCandidates := pipePositions.map fun pos => + { position := pos, delimiterType := "|" } + + let allCandidates := assignCandidates ++ whereCandidates ++ pipeCandidates + allCandidates.toArray.qsort (fun a b => a.position < b.position) |>.toList + +/-- Extract proof from a declaration by testing candidate delimiters -/ +unsafe def extractProofFromDecl (declInfo : DeclInfo) (fileContent : String) : IO DeclInfo := do + -- Only process theorem, lemma, example + if declInfo.declType != DeclType.theorem && + declInfo.declType != DeclType.lemma && + declInfo.declType != DeclType.example then + return declInfo + + let text := declInfo.text + + -- Extract context before this declaration (imports + previous declarations) + -- Convert Position to byte offset + let mut contextBeforeDecl := + let lines := fileContent.splitOn "\n" + let beforeLines := lines.take (declInfo.startPos.line - 1) + String.intercalate "\n" beforeLines + contextBeforeDecl := contextBeforeDecl ++ "\n" + + -- IO.println s!"contextBeforeDecl:\n{contextBeforeDecl}\n---" + + let mut cmdState? : Option Command.State := none + try + let (initialCmdState, cmdState, messages, trees) ← IO.processInput contextBeforeDecl none Options.empty + cmdState? := some cmdState + catch e => + cmdState? 
:= none + + -- IO.println s!"cmdState?: {cmdState?.isSome}" + + -- Find all candidate delimiters + let candidates := findCandidateDelimiters text + + -- IO.println s!"text:\n{text}\n---" + + -- Test each candidate + for candidate in candidates do + let beforeDelimiter := text.take candidate.position + + -- IO.println s!"beforeDelimiter:\n{beforeDelimiter}\n---" + + -- Build test text with full context + let statementOnly := match candidate.delimiterType with + | "|" => beforeDelimiter.trim ++ " := sorry" + | ":=" => beforeDelimiter ++ " := sorry" + | "where" => beforeDelimiter ++ " := sorry" + | _ => beforeDelimiter ++ " := sorry" + + -- IO.println s!"statementOnly:\n{statementOnly}\n---" + + -- Try to parse + let success ← tryParseSuccessfully statementOnly cmdState? + if success then + -- Found valid split! + -- IO.println s!"Found valid split at position {candidate.position} with delimiter {candidate.delimiterType}" + let proof := text.drop candidate.position + let thrm := text.take candidate.position + return { declInfo with proof := some proof.trim , text := thrm.trim } + -- else + -- IO.println s!"Failed split at position {candidate.position} with delimiter {candidate.delimiterType}" + + -- No valid split found - no proof + return { declInfo with proof := none } + +/-- Extract proofs from multiple declarations -/ +unsafe def extractProofsFromDecls (decls : Array DeclInfo) (fileContent : String) : IO (Array DeclInfo) := do + let mut result := #[] + for decl in decls do + let processed ← extractProofFromDecl decl fileContent + result := result.push processed + return result + +end TacticParser diff --git a/src/itp_interface/tools/tactic_parser/TacticParser/Types.lean b/src/itp_interface/tools/tactic_parser/TacticParser/Types.lean index fee67f1..5dbac32 100644 --- a/src/itp_interface/tools/tactic_parser/TacticParser/Types.lean +++ b/src/itp_interface/tools/tactic_parser/TacticParser/Types.lean @@ -71,6 +71,7 @@ structure DeclInfo where text : String docString : Option String -- Extracted documentation comment namespc : Option String -- Current namespace + proof : Option String := none -- Extracted proof (for theorem, lemma, example) deriving Repr instance : ToJson DeclInfo where @@ -83,7 +84,8 @@ instance : ToJson DeclInfo where ("end_column", d.endPos.column), ("text", toJson d.text), ("doc_string", toJson d.docString), - ("namespace", toJson d.namespc) + ("namespace", toJson d.namespc), + ("proof", toJson d.proof) ] /-- Information about an import statement -/ diff --git a/src/itp_interface/tools/tactic_parser/lakefile.toml b/src/itp_interface/tools/tactic_parser/lakefile.toml index 16e7605..45d2d4d 100644 --- a/src/itp_interface/tools/tactic_parser/lakefile.toml +++ b/src/itp_interface/tools/tactic_parser/lakefile.toml @@ -4,6 +4,9 @@ defaultTargets = ["tactic-parser", "dependency-parser"] [[lean_lib]] name = "TacticParser" +[[lean_lib]] +name = "TacticParser.Example" + [[lean_exe]] name = "tactic-parser" root = "TacticParser.Main" diff --git a/src/itp_interface/tools/tactic_parser/test_user_example.lean b/src/itp_interface/tools/tactic_parser/test_user_example.lean new file mode 100644 index 0000000..e8b4b94 --- /dev/null +++ b/src/itp_interface/tools/tactic_parser/test_user_example.lean @@ -0,0 +1,26 @@ +import Lean +import TacticParser.DependencyParser +import TacticParser.ProofExtractor +import TacticParser.Types + +open TacticParser + + +unsafe def analyze (filePath : String) : IO Unit := do + let fileDepAnalysis ← analyzeFileDependencies filePath + IO.println s!"Dependency 
Analysis for {filePath}:" + let json := Lean.toJson fileDepAnalysis + IO.println json.pretty + +def testCodes : List String := [ + "TacticParser/Example/simple.lean", + "TacticParser/Example/complex.lean" +] + +unsafe def main : IO Unit := do + IO.println "Testing User's Example" + IO.println (String.mk (List.replicate 70 '=')) + + for test in testCodes do + let filePath := test + analyze filePath From d24d0a1759816b5ec11c2952055652bbd55c6fdd Mon Sep 17 00:00:00 2001 From: Amitayush Thakur Date: Fri, 14 Nov 2025 06:09:41 +0000 Subject: [PATCH 02/25] Refactor data extraction and parsing: update FileDependencyAnalysis handling and add proof attribute --- .../lean4_local_data_extraction_transform.py | 23 ++++++++----- src/itp_interface/tools/tactic_parser.py | 34 ++++++++++++++++++- .../tools/training_data_format.py | 4 +-- 3 files changed, 50 insertions(+), 11 deletions(-) diff --git a/src/itp_interface/tools/lean4_local_data_extraction_transform.py b/src/itp_interface/tools/lean4_local_data_extraction_transform.py index 9205638..6114416 100644 --- a/src/itp_interface/tools/lean4_local_data_extraction_transform.py +++ b/src/itp_interface/tools/lean4_local_data_extraction_transform.py @@ -10,8 +10,9 @@ import uuid from itp_interface.tools.simple_lean4_sync_executor import SimpleLean4SyncExecutor from itp_interface.tools.coq_training_data_generator import GenericTrainingDataGenerationTransform, TrainingDataGenerationType -from itp_interface.tools.training_data_format import MergableCollection, TrainingDataMetadataFormat, ExtractionDataCollection, TheoremProvingTrainingDataFormat +from itp_interface.tools.training_data_format import MergableCollection, TrainingDataMetadataFormat, ExtractionDataCollection from itp_interface.tools.training_data import TrainingData, DataLayoutFormat +from itp_interface.tools.tactic_parser import FileDependencyAnalysis class Local4DataExtractionTransform(GenericTrainingDataGenerationTransform): def __init__(self, @@ -62,13 +63,19 @@ def __call__(self, self.logger.info(f"Extracted {len(file_dep_analyses)} FileDependencyAnalysis objects from {file_namespace}") self.logger.info(f"file_dep_analyses: {file_dep_analyses}") assert len(file_dep_analyses) == 1, "Expected exactly one FileDependencyAnalysis object" - file_dep_analysis = file_dep_analyses[0] - for decls in file_dep_analysis.declarations: - line_info = decls.decl_info - if theorems is not None and line_info.name not in theorems: - continue - training_data.merge(decls) - cnt += 1 + for fda in file_dep_analyses: + for decl in fda.declarations: + new_fda = FileDependencyAnalysis( + file_path=fda.file_path, + module_name=fda.module_name, + imports=fda.imports, + declarations=[]) + line_info = decl.decl_info + if theorems is not None and line_info.name not in theorems: + continue + new_fda.declarations.append(decl) + training_data.merge(new_fda) + cnt += 1 training_data.meta.last_proof_id = theorem_id self.logger.info(f"===============Finished processing {file_namespace}=====================") self.logger.info(f"Total declarations processed in this transform: {cnt}") diff --git a/src/itp_interface/tools/tactic_parser.py b/src/itp_interface/tools/tactic_parser.py index 966f781..66aaae0 100644 --- a/src/itp_interface/tools/tactic_parser.py +++ b/src/itp_interface/tools/tactic_parser.py @@ -106,6 +106,7 @@ class LeanLineInfo(BaseModel): name: Optional[str] = None doc_string: Optional[str] = None namespace: Optional[str] = None + proof: Optional[str] = None # Full proof text (if available) def __repr__(self) -> str: 
return f"LeanLineInfo(text={self.text!r}, line={self.line}, column={self.column})" @@ -188,7 +189,8 @@ def from_dict(decl_data: Dict) -> 'DeclWithDependencies': decl_type=decl_dict.get('declType'), name=decl_dict.get('name'), doc_string=decl_dict.get('docString'), - namespace=decl_dict.get('namespace') + namespace=decl_dict.get('namespace'), + proof=decl_dict.get('proof') ) # Parse dependencies @@ -248,6 +250,23 @@ class FileDependencyAnalysis(BaseModel): def __repr__(self) -> str: return f"FileDependencyAnalysis({self.module_name}, {len(self.declarations)} decls)" + + def to_json(self, indent=0) -> str: + if indent == 0: + return self.model_dump_json() + else: + return self.model_dump_json(indent=indent) + + @staticmethod + def load_from_string(json_text: str) -> 'FileDependencyAnalysis': + return FileDependencyAnalysis.model_validate_json(json_text) + + @staticmethod + def load_from_file(file_path: str) -> 'FileDependencyAnalysis': + with open(file_path, 'r', encoding='utf-8') as f: + data = f.read() + return FileDependencyAnalysis.load_from_string(data) + # Create an enum for parsing request type class RequestType(Enum): @@ -738,6 +757,19 @@ def print_tactics(tactics: List[LeanLineInfo], logger: Optional[logging.Logger] logging.basicConfig(level=logging.DEBUG) project_path = str(Path(__file__).parent.parent.parent / "data" / "test" / "lean4_proj") + with TacticParser() as parser: + # Example 0: Empty proof + lean_code = """theorem test : ∀ {a : Nat}, a + 0 = a +| 0 => by simp +| n + 1 => by simp +""" + + print("Parsing example 0...") + tactics, errors = parser.parse(lean_code) + print_tactics(tactics) + if errors: + print(f"Error: {errors}") + with TacticParser() as parser: # Example 1: Simple proof lean_code = "example : True := by trivial" diff --git a/src/itp_interface/tools/training_data_format.py b/src/itp_interface/tools/training_data_format.py index e7dabfa..0e55073 100644 --- a/src/itp_interface/tools/training_data_format.py +++ b/src/itp_interface/tools/training_data_format.py @@ -12,7 +12,7 @@ from collections import OrderedDict from typing import List, Optional, Union, runtime_checkable, Protocol from pydantic import BaseModel -from itp_interface.tools.tactic_parser import DeclWithDependencies +from itp_interface.tools.tactic_parser import FileDependencyAnalysis @runtime_checkable class TrainingDataFormat(Protocol): @@ -503,7 +503,7 @@ def load_from_string(json_text: str, logger: logging.Logger = None): return deserialized class ExtractionDataCollection(BaseModel): - training_data: list[DeclWithDependencies] = [] + training_data: list[FileDependencyAnalysis] = [] def __len__(self) -> int: return len(self.training_data) From faacec6ccda799734eb817a01d3309a90601d7da Mon Sep 17 00:00:00 2001 From: Amitayush Thakur Date: Sat, 15 Nov 2025 05:59:50 +0000 Subject: [PATCH 03/25] Added graph data collection tools. 
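
The LeanDeclarationDB added in simple_sqlite.py persists every extracted
declaration and its dependency edges to SQLite so results can be queried after
extraction. A rough usage sketch of the API introduced by this patch (the
database path and declaration names below are illustrative, not fixed by the
tool):

    from itp_interface.tools.simple_sqlite import LeanDeclarationDB

    # Opens (or creates) the database; WAL mode is enabled for concurrent actors.
    with LeanDeclarationDB("lean_declarations.db") as db:
        # An ID is assigned the first time a declaration is seen, whether it
        # appears as a declaration itself or only as a dependency of one.
        decl_id = db.get_or_create_decl_id(
            name="dvd_trans",
            namespace=None,
            file_path="Mathlib/Algebra/Divisibility/Basic.lean",
            module_name="Mathlib.Algebra.Divisibility.Basic",
        )
        deps = db.get_dependencies(decl_id)   # declarations this one depends on
        stats = db.get_statistics()           # counts of files, declarations, edges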
--- src/itp_interface/main/config.py | 9 +- .../benchmark/simple_benchmark_lean_ext.yaml | 6 +- src/itp_interface/main/run_tool.py | 27 +- .../lean4_local_data_extraction_transform.py | 177 +++- src/itp_interface/tools/simple_sqlite.py | 830 ++++++++++++++++++ 5 files changed, 995 insertions(+), 54 deletions(-) create mode 100644 src/itp_interface/tools/simple_sqlite.py diff --git a/src/itp_interface/main/config.py b/src/itp_interface/main/config.py index 5d0c042..c4239e1 100644 --- a/src/itp_interface/main/config.py +++ b/src/itp_interface/main/config.py @@ -79,6 +79,7 @@ class ExtractFile(object): class EvalDataset(object): project: str files: typing.Union[typing.List[EvalFile], typing.List[ExtractFile]] + exclude_files: typing.List[str] = field(default_factory=list) @dataclass_json @dataclass @@ -135,7 +136,6 @@ def add_theorem_to_maps(self, path: str, theorem: str, proof_result: ProofSearch def parse_config(cfg): - is_extraction_request = False env_settings_cfg = cfg["env_settings"] env_settings = EnvSettings( name=env_settings_cfg["name"], @@ -179,7 +179,6 @@ def parse_config(cfg): eval_files.append(EvalFile( path=file_cfg["path"], theorems=theorems)) - is_extraction_request = False elif "declarations" in file_cfg: declarations = None if type(file_cfg["declarations"]) == str: @@ -189,12 +188,12 @@ def parse_config(cfg): eval_files.append(ExtractFile( path=file_cfg["path"], declarations=declarations)) - is_extraction_request = True else: raise ValueError(f"File config must have either 'theorems' or 'declarations': {file_cfg}") eval_datasets.append(EvalDataset( project=dataset_cfg["project"], - files=eval_files)) + files=eval_files, + exclude_files=dataset_cfg.get("exclude_files", []))) language = ProofAction.Language(benchmark_cfg["language"]) benchmark = EvalBenchmark( name=benchmark_cfg["name"], @@ -205,6 +204,6 @@ def parse_config(cfg): few_shot_metadata_filename_for_retrieval=benchmark_cfg["few_shot_metadata_filename_for_retrieval"], dfs_data_path_for_retrieval=benchmark_cfg["dfs_data_path_for_retrieval"], dfs_metadata_filename_for_retrieval=benchmark_cfg["dfs_metadata_filename_for_retrieval"], - is_extraction_request=is_extraction_request and benchmark_cfg.get("is_extraction_request", False), + is_extraction_request=benchmark_cfg.get("is_extraction_request", False), setup_cmds=benchmark_cfg["setup_cmds"] if "setup_cmds" in benchmark_cfg else []) return Experiments(env_settings=env_settings, run_settings=eval_settings, benchmark=benchmark) \ No newline at end of file diff --git a/src/itp_interface/main/configs/benchmark/simple_benchmark_lean_ext.yaml b/src/itp_interface/main/configs/benchmark/simple_benchmark_lean_ext.yaml index 2ed8c1d..586c2f8 100644 --- a/src/itp_interface/main/configs/benchmark/simple_benchmark_lean_ext.yaml +++ b/src/itp_interface/main/configs/benchmark/simple_benchmark_lean_ext.yaml @@ -8,6 +8,6 @@ dfs_metadata_filename_for_retrieval: is_extraction_request: true datasets: - project: src/data/test/lean4_proj - files: - - path: Lean4Proj/Basic.lean - declarations: "*" \ No newline at end of file + files: [] + exclude_files: + - src/data/test/lean4_proj/.lake \ No newline at end of file diff --git a/src/itp_interface/main/run_tool.py b/src/itp_interface/main/run_tool.py index 7fbd7da..a6ce92d 100644 --- a/src/itp_interface/main/run_tool.py +++ b/src/itp_interface/main/run_tool.py @@ -26,6 +26,7 @@ RayResourcePoolActor = None TimedRayExec = None RayUtils = None +from pathlib import Path from itp_interface.rl.proof_action import ProofAction from 
itp_interface.tools.isabelle_server import IsabelleServer from itp_interface.rl.simple_proof_env import ProofEnvReRankStrategy @@ -122,12 +123,20 @@ def _get_all_lemmas_impl( def _get_all_lean_files_in_folder_recursively( - project_folder: str) -> typing.List[str]: + project_folder: str, exclude_list: typing.List[str]) -> typing.List[str]: lean_files = [] + project_folder_path = Path(project_folder) for root, dirs, files in os.walk(project_folder): for file in files: if file.endswith(".lean"): - lean_files.append(os.path.join(root, file)) + # Get full file path + full_path = os.path.join(root, file) + # Check if the full path starts with any exclude path + if any(full_path.startswith(exclude) for exclude in exclude_list): + continue + full_path_obj = Path(full_path) + rel_to_project = str(full_path_obj.relative_to(project_folder_path)) + lean_files.append(rel_to_project) return lean_files # Create Ray remote version if Ray is available @@ -248,7 +257,7 @@ def create_yaml(project_to_theorems, name, eval_benchmark: EvalBenchmark, output with open(output_file, 'w') as yaml_file: yaml.dump(data, yaml_file, sort_keys=False) -def add_transform(experiment: Experiments, clone_dir: str, resources: list, transforms: list, logger: logging.Logger = None): +def add_transform(experiment: Experiments, clone_dir: str, resources: list, transforms: list, logger: logging.Logger = None, str_time: str = None): global ray_resource_pool if experiment.run_settings.transform_type == TransformType.LOCAL: if experiment.benchmark.language == ProofAction.Language.LEAN: @@ -260,10 +269,14 @@ def add_transform(experiment: Experiments, clone_dir: str, resources: list, tran os.makedirs(clone_dir, exist_ok=True) elif experiment.benchmark.language == ProofAction.Language.LEAN4: if experiment.benchmark.is_extraction_request: + output_dir_path = os.path.join(experiment.run_settings.output_dir, str_time) + os.makedirs(output_dir_path, exist_ok=True) + db_path = os.path.join(output_dir_path, "lean4_extraction_db.sqlite") transform = Local4DataExtractionTransform( experiment.run_settings.dep_depth, buffer_size=experiment.run_settings.buffer_size, - logger=logger) + logger=logger, + db_path=db_path) else: transform = Local4DataGenerationTransform( experiment.run_settings.dep_depth, @@ -320,8 +333,10 @@ def get_decl_lemmas_to_parse( if experiment.benchmark.language == ProofAction.Language.LEAN4 \ and experiment.benchmark.is_extraction_request: if len(dataset.files) == 0: + logger.warning(f"No files specified for Lean4 extraction in dataset {dataset.project}, extracting from all Lean4 files in the project") # List all the files recursively in the project folder - files_in_dataset = _get_all_lean_files_in_folder_recursively(dataset.project) + files_in_dataset = _get_all_lean_files_in_folder_recursively(dataset.project, dataset.exclude_files) + logger.info(f"Found {len(files_in_dataset)} Lean4 files in the project {dataset.project}") for file_path in files_in_dataset: file_to_theorems[file_path] = ["*"] file_args[file_path] = {} @@ -459,7 +474,7 @@ def run_data_generation_pipeline(experiment: Experiments, log_dir: str, checkpoi transforms = [] str_time = time.strftime("%Y%m%d-%H%M%S") clone_dir = os.path.join(experiment.run_settings.output_dir, "clone{}".format(str_time)) - clone_dir = add_transform(experiment, clone_dir, resources, transforms, logger) + clone_dir = add_transform(experiment, clone_dir, resources, transforms, logger, str_time) # Find all the lemmas to prove project_to_theorems = {} other_args = {} diff --git 
a/src/itp_interface/tools/lean4_local_data_extraction_transform.py b/src/itp_interface/tools/lean4_local_data_extraction_transform.py index 6114416..673d0fb 100644 --- a/src/itp_interface/tools/lean4_local_data_extraction_transform.py +++ b/src/itp_interface/tools/lean4_local_data_extraction_transform.py @@ -8,11 +8,13 @@ sys.path.append(root_dir) import typing import uuid +from pathlib import Path from itp_interface.tools.simple_lean4_sync_executor import SimpleLean4SyncExecutor from itp_interface.tools.coq_training_data_generator import GenericTrainingDataGenerationTransform, TrainingDataGenerationType from itp_interface.tools.training_data_format import MergableCollection, TrainingDataMetadataFormat, ExtractionDataCollection from itp_interface.tools.training_data import TrainingData, DataLayoutFormat from itp_interface.tools.tactic_parser import FileDependencyAnalysis +from itp_interface.tools.simple_sqlite import LeanDeclarationDB class Local4DataExtractionTransform(GenericTrainingDataGenerationTransform): def __init__(self, @@ -20,11 +22,13 @@ def __init__(self, max_search_results = None, buffer_size : int = 10000, logger = None, - max_parallelism : int = 4): + max_parallelism : int = 4, + db_path : typing.Optional[str] = None): super().__init__(TrainingDataGenerationType.LOCAL, buffer_size, logger) self.depth = depth self.max_search_results = max_search_results self.max_parallelism = max_parallelism + self.db_path = db_path # Store path, don't create connection yet (for Ray actors) def get_meta_object(self) -> TrainingDataMetadataFormat: return TrainingDataMetadataFormat( @@ -41,45 +45,135 @@ def load_meta_from_file(self, file_path) -> MergableCollection: def load_data_from_file(self, file_path) -> MergableCollection: return ExtractionDataCollection.load_from_file(file_path, self.logger) - def __call__(self, - training_data: TrainingData, - project_id : str, - lean_executor: SimpleLean4SyncExecutor, - print_coq_executor_callback: typing.Callable[[], SimpleLean4SyncExecutor], - theorems: typing.List[str] = None, + def _remove_lake_package_prefix(self, module_name: str) -> str: + if module_name.startswith("lake.packages.") or module_name.startswith(".lake.packages."): + parts = module_name.split('.') + if len(parts) > 3: + module_name = '.'.join(parts[3:]) + else: + module_name = '' + return module_name + + def _remove_lake_file_prefix(self, path: Path) -> str: + parts = path.parts + if ".lake" in parts: + lake_index = parts.index(".lake") + new_parts = parts[lake_index + 3:] + new_path = Path(*new_parts) + return str(new_path) + return str(path) + + def _get_file_path(self, project_path: str, file_path: str) -> str: + fp = Path(file_path) + pp = Path(project_path) + fp_abs = fp.resolve() + pp_abs = pp.resolve() + relative_path = fp_abs.relative_to(pp_abs) + rel_file_path = self._remove_lake_file_prefix(relative_path) + return str(rel_file_path) + + def _get_module_name(self, project_path: str, module_name: str) -> str: + pp = Path(project_path) + pp_abs = pp.resolve() + pp_module = str(pp_abs).replace('/', '.') + pp_module = pp_module.lstrip('.') + # self.logger.info(f"Project module prefix: {pp_module}") + if module_name.startswith(pp_module): + module_name = module_name[len(pp_module):] + module_name = module_name.lstrip('.') + # self.logger.info(f"Module name after removing project prefix: {module_name}") + module_name = self._remove_lake_package_prefix(module_name) + return module_name + + def __call__(self, + training_data: TrainingData, + project_id : str, + lean_executor: 
SimpleLean4SyncExecutor, + print_coq_executor_callback: typing.Callable[[], SimpleLean4SyncExecutor], + theorems: typing.List[str] = None, other_args: dict = {}) -> TrainingData: - file_namespace = lean_executor.main_file.replace('/', '.') + file_path = lean_executor.main_file + project_path = project_id + rel_file_path = self._get_file_path(project_path, file_path) + file_namespace = rel_file_path.replace('/', '.') self.logger.info(f"=========================Processing {file_namespace}=========================") - theorem_id = str(uuid.uuid4()) - if isinstance(theorems, list) and len(theorems) == 1 and theorems[0] == "*": - theorems = None - else: - theorems = set(theorems) if theorems is not None else None - cnt = 0 - temp_dir = os.path.join(training_data.folder, "temp") - os.makedirs(temp_dir, exist_ok=True) - json_output_path = f"{temp_dir}/{file_namespace.replace('.', '_')}.lean.deps.json" - file_dep_analyses = lean_executor.extract_all_theorems_and_definitions(json_output_path=json_output_path) - self.logger.info(f"Extracted {len(file_dep_analyses)} FileDependencyAnalysis objects from {file_namespace}") - self.logger.info(f"file_dep_analyses: {file_dep_analyses}") - assert len(file_dep_analyses) == 1, "Expected exactly one FileDependencyAnalysis object" - for fda in file_dep_analyses: - for decl in fda.declarations: - new_fda = FileDependencyAnalysis( - file_path=fda.file_path, - module_name=fda.module_name, - imports=fda.imports, - declarations=[]) - line_info = decl.decl_info - if theorems is not None and line_info.name not in theorems: - continue - new_fda.declarations.append(decl) - training_data.merge(new_fda) - cnt += 1 - training_data.meta.last_proof_id = theorem_id - self.logger.info(f"===============Finished processing {file_namespace}=====================") - self.logger.info(f"Total declarations processed in this transform: {cnt}") - return training_data + + # Create database connection for this Ray actor (if db_path is provided) + db = None + if self.db_path: + db = LeanDeclarationDB(self.db_path) + self.logger.info(f"Connected to database: {self.db_path}") + + try: + if isinstance(theorems, list) and len(theorems) == 1 and theorems[0] == "*": + theorems = None + else: + theorems = set(theorems) if theorems is not None else None + cnt = 0 + temp_dir = os.path.join(training_data.folder, "temp") + os.makedirs(temp_dir, exist_ok=True) + json_output_path = f"{temp_dir}/{file_namespace.replace('.', '_')}.lean.deps.json" + file_dep_analyses = lean_executor.extract_all_theorems_and_definitions(json_output_path=json_output_path) + self.logger.info(f"Extracted {len(file_dep_analyses)} FileDependencyAnalysis objects from {file_namespace}") + self.logger.info(f"file_dep_analyses: {file_dep_analyses}") + assert len(file_dep_analyses) == 1, "Expected exactly one FileDependencyAnalysis object" + + last_decl_id = None + for fda in file_dep_analyses: + fda_rel_path = self._get_file_path(project_path, fda.file_path) + # Remove the pp_module prefix from the fda module name + fda_module_name = self._get_module_name(project_path, fda.module_name) + + # Insert file and imports into database (if db is enabled) + if db and fda.imports: + db.insert_file_imports(fda_rel_path, fda_module_name, fda.imports) + self.logger.info(f"Inserted file and {len(fda.imports)} imports for {fda_rel_path}") + + for decl in fda.declarations: + # self.logger.info(f"Processing module: {project_path}") + # self.logger.info(f"Relative file path: {fda_rel_path}") + # self.logger.info(f"Original Module name: 
{fda.module_name}") + # self.logger.info(f"Rel Module name: {fda_module_name}") + + # Get or create decl_id from database (or generate new one if no DB) + if db: + decl_id = db.process_declaration( + fda_file_path=fda_rel_path, + fda_module_name=fda_module_name, + decl=decl + ) + self.logger.info(f"Processed declaration '{decl.decl_info.name}' with ID: {decl_id}") + else: + # Fallback: generate unique ID without database + import uuid + timestamp = str(int(uuid.uuid1().time_low)) + random_id = str(uuid.uuid4()) + decl_id = f"{timestamp}_{random_id}" + + new_fda = FileDependencyAnalysis( + file_path=str(fda_rel_path), + module_name=fda_module_name, + imports=fda.imports, + declarations=[]) + line_info = decl.decl_info + if theorems is not None and line_info.name not in theorems: + continue + decl.decl_id = decl_id + new_fda.declarations.append(decl) + training_data.merge(new_fda) + cnt += 1 + last_decl_id = decl_id + + if last_decl_id: + training_data.meta.last_proof_id = last_decl_id + self.logger.info(f"===============Finished processing {file_namespace}=====================") + self.logger.info(f"Total declarations processed in this transform: {cnt}") + return training_data + finally: + # Clean up database connection + if db: + db.close() + self.logger.info("Closed database connection") if __name__ == "__main__": @@ -91,11 +185,12 @@ def __call__(self, project_dir = 'data/test/Mathlib' # file_name = 'data/test/lean4_proj/Lean4Proj/Basic.lean' file_name = 'data/test/Mathlib/.lake/packages/mathlib/Mathlib/Algebra/Divisibility/Basic.lean' - project_id = project_dir.replace('/', '.') + project_id = project_dir #.replace('/', '.') time_str = time.strftime("%Y%m%d-%H%M%S") output_path = f".log/local_data_generation_transform/data/{time_str}" log_path = f".log/local_data_generation_transform/log/{time_str}" log_file = f"{log_path}/local_data_generation_transform-{time_str}.log" + db_path = f"{log_path}/lean_declarations.db" os.makedirs(output_path, exist_ok=True) os.makedirs(log_path, exist_ok=True) logging.basicConfig(filename=log_file, level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s') @@ -104,7 +199,9 @@ def _print_lean_executor_callback(): search_lean_exec = SimpleLean4SyncExecutor(main_file=file_name, project_root=project_dir) search_lean_exec.__enter__() return search_lean_exec - transform = Local4DataExtractionTransform(0, buffer_size=1000) + # Create transform with database enabled + transform = Local4DataExtractionTransform(0, buffer_size=1000, db_path=db_path) + logger.info(f"Using database: {db_path}") training_data = TrainingData( output_path, "training_metadata.json", diff --git a/src/itp_interface/tools/simple_sqlite.py b/src/itp_interface/tools/simple_sqlite.py new file mode 100644 index 0000000..5eacefc --- /dev/null +++ b/src/itp_interface/tools/simple_sqlite.py @@ -0,0 +1,830 @@ +""" +Simple SQLite database for storing and querying Lean declaration data. + +This module provides a thread-safe SQLite database for storing Lean declarations, +their dependencies, and file metadata. Designed to work with Ray actors processing +files in parallel. +""" + +import sqlite3 +import uuid +from typing import List, Dict, Any, Optional +from contextlib import contextmanager + + +class LeanDeclarationDB: + """ + Thread-safe SQLite database for storing Lean file and declaration information. 
+ + Key features: + - Automatic ID assignment on first discovery (declaration or dependency) + - Simplified dependency storage (edges only: A depends on B) + - Thread-safe operations with WAL mode for concurrent Ray actors + - Idempotent operations for safe parallel execution + """ + + def __init__(self, db_path: str, timeout: float = 30.0): + """ + Initialize the database connection. + + Args: + db_path: Path to the SQLite database file + timeout: Timeout in seconds for database locks (default 30s) + """ + self.db_path = db_path + self.timeout = timeout + self.conn = sqlite3.connect(db_path, timeout=timeout, check_same_thread=False) + # Explicitly set UTF-8 encoding + self.conn.execute("PRAGMA encoding = 'UTF-8'") + self.conn.row_factory = sqlite3.Row + # Enable WAL mode BEFORE creating tables for better concurrency + self.enable_wal_mode() + # Create tables (safe with IF NOT EXISTS even from multiple actors) + self._create_tables() + + def _generate_unique_id(self) -> str: + """ + Generate a unique ID for a declaration. + + Format: {timestamp}_{uuid4} + Same format as used in lean4_local_data_extraction_transform.py + + Returns: + Unique identifier string + """ + timestamp = str(int(uuid.uuid1().time_low)) + random_id = str(uuid.uuid4()) + return f"{timestamp}_{random_id}" + + def enable_wal_mode(self): + """ + Enable Write-Ahead Logging mode for better concurrent write performance. + + WAL mode allows multiple readers and one writer to access the database + simultaneously, which is essential for Ray actors processing files in parallel. + """ + self.conn.execute("PRAGMA journal_mode=WAL") + self.conn.execute("PRAGMA synchronous=NORMAL") + self.conn.commit() + + def _create_tables(self): + """ + Create the database schema with proper indexes and constraints. 
+ + Safe for concurrent execution from multiple Ray actors: + - Uses CREATE TABLE IF NOT EXISTS (idempotent) + - Uses CREATE INDEX IF NOT EXISTS (idempotent) + - WAL mode is enabled before this is called + - 30s timeout handles lock contention + """ + cursor = self.conn.cursor() + + # Files table - stores file metadata + cursor.execute(""" + CREATE TABLE IF NOT EXISTS files ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + file_path TEXT UNIQUE NOT NULL, + module_name TEXT NOT NULL + ) + """) + + # Imports table - stores file import relationships + cursor.execute(""" + CREATE TABLE IF NOT EXISTS imports ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + file_id INTEGER NOT NULL, + end_pos INTEGER, + module_name TEXT, + start_pos INTEGER, + text TEXT, + FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE + ) + """) + + # Declarations table - stores all declarations (complete or partial) + cursor.execute(""" + CREATE TABLE IF NOT EXISTS declarations ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + decl_id TEXT UNIQUE NOT NULL, + name TEXT NOT NULL, + namespace TEXT, + file_path TEXT, + module_name TEXT, + decl_type TEXT, + text TEXT, + line INTEGER, + column INTEGER, + end_line INTEGER, + end_column INTEGER, + doc_string TEXT, + proof TEXT, + -- A declaration is uniquely identified by its name and location + UNIQUE(name, namespace, file_path, module_name) + ) + """) + + # Simplified dependencies table - stores only edges (A depends on B) + cursor.execute(""" + CREATE TABLE IF NOT EXISTS declaration_dependencies ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + from_decl_id TEXT NOT NULL, + to_decl_id TEXT NOT NULL, + UNIQUE(from_decl_id, to_decl_id), + FOREIGN KEY (from_decl_id) REFERENCES declarations(decl_id) ON DELETE CASCADE, + FOREIGN KEY (to_decl_id) REFERENCES declarations(decl_id) ON DELETE CASCADE + ) + """) + + # Create indexes for faster queries + cursor.execute(""" + CREATE INDEX IF NOT EXISTS idx_files_path + ON files(file_path) + """) + cursor.execute(""" + CREATE INDEX IF NOT EXISTS idx_files_module + ON files(module_name) + """) + cursor.execute(""" + CREATE INDEX IF NOT EXISTS idx_declarations_name + ON declarations(name) + """) + cursor.execute(""" + CREATE INDEX IF NOT EXISTS idx_declarations_namespace + ON declarations(namespace) + """) + cursor.execute(""" + CREATE INDEX IF NOT EXISTS idx_declarations_decl_id + ON declarations(decl_id) + """) + cursor.execute(""" + CREATE INDEX IF NOT EXISTS idx_declarations_lookup + ON declarations(name, namespace, file_path, module_name) + """) + cursor.execute(""" + CREATE INDEX IF NOT EXISTS idx_dependencies_from + ON declaration_dependencies(from_decl_id) + """) + cursor.execute(""" + CREATE INDEX IF NOT EXISTS idx_dependencies_to + ON declaration_dependencies(to_decl_id) + """) + + self.conn.commit() + + @contextmanager + def transaction(self): + """ + Context manager for database transactions. + + Usage: + with db.transaction(): + db.insert_something(...) + db.insert_something_else(...) + """ + try: + yield self.conn + self.conn.commit() + except Exception as e: + self.conn.rollback() + raise e + + def get_or_create_decl_id( + self, + name: str, + namespace: Optional[str] = None, + file_path: Optional[str] = None, + module_name: Optional[str] = None + ) -> str: + """ + Get existing decl_id or create a new one for a declaration. + + This is the core method for ID assignment. IDs are assigned as soon as + a declaration is discovered (either as a dependency or as a declaration itself). 
+ + Args: + name: Declaration name (required) + namespace: Namespace (can be None) + file_path: File path (can be None for unresolved dependencies) + module_name: Module name (can be None for unresolved dependencies) + + Returns: + The decl_id (existing or newly created) + """ + cursor = self.conn.cursor() + + # Try to find existing declaration + # Handle NULL values properly in SQL + cursor.execute(""" + SELECT decl_id FROM declarations + WHERE name = ? + AND (namespace IS ? OR (namespace IS NULL AND ? IS NULL)) + AND (file_path IS ? OR (file_path IS NULL AND ? IS NULL)) + AND (module_name IS ? OR (module_name IS NULL AND ? IS NULL)) + """, (name, namespace, namespace, file_path, file_path, module_name, module_name)) + + row = cursor.fetchone() + if row: + return row[0] # Return existing ID + + # Generate new ID and insert minimal record + new_decl_id = self._generate_unique_id() + + try: + cursor.execute(""" + INSERT INTO declarations (decl_id, name, namespace, file_path, module_name) + VALUES (?, ?, ?, ?, ?) + """, (new_decl_id, name, namespace, file_path, module_name)) + self.conn.commit() + except sqlite3.IntegrityError: + # Race condition: another process inserted it between our SELECT and INSERT + # Query again to get the existing ID + cursor.execute(""" + SELECT decl_id FROM declarations + WHERE name = ? + AND (namespace IS ? OR (namespace IS NULL AND ? IS NULL)) + AND (file_path IS ? OR (file_path IS NULL AND ? IS NULL)) + AND (module_name IS ? OR (module_name IS NULL AND ? IS NULL)) + """, (name, namespace, namespace, file_path, file_path, module_name, module_name)) + row = cursor.fetchone() + if row: + return row[0] + else: + # This shouldn't happen, but raise if it does + raise + + return new_decl_id + + def upsert_declaration_full_info( + self, + decl_id: str, + name: str, + namespace: Optional[str], + file_path: str, + module_name: str, + decl_type: Optional[str] = None, + text: Optional[str] = None, + line: Optional[int] = None, + column: Optional[int] = None, + end_line: Optional[int] = None, + end_column: Optional[int] = None, + doc_string: Optional[str] = None, + proof: Optional[str] = None + ): + """ + Update a declaration with complete information. + + This is called when we process the actual declaration (not just a reference). + Updates the record with full metadata. + + Args: + decl_id: The declaration ID + name: Declaration name + namespace: Namespace + file_path: File path + module_name: Module name + decl_type: Declaration type (theorem, def, etc.) + text: Full declaration text + line: Starting line number + column: Starting column number + end_line: Ending line number + end_column: Ending column number + doc_string: Documentation string + proof: Proof text + """ + cursor = self.conn.cursor() + + cursor.execute(""" + UPDATE declarations + SET decl_type = ?, + text = ?, + line = ?, + column = ?, + end_line = ?, + end_column = ?, + doc_string = ?, + proof = ? + WHERE decl_id = ? + """, (decl_type, text, line, column, end_line, end_column, doc_string, proof, decl_id)) + + self.conn.commit() + + def insert_dependency_edge(self, from_decl_id: str, to_decl_id: str): + """ + Insert a dependency edge: from_decl_id depends on to_decl_id. + + Uses INSERT OR IGNORE for idempotency (safe to call multiple times). + + Args: + from_decl_id: The declaration that has the dependency + to_decl_id: The declaration being depended on + """ + cursor = self.conn.cursor() + + cursor.execute(""" + INSERT OR IGNORE INTO declaration_dependencies (from_decl_id, to_decl_id) + VALUES (?, ?) 
+ """, (from_decl_id, to_decl_id)) + + self.conn.commit() + + def process_fda_list(self, fda_list: List) -> List[str]: + """ + Process a list of FileDependencyAnalysis objects. + + Args: + fda_list: List of FileDependencyAnalysis objects + + Returns: + List of all decl_ids that were processed + """ + all_decl_ids = [] + + for fda in fda_list: + # Insert file and imports first + if fda.imports: + self.insert_file_imports(fda.file_path, fda.module_name, fda.imports) + + # Process each declaration in this file + for decl in fda.declarations: + decl_id = self.process_declaration(fda.file_path, fda.module_name, decl) + all_decl_ids.append(decl_id) + + return all_decl_ids + + def process_declaration( + self, + fda_file_path: str, + fda_module_name: str, + decl + ) -> str: + """ + Process a declaration from a FileDependencyAnalysis object. + + This is the main high-level method that: + 1. Gets or creates decl_id for this declaration + 2. Updates full declaration info + 3. Processes all dependencies and creates edges + + Args: + fda_file_path: File path from FileDependencyAnalysis + fda_module_name: Module name from FileDependencyAnalysis + decl: DeclWithDependencies object + + Returns: + The decl_id for this declaration + """ + # Get or create ID for this declaration + decl_id = self.get_or_create_decl_id( + name=decl.decl_info.name, + namespace=decl.decl_info.namespace, + file_path=fda_file_path, + module_name=fda_module_name + ) + + # Update with full declaration info + self.upsert_declaration_full_info( + decl_id=decl_id, + name=decl.decl_info.name, + namespace=decl.decl_info.namespace, + file_path=fda_file_path, + module_name=fda_module_name, + decl_type=decl.decl_info.decl_type, + text=decl.decl_info.text, + line=decl.decl_info.line, + column=decl.decl_info.column, + end_line=decl.decl_info.end_line, + end_column=decl.decl_info.end_column, + doc_string=decl.decl_info.doc_string, + proof=decl.decl_info.proof + ) + + # Process dependencies + for dep in decl.dependencies: + # Get or create ID for the dependency + dep_decl_id = self.get_or_create_decl_id( + name=dep.name, + namespace=dep.namespace, + file_path=dep.file_path, + module_name=dep.module_name + ) + + # Propagate the decl_id back to the dependency object + dep.decl_id = dep_decl_id + + # Insert dependency edge + self.insert_dependency_edge(decl_id, dep_decl_id) + + return decl_id + + def get_or_create_file(self, file_path: str, module_name: str) -> int: + """ + Get or create a file record. + + Args: + file_path: The file path + module_name: The module name + + Returns: + The file_id (existing or newly created) + """ + cursor = self.conn.cursor() + + cursor.execute("SELECT id FROM files WHERE file_path = ?", (file_path,)) + row = cursor.fetchone() + if row: + return row[0] + + cursor.execute(""" + INSERT INTO files (file_path, module_name) + VALUES (?, ?) + """, (file_path, module_name)) + self.conn.commit() + + file_id = cursor.lastrowid + if file_id is None: + raise RuntimeError("Failed to get file_id after insert") + return file_id + + def insert_file_imports(self, file_path: str, module_name: str, imports: List[Dict]): + """ + Insert imports for a file. + + Args: + file_path: The file path + module_name: The module name + imports: List of import dictionaries + """ + file_id = self.get_or_create_file(file_path, module_name) + cursor = self.conn.cursor() + + for import_data in imports: + cursor.execute(""" + INSERT INTO imports (file_id, end_pos, module_name, start_pos, text) + VALUES (?, ?, ?, ?, ?) 
+ """, (file_id, import_data.get('end_pos'), import_data.get('module_name'), + import_data.get('start_pos'), import_data.get('text'))) + + self.conn.commit() + + # Query methods + + def get_declaration_by_decl_id(self, decl_id: str) -> Optional[Dict[str, Any]]: + """ + Get a declaration by its decl_id. + + Args: + decl_id: The unique declaration ID + + Returns: + Dictionary containing declaration information or None + """ + cursor = self.conn.cursor() + cursor.execute("SELECT * FROM declarations WHERE decl_id = ?", (decl_id,)) + row = cursor.fetchone() + return dict(row) if row else None + + def get_declarations_by_name( + self, + name: str, + namespace: Optional[str] = None + ) -> List[Dict[str, Any]]: + """ + Get declarations by name and optionally namespace. + + Args: + name: The declaration name + namespace: Optional namespace to filter by + + Returns: + List of dictionaries containing declaration information + """ + cursor = self.conn.cursor() + + if namespace is not None: + cursor.execute(""" + SELECT * FROM declarations + WHERE name = ? AND namespace = ? + """, (name, namespace)) + else: + cursor.execute(""" + SELECT * FROM declarations + WHERE name = ? + """, (name,)) + + return [dict(row) for row in cursor.fetchall()] + + def get_declarations_by_file(self, file_path: str) -> List[Dict[str, Any]]: + """ + Get all declarations in a specific file. + + Args: + file_path: The file path + + Returns: + List of dictionaries containing declaration information + """ + cursor = self.conn.cursor() + cursor.execute(""" + SELECT * FROM declarations + WHERE file_path = ? + ORDER BY line + """, (file_path,)) + + return [dict(row) for row in cursor.fetchall()] + + def get_declarations_by_module(self, module_name: str) -> List[Dict[str, Any]]: + """ + Get all declarations in a specific module. + + Args: + module_name: The module name + + Returns: + List of dictionaries containing declaration information + """ + cursor = self.conn.cursor() + cursor.execute(""" + SELECT * FROM declarations + WHERE module_name = ? + """, (module_name,)) + + return [dict(row) for row in cursor.fetchall()] + + def get_dependencies(self, decl_id: str) -> List[Dict[str, Any]]: + """ + Get all declarations that this declaration depends on. + + Args: + decl_id: The declaration ID + + Returns: + List of declarations that this declaration depends on + """ + cursor = self.conn.cursor() + cursor.execute(""" + SELECT d.* FROM declarations d + JOIN declaration_dependencies dd ON dd.to_decl_id = d.decl_id + WHERE dd.from_decl_id = ? + """, (decl_id,)) + + return [dict(row) for row in cursor.fetchall()] + + def get_dependents(self, decl_id: str) -> List[Dict[str, Any]]: + """ + Get all declarations that depend on this declaration. + + Args: + decl_id: The declaration ID + + Returns: + List of declarations that depend on this one + """ + cursor = self.conn.cursor() + cursor.execute(""" + SELECT d.* FROM declarations d + JOIN declaration_dependencies dd ON dd.from_decl_id = d.decl_id + WHERE dd.to_decl_id = ? + """, (decl_id,)) + + return [dict(row) for row in cursor.fetchall()] + + def get_dependency_graph( + self, + decl_id: str, + max_depth: int = 10, + direction: str = 'dependencies' + ) -> Dict[str, Any]: + """ + Get the dependency graph for a declaration (recursive). 
+ + Args: + decl_id: The declaration ID to start from + max_depth: Maximum depth to traverse (default 10) + direction: 'dependencies' (what this depends on) or 'dependents' (what depends on this) + + Returns: + Dictionary containing the dependency graph + """ + visited = set() + + def _get_graph_recursive(current_decl_id: str, depth: int) -> Optional[Dict[str, Any]]: + if depth > max_depth or current_decl_id in visited: + return None + + visited.add(current_decl_id) + + decl = self.get_declaration_by_decl_id(current_decl_id) + if not decl: + return None + + if direction == 'dependencies': + related = self.get_dependencies(current_decl_id) + else: + related = self.get_dependents(current_decl_id) + + result = { + 'declaration': decl, + direction: [] + } + + for rel_decl in related: + sub_graph = _get_graph_recursive(rel_decl['decl_id'], depth + 1) + if sub_graph: + result[direction].append(sub_graph) + + return result + + graph = _get_graph_recursive(decl_id, 0) + return graph if graph else {} + + def search_declarations( + self, + name: Optional[str] = None, + namespace: Optional[str] = None, + file_path: Optional[str] = None, + module_name: Optional[str] = None, + decl_type: Optional[str] = None + ) -> List[Dict[str, Any]]: + """ + Search declarations with multiple optional filters. + + Args: + name: Declaration name (optional) + namespace: Namespace (optional) + file_path: File path (optional) + module_name: Module name (optional) + decl_type: Declaration type (optional) + + Returns: + List of matching declarations + """ + cursor = self.conn.cursor() + + query = "SELECT * FROM declarations WHERE 1=1" + params = [] + + if name is not None: + query += " AND name = ?" + params.append(name) + + if namespace is not None: + query += " AND namespace = ?" + params.append(namespace) + + if file_path is not None: + query += " AND file_path = ?" + params.append(file_path) + + if module_name is not None: + query += " AND module_name = ?" + params.append(module_name) + + if decl_type is not None: + query += " AND decl_type = ?" + params.append(decl_type) + + cursor.execute(query, params) + return [dict(row) for row in cursor.fetchall()] + + def get_statistics(self) -> Dict[str, int]: + """ + Get database statistics. 
+
+        Returns:
+            Dictionary with counts of files, declarations, and dependencies
+        """
+        cursor = self.conn.cursor()
+
+        stats = {}
+
+        cursor.execute("SELECT COUNT(*) FROM files")
+        stats['total_files'] = cursor.fetchone()[0]
+
+        cursor.execute("SELECT COUNT(*) FROM declarations")
+        stats['total_declarations'] = cursor.fetchone()[0]
+
+        cursor.execute("SELECT COUNT(*) FROM declaration_dependencies")
+        stats['total_dependencies'] = cursor.fetchone()[0]
+
+        cursor.execute("SELECT COUNT(*) FROM declarations WHERE file_path IS NULL")
+        stats['unresolved_declarations'] = cursor.fetchone()[0]
+
+        cursor.execute("SELECT COUNT(*) FROM imports")
+        stats['total_imports'] = cursor.fetchone()[0]
+
+        return stats
+
+    def close(self):
+        """Close the database connection."""
+        if self.conn:
+            self.conn.close()
+
+    def __enter__(self):
+        """Context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit."""
+        self.close()
+        return False
+
+
+if __name__ == "__main__":
+    # Example usage
+    print("Creating test database...")
+
+    with LeanDeclarationDB("test_lean_declarations.db") as db:
+        # Test with file and imports
+        print("\nTesting file and imports insertion:")
+        test_imports = [
+            {
+                "end_pos": 292,
+                "module_name": "Mathlib.Algebra.Group.Basic",
+                "start_pos": 258,
+                "text": "import Mathlib.Algebra.Group.Basic"
+            },
+            {
+                "end_pos": 321,
+                "module_name": "Mathlib.Tactic.Common",
+                "start_pos": 293,
+                "text": "import Mathlib.Tactic.Common"
+            }
+        ]
+
+        db.insert_file_imports(
+            file_path="Mathlib/Algebra/Divisibility/Basic.lean",
+            module_name="Mathlib.Algebra.Divisibility.Basic",
+            imports=test_imports
+        )
+        print("Inserted file and imports")
+
+        # Test ID generation
+        print("\nTesting ID generation:")
+        decl_id1 = db.get_or_create_decl_id(
+            name="dvd_trans",
+            namespace=None,
+            file_path="Mathlib/Algebra/Divisibility/Basic.lean",
+            module_name="Mathlib.Algebra.Divisibility.Basic"
+        )
+        print(f"Generated ID for dvd_trans: {decl_id1}")
+
+        # Get same ID again (should return existing)
+        decl_id2 = db.get_or_create_decl_id(
+            name="dvd_trans",
+            namespace=None,
+            file_path="Mathlib/Algebra/Divisibility/Basic.lean",
+            module_name="Mathlib.Algebra.Divisibility.Basic"
+        )
+        print(f"Retrieved ID for dvd_trans: {decl_id2}")
+        print(f"IDs match: {decl_id1 == decl_id2}")
+
+        # Update with full info
+        db.upsert_declaration_full_info(
+            decl_id=decl_id1,
+            name="dvd_trans",
+            namespace=None,
+            file_path="Mathlib/Algebra/Divisibility/Basic.lean",
+            module_name="Mathlib.Algebra.Divisibility.Basic",
+            decl_type="theorem",
+            text="@[trans]\ntheorem dvd_trans : a ∣ b → b ∣ c → a ∣ c",
+            line=63,
+            column=0,
+            end_line=68,
+            end_column=0,
+            doc_string=None,
+            proof="| ⟨d, h₁⟩, ⟨e, h₂⟩ => ⟨d * e, h₁ ▸ h₂.trans <| mul_assoc a d e⟩"
+        )
+        print("Updated declaration with full info")
+
+        # Create a dependency
+        dep_id = db.get_or_create_decl_id(
+            name="mul_assoc",
+            namespace=None,
+            file_path=None,
+            module_name=None
+        )
+        print(f"\nGenerated ID for dependency mul_assoc: {dep_id}")
+
+        # Insert dependency edge
+        db.insert_dependency_edge(decl_id1, dep_id)
+        print("Inserted dependency edge")
+
+        # Query
+        print("\nQuerying declaration:")
+        decl = db.get_declaration_by_decl_id(decl_id1)
+        if decl:
+            print(f"Declaration: {decl['name']} ({decl['decl_type']})")
+        else:
+            print("Declaration not found!")
+
+        print("\nQuerying dependencies:")
+        deps = db.get_dependencies(decl_id1)
+        print(f"Dependencies: {[d['name'] for d in deps]}")
+
+        # Statistics
+        print("\nDatabase 
statistics:") + stats = db.get_statistics() + for key, value in stats.items(): + print(f" {key}: {value}") + + print("\nTest complete!") From 28d9ca1ce1f6fe3866d24e03bc4f212fd1dca1e2 Mon Sep 17 00:00:00 2001 From: Amitayush Thakur Date: Sat, 15 Nov 2025 07:08:51 +0000 Subject: [PATCH 04/25] Added basic app for data exploration. And mathlib data extraction yaml. --- src/app/__init__.py | 0 src/app/data_explorer/README.md | 125 ++++++ src/app/data_explorer/__init__.py | 1 + src/app/data_explorer/db_utils.py | 197 +++++++++ src/app/data_explorer/graph_utils.py | 296 ++++++++++++++ src/app/data_explorer/lean_db_explorer.py | 386 ++++++++++++++++++ .../benchmark/mathlib_benchmark_lean_ext.yaml | 40 ++ src/itp_interface/tools/repl | 2 +- 8 files changed, 1046 insertions(+), 1 deletion(-) create mode 100644 src/app/__init__.py create mode 100644 src/app/data_explorer/README.md create mode 100644 src/app/data_explorer/__init__.py create mode 100644 src/app/data_explorer/db_utils.py create mode 100644 src/app/data_explorer/graph_utils.py create mode 100644 src/app/data_explorer/lean_db_explorer.py create mode 100644 src/itp_interface/main/configs/benchmark/mathlib_benchmark_lean_ext.yaml diff --git a/src/app/__init__.py b/src/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/app/data_explorer/README.md b/src/app/data_explorer/README.md new file mode 100644 index 0000000..744ae1e --- /dev/null +++ b/src/app/data_explorer/README.md @@ -0,0 +1,125 @@ +# Lean Declaration Database Explorer + +A Streamlit web application for exploring and analyzing Lean declaration databases created by the ITP Interface data extraction tool. + +## Features + +- **Custom SQL Queries**: Write and execute custom SQL queries with pre-built query templates +- **Declaration Search**: Search declarations by name, type, file path, and namespace +- **Dependency Explorer**: Visualize dependencies and dependents of declarations +- **Forest Analysis**: Analyze connected components in the dependency graph + +## Installation + +Install the required dependencies using pip with the `app` extra: + +```bash +pip install -e ".[app]" +``` + +This will install: +- streamlit +- plotly +- networkx +- pandas + +## Usage + +1. **Generate a database** using the data extraction transform: + ```python + from itp_interface.tools.lean4_local_data_extraction_transform import Local4DataExtractionTransform + + transform = Local4DataExtractionTransform( + buffer_size=1000, + db_path="lean_declarations.db" + ) + # ... run your extraction ... + ``` + +2. **Launch the explorer**: + ```bash + cd src/app/data_explorer + streamlit run lean_db_explorer.py + ``` + +3. **Load your database**: + - Enter the path to your `.db` file in the sidebar + - Click "Load Database" + - Explore using the tabs! + +## App Tabs + +### 🔍 Custom Query +Write and execute custom SQL queries against the database. Includes pre-built queries for common tasks: +- Show all files +- Show all declarations +- Count declarations by type +- Find most depended-on declarations +- And more... + +### 🔎 Search +Search declarations with filters: +- Name pattern (supports partial matching) +- File path +- Declaration type (theorem, def, axiom, etc.) +- Namespace + +### 🌲 Dependencies +Explore dependency relationships: +- **Show Dependencies**: What does this declaration depend on? +- **Show Dependents**: What depends on this declaration? 
+- **Configurable depth**: Control how deep to traverse +- **Dual view**: Table and interactive graph visualization + +### 🌳 Forests +Analyze connected components in the dependency graph: +- **Find All Forests**: Discover all connected components +- **Statistics**: See forest sizes, roots, and leaves +- **Visualization**: View selected forests as graphs + +## Database Schema + +The app expects a SQLite database with the following tables: + +- `files`: File metadata +- `imports`: Import relationships +- `declarations`: All declarations with full info +- `declaration_dependencies`: Dependency edges (from_decl_id → to_decl_id) + +See `src/itp_interface/tools/simple_sqlite.py` for the complete schema. + +## Tips + +- **Large graphs**: Forest visualization is limited to 100 nodes for performance +- **Query results**: All query results can be downloaded as CSV +- **Unresolved declarations**: Declarations with `file_path IS NULL` are unresolved (from dependencies) + +## Troubleshooting + +**Database not loading?** +- Check the file path is correct +- Ensure the database was created with the correct schema + +**Graph visualization slow?** +- Reduce the max depth for dependency exploration +- Use filters in Search to narrow results + +**Import errors?** +- Ensure you've installed the app dependencies: `pip install -e ".[app]"` +- Run from the correct directory: `cd src/app/data_explorer` + +## Development + +File structure: +``` +src/app/data_explorer/ +├── lean_db_explorer.py # Main Streamlit app +├── db_utils.py # Database query utilities +├── graph_utils.py # Graph analysis and visualization +└── README.md # This file +``` + +To modify: +1. Edit the Python files +2. Streamlit will auto-reload on file changes +3. Refresh browser to see updates diff --git a/src/app/data_explorer/__init__.py b/src/app/data_explorer/__init__.py new file mode 100644 index 0000000..c35c013 --- /dev/null +++ b/src/app/data_explorer/__init__.py @@ -0,0 +1 @@ +"""Lean Declaration Database Explorer package.""" diff --git a/src/app/data_explorer/db_utils.py b/src/app/data_explorer/db_utils.py new file mode 100644 index 0000000..6ab2635 --- /dev/null +++ b/src/app/data_explorer/db_utils.py @@ -0,0 +1,197 @@ +""" +Database utility functions for the Lean Declaration DB Explorer. +""" + +import sys +from pathlib import Path + +# Add parent directories to path to import from itp_interface +root_dir = Path(__file__).parent.parent.parent.parent +if str(root_dir) not in sys.path: + sys.path.insert(0, str(root_dir)) + +import pandas as pd +from typing import Dict, List, Any, Optional +from itp_interface.tools.simple_sqlite import LeanDeclarationDB + + +def execute_query(db: LeanDeclarationDB, query: str) -> pd.DataFrame: + """ + Execute a SQL query and return results as a DataFrame. + + Args: + db: LeanDeclarationDB instance + query: SQL query string + + Returns: + DataFrame with query results + """ + cursor = db.conn.cursor() + cursor.execute(query) + rows = cursor.fetchall() + + if rows: + # Get column names + columns = [description[0] for description in cursor.description] + # Convert to DataFrame + return pd.DataFrame([dict(row) for row in rows], columns=columns) + else: + return pd.DataFrame() + + +def get_common_queries() -> Dict[str, str]: + """ + Return a dictionary of common pre-built queries. 
+ + Returns: + Dict mapping query name to SQL query string + """ + return { + "Show all files": """ + SELECT file_path, module_name + FROM files + ORDER BY file_path + """, + "Show all declarations": """ + SELECT name, namespace, decl_type, file_path, module_name + FROM declarations + WHERE file_path IS NOT NULL + ORDER BY file_path, line + LIMIT 100 + """, + "Count declarations by type": """ + SELECT decl_type, COUNT(*) as count + FROM declarations + WHERE decl_type IS NOT NULL + GROUP BY decl_type + ORDER BY count DESC + """, + "Count declarations by file": """ + SELECT file_path, COUNT(*) as count + FROM declarations + WHERE file_path IS NOT NULL + GROUP BY file_path + ORDER BY count DESC + LIMIT 20 + """, + "Show unresolved declarations": """ + SELECT name, namespace + FROM declarations + WHERE file_path IS NULL + LIMIT 100 + """, + "Show imports for all files": """ + SELECT f.file_path, i.module_name as import_module, i.text + FROM files f + JOIN imports i ON i.file_id = f.id + ORDER BY f.file_path + """, + "Show most depended-on declarations": """ + SELECT d.name, d.namespace, d.file_path, COUNT(*) as dependents_count + FROM declarations d + JOIN declaration_dependencies dd ON dd.to_decl_id = d.decl_id + GROUP BY d.decl_id + ORDER BY dependents_count DESC + LIMIT 20 + """, + "Show declarations with most dependencies": """ + SELECT d.name, d.namespace, d.file_path, COUNT(*) as dependencies_count + FROM declarations d + JOIN declaration_dependencies dd ON dd.from_decl_id = d.decl_id + GROUP BY d.decl_id + ORDER BY dependencies_count DESC + LIMIT 20 + """ + } + + +def search_declarations( + db: LeanDeclarationDB, + name_pattern: Optional[str] = None, + namespace: Optional[str] = None, + file_path: Optional[str] = None, + decl_type: Optional[str] = None +) -> pd.DataFrame: + """ + Search declarations with filters. + + Args: + db: LeanDeclarationDB instance + name_pattern: Name pattern (SQL LIKE syntax) + namespace: Namespace filter + file_path: File path filter + decl_type: Declaration type filter + + Returns: + DataFrame with matching declarations + """ + query = "SELECT * FROM declarations WHERE 1=1" + params = [] + + if name_pattern: + query += " AND name LIKE ?" + params.append(f"%{name_pattern}%") + + if namespace: + query += " AND namespace = ?" + params.append(namespace) + + if file_path: + query += " AND file_path LIKE ?" + params.append(f"%{file_path}%") + + if decl_type: + query += " AND decl_type = ?" + params.append(decl_type) + + query += " LIMIT 1000" + + cursor = db.conn.cursor() + cursor.execute(query, params) + rows = cursor.fetchall() + + if rows: + columns = [description[0] for description in cursor.description] + return pd.DataFrame([dict(row) for row in rows], columns=columns) + else: + return pd.DataFrame() + + +def get_all_declaration_names(db: LeanDeclarationDB) -> List[str]: + """ + Get all unique declaration names for autocomplete. + + Args: + db: LeanDeclarationDB instance + + Returns: + List of declaration names + """ + cursor = db.conn.cursor() + cursor.execute(""" + SELECT DISTINCT name + FROM declarations + WHERE file_path IS NOT NULL + ORDER BY name + """) + return [row[0] for row in cursor.fetchall()] + + +def get_declaration_types(db: LeanDeclarationDB) -> List[str]: + """ + Get all unique declaration types. 
+ + Args: + db: LeanDeclarationDB instance + + Returns: + List of declaration types + """ + cursor = db.conn.cursor() + cursor.execute(""" + SELECT DISTINCT decl_type + FROM declarations + WHERE decl_type IS NOT NULL + ORDER BY decl_type + """) + return [row[0] for row in cursor.fetchall()] diff --git a/src/app/data_explorer/graph_utils.py b/src/app/data_explorer/graph_utils.py new file mode 100644 index 0000000..5d0ed40 --- /dev/null +++ b/src/app/data_explorer/graph_utils.py @@ -0,0 +1,296 @@ +""" +Graph analysis utilities for dependency visualization and forest detection. +""" + +import sys +from pathlib import Path + +# Add parent directories to path +root_dir = Path(__file__).parent.parent.parent.parent +if str(root_dir) not in sys.path: + sys.path.insert(0, str(root_dir)) + +import networkx as nx +import plotly.graph_objects as go +from typing import List, Set, Dict, Tuple, Any +from itp_interface.tools.simple_sqlite import LeanDeclarationDB + + +def build_dependency_graph(db: LeanDeclarationDB) -> nx.DiGraph: + """ + Build a directed graph from the dependency relationships. + + Args: + db: LeanDeclarationDB instance + + Returns: + NetworkX directed graph where edges point from dependent to dependency + (A -> B means A depends on B) + """ + G = nx.DiGraph() + + # Add all declarations as nodes + cursor = db.conn.cursor() + cursor.execute(""" + SELECT decl_id, name, namespace, decl_type, file_path + FROM declarations + """) + + for row in cursor.fetchall(): + decl_id, name, namespace, decl_type, file_path = row + full_name = f"{namespace}.{name}" if namespace else name + G.add_node(decl_id, name=name, full_name=full_name, + namespace=namespace, decl_type=decl_type, + file_path=file_path) + + # Add all dependency edges + cursor.execute(""" + SELECT from_decl_id, to_decl_id + FROM declaration_dependencies + """) + + for from_id, to_id in cursor.fetchall(): + G.add_edge(from_id, to_id) + + return G + + +def find_all_forests(G: nx.DiGraph) -> List[Set[str]]: + """ + Find all connected components (forests) in the graph. + + Args: + G: NetworkX directed graph + + Returns: + List of sets, where each set contains decl_ids in a connected component + """ + # Convert to undirected for finding connected components + G_undirected = G.to_undirected() + components = list(nx.connected_components(G_undirected)) + + # Sort by size (largest first) + components.sort(key=len, reverse=True) + + return components + + +def get_dependencies_closure( + db: LeanDeclarationDB, + decl_id: str, + max_depth: int = 10 +) -> Tuple[List[Dict[str, Any]], nx.DiGraph]: + """ + Get all dependencies (transitive) of a declaration. 
+ + Args: + db: LeanDeclarationDB instance + decl_id: Starting declaration ID + max_depth: Maximum depth to traverse + + Returns: + Tuple of (list of declaration dicts, subgraph) + """ + G = build_dependency_graph(db) + + if decl_id not in G: + return [], nx.DiGraph() + + # Get all reachable nodes (BFS limited by depth) + visited = set() + queue = [(decl_id, 0)] + visited.add(decl_id) + + while queue: + current, depth = queue.pop(0) + if depth >= max_depth: + continue + + for neighbor in G.successors(current): + if neighbor not in visited: + visited.add(neighbor) + queue.append((neighbor, depth + 1)) + + # Create subgraph + subgraph = G.subgraph(visited).copy() + + # Get declaration info for all nodes + decls = [] + for node_id in visited: + decl = db.get_declaration_by_decl_id(node_id) + if decl: + decls.append(decl) + + return decls, subgraph + + +def get_dependents_closure( + db: LeanDeclarationDB, + decl_id: str, + max_depth: int = 10 +) -> Tuple[List[Dict[str, Any]], nx.DiGraph]: + """ + Get all dependents (transitive) of a declaration. + + Args: + db: LeanDeclarationDB instance + decl_id: Starting declaration ID + max_depth: Maximum depth to traverse + + Returns: + Tuple of (list of declaration dicts, subgraph) + """ + G = build_dependency_graph(db) + + if decl_id not in G: + return [], nx.DiGraph() + + # Get all nodes that can reach this node (reverse BFS) + visited = set() + queue = [(decl_id, 0)] + visited.add(decl_id) + + while queue: + current, depth = queue.pop(0) + if depth >= max_depth: + continue + + for neighbor in G.predecessors(current): + if neighbor not in visited: + visited.add(neighbor) + queue.append((neighbor, depth + 1)) + + # Create subgraph + subgraph = G.subgraph(visited).copy() + + # Get declaration info for all nodes + decls = [] + for node_id in visited: + decl = db.get_declaration_by_decl_id(node_id) + if decl: + decls.append(decl) + + return decls, subgraph + + +def visualize_graph(G: nx.DiGraph, title: str = "Dependency Graph") -> go.Figure: + """ + Create an interactive Plotly visualization of the graph. + + Args: + G: NetworkX directed graph + title: Title for the plot + + Returns: + Plotly Figure object + """ + if len(G.nodes()) == 0: + # Empty graph + fig = go.Figure() + fig.update_layout(title=title, annotations=[ + dict(text="No dependencies found", showarrow=False, + xref="paper", yref="paper", x=0.5, y=0.5) + ]) + return fig + + # Use spring layout for positioning + pos = nx.spring_layout(G, k=1, iterations=50) + + # Create edge traces + edge_x = [] + edge_y = [] + for edge in G.edges(): + x0, y0 = pos[edge[0]] + x1, y1 = pos[edge[1]] + edge_x.extend([x0, x1, None]) + edge_y.extend([y0, y1, None]) + + edge_trace = go.Scatter( + x=edge_x, y=edge_y, + line=dict(width=0.5, color='#888'), + hoverinfo='none', + mode='lines') + + # Create node traces + node_x = [] + node_y = [] + node_text = [] + node_color = [] + + # Color map for declaration types + type_colors = { + 'theorem': '#FF6B6B', + 'def': '#4ECDC4', + 'axiom': '#45B7D1', + 'instance': '#FFA07A', + 'class': '#98D8C8', + None: '#95A5A6' + } + + for node in G.nodes(): + x, y = pos[node] + node_x.append(x) + node_y.append(y) + + # Get node info + node_data = G.nodes[node] + name = node_data.get('full_name', node_data.get('name', 'Unknown')) + decl_type = node_data.get('decl_type') + file_path = node_data.get('file_path', 'Unknown') + + node_text.append(f"{name}
<br>Type: {decl_type}<br>
File: {file_path}") + node_color.append(type_colors.get(decl_type, type_colors[None])) + + node_trace = go.Scatter( + x=node_x, y=node_y, + mode='markers', + hoverinfo='text', + text=node_text, + marker=dict( + showscale=False, + color=node_color, + size=10, + line_width=2)) + + # Create figure + fig = go.Figure(data=[edge_trace, node_trace], + layout=go.Layout( + title=title, + titlefont_size=16, + showlegend=False, + hovermode='closest', + margin=dict(b=0, l=0, r=0, t=40), + xaxis=dict(showgrid=False, zeroline=False, showticklabels=False), + yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)) + ) + + return fig + + +def get_forest_statistics(G: nx.DiGraph, forest: Set[str]) -> Dict[str, Any]: + """ + Get statistics for a forest (connected component). + + Args: + G: NetworkX directed graph + forest: Set of decl_ids in the forest + + Returns: + Dictionary with forest statistics + """ + subgraph = G.subgraph(forest) + + # Find root nodes (nodes with no incoming edges from within forest) + roots = [node for node in forest if subgraph.in_degree(node) == 0] + + # Find leaf nodes (nodes with no outgoing edges from within forest) + leaves = [node for node in forest if subgraph.out_degree(node) == 0] + + return { + 'size': len(forest), + 'num_edges': subgraph.number_of_edges(), + 'num_roots': len(roots), + 'num_leaves': len(leaves), + 'roots': [G.nodes[r].get('full_name', r) for r in roots[:5]], # First 5 roots + 'leaves': [G.nodes[l].get('full_name', l) for l in leaves[:5]] # First 5 leaves + } diff --git a/src/app/data_explorer/lean_db_explorer.py b/src/app/data_explorer/lean_db_explorer.py new file mode 100644 index 0000000..9c9eae4 --- /dev/null +++ b/src/app/data_explorer/lean_db_explorer.py @@ -0,0 +1,386 @@ +""" +Lean Declaration Database Explorer + +A Streamlit web app for exploring and analyzing Lean declaration databases. 
+""" + +import sys +from pathlib import Path + +# Add parent directories to path +root_dir = Path(__file__).parent.parent.parent.parent +if str(root_dir) not in sys.path: + sys.path.insert(0, str(root_dir)) + +import streamlit as st +import pandas as pd +from itp_interface.tools.simple_sqlite import LeanDeclarationDB +import db_utils +import graph_utils + + +# Page configuration +st.set_page_config( + page_title="Lean Declaration DB Explorer", + page_icon="📊", + layout="wide", + initial_sidebar_state="expanded" +) + + +def load_database(db_path: str) -> LeanDeclarationDB: + """Load database and cache in session state.""" + try: + db = LeanDeclarationDB(db_path) + return db + except Exception as e: + st.error(f"Failed to load database: {e}") + return None + + +def display_statistics(db: LeanDeclarationDB): + """Display database statistics in the sidebar.""" + stats = db.get_statistics() + + st.sidebar.markdown("### 📊 Database Statistics") + col1, col2 = st.sidebar.columns(2) + + with col1: + st.metric("Files", stats.get('total_files', 0)) + st.metric("Declarations", stats.get('total_declarations', 0)) + + with col2: + st.metric("Dependencies", stats.get('total_dependencies', 0)) + st.metric("Imports", stats.get('total_imports', 0)) + + st.sidebar.metric("Unresolved", stats.get('unresolved_declarations', 0)) + + +def tab_custom_query(db: LeanDeclarationDB): + """Custom Query tab interface.""" + st.header("🔍 Custom SQL Query") + + # Example queries dropdown + st.markdown("**Pre-built Queries:**") + common_queries = db_utils.get_common_queries() + query_name = st.selectbox( + "Select a query to load", + [""] + list(common_queries.keys()), + label_visibility="collapsed" + ) + + # Query text area - use query_name as part of key to force refresh + if query_name: + default_query = common_queries.get(query_name) + else: + default_query = "SELECT * FROM declarations LIMIT 10" + + query = st.text_area( + "SQL Query", + value=default_query, + height=150, + key=f"custom_query_{query_name}" + ) + + col1, col2 = st.columns([1, 5]) + with col1: + execute_button = st.button("Execute Query", type="primary") + + if execute_button: + try: + with st.spinner("Executing query..."): + df = db_utils.execute_query(db, query) + + if not df.empty: + st.success(f"Query returned {len(df)} rows") + st.dataframe(df, use_container_width=True, height=500) + + # Download button + csv = df.to_csv(index=False) + st.download_button( + label="Download CSV", + data=csv, + file_name="query_results.csv", + mime="text/csv" + ) + else: + st.info("Query returned no results") + + except Exception as e: + st.error(f"Query error: {e}") + + +def tab_search(db: LeanDeclarationDB): + """Search & Browse tab interface.""" + st.header("🔎 Search Declarations") + + col1, col2 = st.columns(2) + + with col1: + name_pattern = st.text_input("Name pattern", placeholder="e.g., add, mul, theorem") + file_path = st.text_input("File path contains", placeholder="e.g., Mathlib/Data") + + with col2: + decl_types = [""] + db_utils.get_declaration_types(db) + decl_type = st.selectbox("Declaration type", decl_types) + + namespace = st.text_input("Namespace", placeholder="e.g., Nat, List") + + if st.button("Search", type="primary"): + with st.spinner("Searching..."): + df = db_utils.search_declarations( + db, + name_pattern=name_pattern if name_pattern else None, + namespace=namespace if namespace else None, + file_path=file_path if file_path else None, + decl_type=decl_type if decl_type else None + ) + + if not df.empty: + st.success(f"Found {len(df)} 
declarations") + + # Show results + st.dataframe(df, use_container_width=True, height=500) + + # Download button + csv = df.to_csv(index=False) + st.download_button( + label="Download Results", + data=csv, + file_name="search_results.csv", + mime="text/csv" + ) + else: + st.info("No declarations found matching the criteria") + + +def tab_dependencies(db: LeanDeclarationDB): + """Dependency Explorer tab interface.""" + st.header("🌲 Dependency Explorer") + + # Declaration selector + st.markdown("**Select a declaration:**") + decl_name = st.text_input("Declaration name", placeholder="e.g., dvd_trans, Nat.add") + + col1, col2, col3 = st.columns(3) + + with col1: + show_deps = st.button("Show Dependencies", type="primary") + with col2: + show_dependents = st.button("Show Dependents", type="primary") + with col3: + max_depth = st.number_input("Max depth", min_value=1, max_value=20, value=5) + + if show_deps or show_dependents: + if not decl_name: + st.warning("Please enter a declaration name") + return + + # Find declaration by name + search_df = db_utils.search_declarations(db, name_pattern=decl_name) + + if search_df.empty: + st.error(f"Declaration '{decl_name}' not found") + return + + if len(search_df) > 1: + st.warning(f"Found {len(search_df)} declarations with this name. Using the first one.") + st.dataframe(search_df[['name', 'namespace', 'file_path', 'decl_type']]) + + decl_id = search_df.iloc[0]['decl_id'] + + with st.spinner("Analyzing dependencies..."): + if show_deps: + decls, subgraph = graph_utils.get_dependencies_closure(db, decl_id, max_depth) + title = f"Dependencies of {decl_name}" + else: + decls, subgraph = graph_utils.get_dependents_closure(db, decl_id, max_depth) + title = f"Dependents of {decl_name}" + + if decls: + st.success(f"Found {len(decls)} related declarations") + + # Show tabs for table and graph views + tab1, tab2 = st.tabs(["📊 Table View", "📈 Graph View"]) + + with tab1: + df = pd.DataFrame(decls) + st.dataframe(df[['name', 'namespace', 'decl_type', 'file_path', 'line']], + use_container_width=True, height=400) + + with tab2: + fig = graph_utils.visualize_graph(subgraph, title) + st.plotly_chart(fig, use_container_width=True) + else: + st.info("No dependencies found") + + +def tab_forests(db: LeanDeclarationDB): + """Forest Analysis tab interface.""" + st.header("🌳 Forest Analysis") + + st.markdown(""" + A **forest** is a connected component in the dependency graph. Declarations in the same + forest are connected through dependency relationships. 
+ """) + + # Initialize session state for forests + if 'forests' not in st.session_state: + st.session_state.forests = None + st.session_state.forest_graph = None + + if st.button("Find All Forests", type="primary"): + with st.spinner("Analyzing dependency graph..."): + G = graph_utils.build_dependency_graph(db) + forests = graph_utils.find_all_forests(G) + # Store in session state + st.session_state.forests = forests + st.session_state.forest_graph = G + + st.success(f"Found {len(forests)} forests") + + # Use forests from session state + forests = st.session_state.forests + G = st.session_state.forest_graph + + if forests is not None: + # Display statistics + st.markdown("### Forest Statistics") + + forest_data = [] + for i, forest in enumerate(forests[:20]): # Limit to top 20 + stats = graph_utils.get_forest_statistics(G, forest) + forest_data.append({ + 'Forest ID': i + 1, + 'Size': stats['size'], + 'Edges': stats['num_edges'], + 'Roots': stats['num_roots'], + 'Leaves': stats['num_leaves'], + 'Sample Roots': ', '.join(stats['roots'][:3]) + }) + + df = pd.DataFrame(forest_data) + st.dataframe(df, use_container_width=True) + + # Show declarations in selected forest + st.markdown("### View Forest Declarations") + forest_id = st.number_input( + "Forest ID to view", + min_value=1, + max_value=len(forests), + value=1 + ) + + if st.button("Show Declarations in Forest"): + forest = forests[forest_id - 1] + st.info(f"Forest #{forest_id} contains {len(forest)} declarations") + + # Get all declarations in this forest + forest_decls = [] + for decl_id in forest: + decl = db.get_declaration_by_decl_id(decl_id) + if decl: + # Truncate text and proof for display + text = decl.get('text', '') + proof = decl.get('proof', '') + forest_decls.append({ + 'decl_id': decl['decl_id'], + 'name': decl['name'], + 'namespace': decl.get('namespace', ''), + 'decl_type': decl.get('decl_type', ''), + 'text': text[:100] + '...' if text and len(text) > 100 else text, + 'proof': proof[:100] + '...' 
if proof and len(proof) > 100 else proof, + 'file_path': decl.get('file_path', ''), + 'line': decl.get('line', '') + }) + + forest_df = pd.DataFrame(forest_decls) + st.dataframe(forest_df, use_container_width=True, height=500) + + # Download button + csv = forest_df.to_csv(index=False) + st.download_button( + label="Download Forest Declarations", + data=csv, + file_name=f"forest_{forest_id}_declarations.csv", + mime="text/csv" + ) + else: + st.info("Click 'Find All Forests' to analyze the dependency graph") + + +def main(): + """Main application.""" + st.title("📊 Lean Declaration Database Explorer") + + # Sidebar + st.sidebar.title("Database Connection") + + # Database path input + db_path = st.sidebar.text_input( + "Database Path", + value="lean_declarations.db", + help="Path to the SQLite database file" + ) + + load_button = st.sidebar.button("Load Database", type="primary") + + # Initialize session state + if 'db' not in st.session_state: + st.session_state.db = None + + if load_button: + if Path(db_path).exists(): + with st.spinner("Loading database..."): + st.session_state.db = load_database(db_path) + + if st.session_state.db: + st.sidebar.success("Database loaded successfully!") + else: + st.sidebar.error(f"Database file not found: {db_path}") + + # Show statistics if database is loaded + if st.session_state.db: + display_statistics(st.session_state.db) + + # Main content tabs + tab1, tab2, tab3, tab4 = st.tabs([ + "🔍 Custom Query", + "🔎 Search", + "🌲 Dependencies", + "🌳 Forests" + ]) + + with tab1: + tab_custom_query(st.session_state.db) + + with tab2: + tab_search(st.session_state.db) + + with tab3: + tab_dependencies(st.session_state.db) + + with tab4: + tab_forests(st.session_state.db) + + else: + st.info("👈 Please load a database to get started") + + st.markdown(""" + ### Getting Started + + 1. Enter the path to your SQLite database file in the sidebar + 2. Click "Load Database" + 3. Explore your data using the tabs above + + ### Features + + - **Custom Query**: Write and execute custom SQL queries + - **Search**: Search declarations by name, type, file, etc. 
+ - **Dependencies**: Explore dependency relationships + - **Forests**: Analyze connected components in the dependency graph + """) + + +if __name__ == "__main__": + main() diff --git a/src/itp_interface/main/configs/benchmark/mathlib_benchmark_lean_ext.yaml b/src/itp_interface/main/configs/benchmark/mathlib_benchmark_lean_ext.yaml new file mode 100644 index 0000000..b2bb43b --- /dev/null +++ b/src/itp_interface/main/configs/benchmark/mathlib_benchmark_lean_ext.yaml @@ -0,0 +1,40 @@ +name: mathlib_benchmark_lean_ext +num_files: 1 +language: LEAN4 +few_shot_data_path_for_retrieval: +few_shot_metadata_filename_for_retrieval: +dfs_data_path_for_retrieval: +dfs_metadata_filename_for_retrieval: +is_extraction_request: true +datasets: + - project: src/data/test/Mathlib + files: [] + exclude_files: + - src/data/test/Mathlib/.lake/packages/aesop + - src/data/test/Mathlib/.lake/packages/batteries + - src/data/test/Mathlib/.lake/packages/Cli + - src/data/test/Mathlib/.lake/packages/importGraph + - src/data/test/Mathlib/.lake/packages/LeanSearchClient + - src/data/test/Mathlib/.lake/packages/plausible + - src/data/test/Mathlib/.lake/packages/proofwidgets + - src/data/test/Mathlib/.lake/packages/Qq + - src/data/test/Mathlib/.lake/packages/mathlib/.devcontainer + - src/data/test/Mathlib/.lake/packages/mathlib/.github + - src/data/test/Mathlib/.lake/packages/mathlib/.docker + - src/data/test/Mathlib/.lake/packages/mathlib/.lake + - src/data/test/Mathlib/.lake/packages/mathlib/.vscode + - src/data/test/Mathlib/.lake/packages/mathlib/Archive + - src/data/test/Mathlib/.lake/packages/mathlib/Cache + - src/data/test/Mathlib/.lake/packages/mathlib/docs + - src/data/test/Mathlib/.lake/packages/mathlib/Counterexamples + - src/data/test/Mathlib/.lake/packages/mathlib/DownstreamTest + - src/data/test/Mathlib/.lake/packages/mathlib/LongestPole + - src/data/test/Mathlib/.lake/packages/mathlib/MathlibTest + - src/data/test/Mathlib/.lake/packages/mathlib/scripts + - src/data/test/Mathlib/.lake/packages/mathlib/widget + - src/data/test/Mathlib/.lake/packages/mathlib/Counterexamples.lean + - src/data/test/Mathlib/.lake/packages/mathlib/lakefile.lean + - src/data/test/Mathlib/.lake/packages/mathlib/Mathlib.lean + - src/data/test/Mathlib/.lake/packages/mathlib/Archive.lean + - src/data/test/Mathlib/lakefile.lean + - src/data/test/Mathlib/ReplMathlibTests.lean diff --git a/src/itp_interface/tools/repl b/src/itp_interface/tools/repl index 8fff855..ebeaf40 160000 --- a/src/itp_interface/tools/repl +++ b/src/itp_interface/tools/repl @@ -1 +1 @@ -Subproject commit 8fff8552292860d349b459d6a811e6915671dc0d +Subproject commit ebeaf409fc93d64d5a52652c87e74bfb64805a66 From 4ef1371d4f4dc36bb9868b9fb26577b2a3481b0e Mon Sep 17 00:00:00 2001 From: Amitayush Thakur Date: Mon, 17 Nov 2025 01:20:31 +0000 Subject: [PATCH 05/25] Made tactic parsing more efficient. 
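
The speedup comes from threading the checkpointed `Command.State` that `parseTactics` now returns: `extractProofsFromDecls` parses only the text between consecutive proof declarations and carries the resulting state forward, instead of re-elaborating the whole file prefix for every theorem, and a separate `tactic-extractor` executable is added so tactic extraction can run on its own. The incremental pattern is roughly the sketch below — a minimal illustration, not code from this patch; `parseSegmentsIncrementally` is a hypothetical name, while `parseTactics`, `CheckpointedParseResult`, and `ParseResult` are the entry points from `TacticParser.SyntaxWalker`:

```lean
import TacticParser.SyntaxWalker
open Lean Elab
namespace TacticParser

/-- Sketch only: parse `segments` in order, threading the checkpointed
    `Command.State` from `parseTactics` so earlier text is elaborated once. -/
unsafe def parseSegmentsIncrementally (segments : Array String) :
    IO (Array ParseResult) := do
  let mut cmdState : Option Command.State := none
  let mut results : Array ParseResult := #[]
  for seg in segments do
    -- `chkptState` of the returned CheckpointedParseResult seeds the next call.
    let chkpt ← parseTactics seg none cmdState
    cmdState := chkpt.chkptState
    results := results.push chkpt.parseResult
  return results

end TacticParser
```
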
--- .../tools/tactic_parser/TacticParser.lean | 1 + .../TacticParser/ProofExtractor.lean | 156 ++++++++++++------ .../TacticParser/SyntaxWalker.lean | 5 + .../TacticParser/TacticExtractorMain.lean | 52 ++++++ .../tools/tactic_parser/lakefile.toml | 7 +- 5 files changed, 170 insertions(+), 51 deletions(-) create mode 100644 src/itp_interface/tools/tactic_parser/TacticParser/TacticExtractorMain.lean diff --git a/src/itp_interface/tools/tactic_parser/TacticParser.lean b/src/itp_interface/tools/tactic_parser/TacticParser.lean index b32baf9..762a727 100644 --- a/src/itp_interface/tools/tactic_parser/TacticParser.lean +++ b/src/itp_interface/tools/tactic_parser/TacticParser.lean @@ -2,4 +2,5 @@ import TacticParser.Base64 import TacticParser.Types import TacticParser.SyntaxWalker import TacticParser.Main +import TacticParser.SyntaxWalkerMain import TacticParser.Example.simple diff --git a/src/itp_interface/tools/tactic_parser/TacticParser/ProofExtractor.lean b/src/itp_interface/tools/tactic_parser/TacticParser/ProofExtractor.lean index e363bfe..efa21b9 100644 --- a/src/itp_interface/tools/tactic_parser/TacticParser/ProofExtractor.lean +++ b/src/itp_interface/tools/tactic_parser/TacticParser/ProofExtractor.lean @@ -23,26 +23,11 @@ instance : Ord DelimiterCandidate where compare a b := compare a.position b.position /-- Try to parse a text snippet and return true if it parses without errors -/ -unsafe def tryParseSuccessfully (text : String) (cmdState? : Option Command.State): IO Bool := do - -- IO.println s!"Trying to parse text:\n{text}\n---" - -- IO.println s!"With cmdState?: {cmdState?.isSome}" - let (_, _, messages, _) ← try - IO.processInput text cmdState? Options.empty - catch e => - -- IO.println s!"Parsing exception: {e}" - return false - - let mut errorInfos : Array ErrorInfo := #[] - for msg in messages do - if msg.severity == .error then - let msgPos := Position.mk msg.pos.line msg.pos.column - let errorInfo := ErrorInfo.mk (← msg.data.toString) msgPos - errorInfos := errorInfos.push errorInfo - -- IO.println s!"Parsing error: {← msg.data.toString} at {msg.pos}" - if errorInfos.size > 0 then - return false - else - return true +def tryParseSuccessfully (text : String) (cmdState : Command.State): IO Bool := do + let chkpt_result ← parseTactics text none (some cmdState) + let parse_res := chkpt_result.1 + let new_cmd_state := chkpt_result.2 + pure (parse_res.errors.size == 0 ∧ new_cmd_state.isSome) /-- Check if a substring starts with a given substring at a specific position -/ def substringStartsWith (text : String) (startPos : Nat) (substr : String) : Bool := @@ -73,44 +58,42 @@ def findCandidateDelimiters (text : String) : List DelimiterCandidate := { position := pos, delimiterType := "where" } let pipeCandidates := pipePositions.map fun pos => { position := pos, delimiterType := "|" } + let assignCandidatesSorted := assignCandidates.toArray.qsort (fun a b => a.position < b.position) + let whereCandidatesSorted := whereCandidates.toArray.qsort (fun a b => a.position < b.position) + let pipeCandidatesSorted := pipeCandidates.toArray.qsort (fun a b => a.position < b.position) + + let allCandidates := assignCandidatesSorted.toList ++ whereCandidatesSorted.toList ++ pipeCandidatesSorted.toList + allCandidates + +def is_proof_extraction_needed (declInfo : DeclInfo) : Bool := + declInfo.declType == DeclType.theorem || + declInfo.declType == DeclType.lemma || + declInfo.declType == DeclType.example - let allCandidates := assignCandidates ++ whereCandidates ++ pipeCandidates - 
allCandidates.toArray.qsort (fun a b => a.position < b.position) |>.toList +def get_content_before_decl (fileContent : String) (declLineNum : Nat) : String := + let lines := fileContent.splitOn "\n" + let beforeLines := lines.take (declLineNum - 1) + (String.intercalate "\n" beforeLines) ++ "\n" + +def get_in_between_content (fileContent : String) (startLine : Nat) (endLine : Nat) : String := + let lines := fileContent.splitOn "\n" + let betweenLines := (lines.take endLine).drop (startLine - 1) + String.intercalate "\n" betweenLines /-- Extract proof from a declaration by testing candidate delimiters -/ -unsafe def extractProofFromDecl (declInfo : DeclInfo) (fileContent : String) : IO DeclInfo := do +unsafe def extractProofFromDecl + (declInfo : DeclInfo) + (cmdState : Command.State) : IO DeclInfo := do -- Only process theorem, lemma, example - if declInfo.declType != DeclType.theorem && - declInfo.declType != DeclType.lemma && - declInfo.declType != DeclType.example then - return declInfo + if !is_proof_extraction_needed declInfo then + panic! s!"extractProofFromDecl called on non-proof decl: {declInfo.declType}" let text := declInfo.text - -- Extract context before this declaration (imports + previous declarations) -- Convert Position to byte offset - let mut contextBeforeDecl := - let lines := fileContent.splitOn "\n" - let beforeLines := lines.take (declInfo.startPos.line - 1) - String.intercalate "\n" beforeLines - contextBeforeDecl := contextBeforeDecl ++ "\n" - - -- IO.println s!"contextBeforeDecl:\n{contextBeforeDecl}\n---" - - let mut cmdState? : Option Command.State := none - try - let (initialCmdState, cmdState, messages, trees) ← IO.processInput contextBeforeDecl none Options.empty - cmdState? := some cmdState - catch e => - cmdState? := none - - -- IO.println s!"cmdState?: {cmdState?.isSome}" - -- Find all candidate delimiters let candidates := findCandidateDelimiters text - -- IO.println s!"text:\n{text}\n---" - -- Test each candidate for candidate in candidates do let beforeDelimiter := text.take candidate.position @@ -127,7 +110,7 @@ unsafe def extractProofFromDecl (declInfo : DeclInfo) (fileContent : String) : I -- IO.println s!"statementOnly:\n{statementOnly}\n---" -- Try to parse - let success ← tryParseSuccessfully statementOnly cmdState? + let success ← tryParseSuccessfully statementOnly cmdState if success then -- Found valid split! 
-- IO.println s!"Found valid split at position {candidate.position} with delimiter {candidate.delimiterType}" @@ -140,12 +123,85 @@ unsafe def extractProofFromDecl (declInfo : DeclInfo) (fileContent : String) : I -- No valid split found - no proof return { declInfo with proof := none } +unsafe def parse_between +(fileContent : String) +(decl: DeclInfo) +(prev: Nat) +(next: Nat) +(cmd_state : Option Command.State) +: IO (Nat × CheckpointedParseResult) := do + -- IO.println s!"Extracting proof for segment between lines {prev} and {next}" + let contextBeforeDecl := get_in_between_content fileContent prev next + -- IO.println s!"Processing declaration at line {decl.startPos.line}-{decl.endPos.line}" + -- IO.println s!"Declaration text:\n{decl.text}\n---" + -- IO.println s!"--- Context Before Decl ----\n{contextBeforeDecl}\n--- Context Before Decl ----\n" + let chkpt_parse_res ← parseTactics contextBeforeDecl none cmd_state + return (next, chkpt_parse_res) + /-- Extract proofs from multiple declarations -/ unsafe def extractProofsFromDecls (decls : Array DeclInfo) (fileContent : String) : IO (Array DeclInfo) := do let mut result := #[] + let mut prev := 0 + let mut cmd_state : Option Command.State := none + let mut next := 0 for decl in decls do - let processed ← extractProofFromDecl decl fileContent - result := result.push processed + if is_proof_extraction_needed decl then + next := decl.startPos.line - 1 + let tup ← parse_between fileContent decl prev next cmd_state + let chkpt_parse_res := tup.2 + -- IO.println s!"--- Context Before Decl ----\n{contextBeforeDecl}\n--- Context Before Decl ----\n" + let parse_res := chkpt_parse_res.1 + let mut decl_fixed := decl + if parse_res.errors.size > 0 then + -- we should reparse with 1 less line to avoid partial lines + let last_line_num := decl.startPos.line - 1 + let new_decl := { decl with startPos := { decl.startPos with line := last_line_num } } + next := last_line_num - 1 + -- IO.println s!"Reparsing context before declaration at line {decl.startPos.line} with one less line" + let tup ← parse_between fileContent decl prev next cmd_state + let chkpt_parse_res := tup.2 + let mut missing_decl_text := get_in_between_content fileContent last_line_num last_line_num + missing_decl_text := missing_decl_text.trim + decl_fixed := { new_decl with text := missing_decl_text ++ "\n" ++ decl.text } + if chkpt_parse_res.parseResult.errors.size > 0 then + IO.println s!"Reparsing failed again for context before declaration at line {decl.startPos.line}-{decl.endPos.line}" + IO.println s!"Errors:" + for error in chkpt_parse_res.parseResult.errors do + IO.println s!"Error while parsing context before declaration at line {decl.startPos.line}: + {error.message} at {error.position.line}:{error.position.column}" + IO.println s!"--- Context Before Decl ----\n{get_in_between_content fileContent prev next}\n--- Context Before Decl ----\n" + IO.println s!"Decl text:\n{decl_fixed.text}\n---" + panic! 
"Failed to parse context before declaration after reparsing" + cmd_state := chkpt_parse_res.2 + prev := next + next := tup.1 + else + cmd_state := chkpt_parse_res.2 + prev := next + next := tup.1 + -- for error in parse_res.errors do + -- IO.println s!"Error while parsing context before declaration at line {decl.startPos.line}: {error.message} at {error.position.line}:{error.position.column}" + -- if parse_res.errors.size > 0 then + -- IO.println s!"--- Context Before Decl ----\n{contextBeforeDecl}\n--- Context Before Decl ----\n" + -- IO.println s!"Failed to parse context before declaration at line {decl.startPos.line}" + -- panic! "Failed to parse context" + -- IO.println s!"Context before decl:\n{contextBeforeDecl}\n---" + -- let cmd_state_error_pair ← get_cmd_state contextBeforeDecl cmd_state + -- cmd_state := cmd_state_error_pair.1 + -- let errors := cmd_state_error_pair.2 + -- for err in errors do + -- IO.println s!"Error while updating cmd_state: {err.message} at {err.position.line}:{err.position.column}" + -- if cmd_state.isNone ∨ errors.size > 0 then + -- IO.println s!"Failed to update cmd_state before processing declaration at line {decl.startPos.line}" + -- panic! "Failed to update cmd_state" + -- else + let cmd_st ← match cmd_state with + | some st => pure st + | none => panic! "Failed to get valid cmd_state before processing declaration" + let processed ← extractProofFromDecl decl_fixed cmd_st + result := result.push processed + else + result := result.push decl return result end TacticParser diff --git a/src/itp_interface/tools/tactic_parser/TacticParser/SyntaxWalker.lean b/src/itp_interface/tools/tactic_parser/TacticParser/SyntaxWalker.lean index 46689ac..5dd8d3a 100644 --- a/src/itp_interface/tools/tactic_parser/TacticParser/SyntaxWalker.lean +++ b/src/itp_interface/tools/tactic_parser/TacticParser/SyntaxWalker.lean @@ -308,6 +308,10 @@ unsafe def parseInCurrentContext (input : String) (filePath : Option String := n let extentStruct := lineExtents.map getInfoNodeStruct -- Go over all line extents and reassign the end_pos of the next node let mut adjusted_trees : Array InfoNodeStruct := #[] + -- IO.println s!"Total line extents found: {lineExtents.size}" + if lineExtents.size == 0 then + let parseResult : ParseResult := { trees := #[], errors := errorInfos } + return { parseResult := parseResult, chkptState := cmdState , lineNum := none } for i in [1:lineExtents.size] do let prev_node := extentStruct[i - 1]!.getD default let curr_node := extentStruct[i]!.getD default @@ -350,6 +354,7 @@ unsafe def parseTacticsWithElaboration (input : String) (filePath : Option Strin -- Initialize Lean from current directory (finds .lake/build if present) Lean.initSearchPath (← Lean.findSysroot) Lean.enableInitializersExecution + -- IO.println "Initialized Lean environment for elaboration-based parsing." 
return ← parseInCurrentContext input filePath chkptState catch e => let errorInfo := ErrorInfo.mk (s!"Error in parseTacticsWithElaboration: {e}") { line := 0, column := 0 } diff --git a/src/itp_interface/tools/tactic_parser/TacticParser/TacticExtractorMain.lean b/src/itp_interface/tools/tactic_parser/TacticParser/TacticExtractorMain.lean new file mode 100644 index 0000000..5c66a6d --- /dev/null +++ b/src/itp_interface/tools/tactic_parser/TacticParser/TacticExtractorMain.lean @@ -0,0 +1,52 @@ +import TacticParser.SyntaxWalker +import Lean + +open Lean +open TacticParser + +/-- Print usage information -/ +def printUsage : IO Unit := do + IO.println "Usage: dependency_parser " + IO.println "" + IO.println "Arguments:" + IO.println " Path to the Lean file to analyze" + IO.println " Path where JSON output will be written" + IO.println "" + IO.println "Example:" + IO.println " lake env .lake/build/bin/syntax-walker MyFile.lean output.json" + +unsafe def main (args : List String) : IO UInt32 := do + match args with + | + [ + leanFilePath, + jsonOutputPath + ] + => + try + let filepath : System.FilePath := leanFilePath + let jsonPath : System.FilePath := jsonOutputPath + + -- Check if input file exists + if !(← filepath.pathExists) then + IO.eprintln s!"Error: Input file not found: {filepath}" + return 1 + + let fileContent ← IO.FS.readFile filepath + + IO.println s!"Tactic Parsing file: {filepath}" + + -- Analyze the file and export to JSON + let tactics ← parseTactics fileContent none none + let tacticsJson := Lean.ToJson.toJson tactics.parseResult + IO.FS.writeFile jsonPath tacticsJson.compress + return 0 + catch e => + IO.eprintln s!"Error: {e}" + return 1 + + | _ => + IO.eprintln "Error: Invalid number of arguments" + IO.eprintln "" + printUsage + return 1 diff --git a/src/itp_interface/tools/tactic_parser/lakefile.toml b/src/itp_interface/tools/tactic_parser/lakefile.toml index 45d2d4d..e9c3958 100644 --- a/src/itp_interface/tools/tactic_parser/lakefile.toml +++ b/src/itp_interface/tools/tactic_parser/lakefile.toml @@ -1,5 +1,5 @@ name = "TacticParser" -defaultTargets = ["tactic-parser", "dependency-parser"] +defaultTargets = ["tactic-parser", "tactic-extractor", "dependency-parser"] [[lean_lib]] name = "TacticParser" @@ -12,6 +12,11 @@ name = "tactic-parser" root = "TacticParser.Main" supportInterpreter = true +[[lean_exe]] +name = "tactic-extractor" +root = "TacticParser.TacticExtractorMain" +supportInterpreter = true + [[lean_exe]] name = "dependency-parser" root = "TacticParser.DependencyParserMain" From 3d11814c60a0d3ca61a1f3524ae0422711c0efa2 Mon Sep 17 00:00:00 2001 From: Amitayush Thakur Date: Mon, 17 Nov 2025 07:12:21 +0000 Subject: [PATCH 06/25] Enhance proof extraction by allowing optional extra content in extractProofFromDecl --- .../TacticParser/ProofExtractor.lean | 76 ++++++------------- 1 file changed, 24 insertions(+), 52 deletions(-) diff --git a/src/itp_interface/tools/tactic_parser/TacticParser/ProofExtractor.lean b/src/itp_interface/tools/tactic_parser/TacticParser/ProofExtractor.lean index efa21b9..6f50b31 100644 --- a/src/itp_interface/tools/tactic_parser/TacticParser/ProofExtractor.lean +++ b/src/itp_interface/tools/tactic_parser/TacticParser/ProofExtractor.lean @@ -83,7 +83,8 @@ def get_in_between_content (fileContent : String) (startLine : Nat) (endLine : N /-- Extract proof from a declaration by testing candidate delimiters -/ unsafe def extractProofFromDecl (declInfo : DeclInfo) - (cmdState : Command.State) : IO DeclInfo := do + (cmdState : Command.State) + 
(extra_content: Option String := none) : IO DeclInfo := do -- Only process theorem, lemma, example if !is_proof_extraction_needed declInfo then panic! s!"extractProofFromDecl called on non-proof decl: {declInfo.declType}" @@ -101,14 +102,20 @@ unsafe def extractProofFromDecl -- IO.println s!"beforeDelimiter:\n{beforeDelimiter}\n---" -- Build test text with full context - let statementOnly := match candidate.delimiterType with + let mut statementOnly := match candidate.delimiterType with | "|" => beforeDelimiter.trim ++ " := sorry" | ":=" => beforeDelimiter ++ " := sorry" | "where" => beforeDelimiter ++ " := sorry" | _ => beforeDelimiter ++ " := sorry" - -- IO.println s!"statementOnly:\n{statementOnly}\n---" + -- If extra content is provided, prepend it + statementOnly := + match extra_content with + | some extra => extra ++ (if extra.endsWith "\n" then statementOnly else "\n" ++ statementOnly) + | none => statementOnly + + -- IO.println s!"statementOnly:\n{statementOnly}\n---" -- Try to parse let success ← tryParseSuccessfully statementOnly cmdState if success then @@ -125,18 +132,17 @@ unsafe def extractProofFromDecl unsafe def parse_between (fileContent : String) -(decl: DeclInfo) (prev: Nat) (next: Nat) (cmd_state : Option Command.State) -: IO (Nat × CheckpointedParseResult) := do +: IO CheckpointedParseResult := do -- IO.println s!"Extracting proof for segment between lines {prev} and {next}" let contextBeforeDecl := get_in_between_content fileContent prev next -- IO.println s!"Processing declaration at line {decl.startPos.line}-{decl.endPos.line}" -- IO.println s!"Declaration text:\n{decl.text}\n---" -- IO.println s!"--- Context Before Decl ----\n{contextBeforeDecl}\n--- Context Before Decl ----\n" let chkpt_parse_res ← parseTactics contextBeforeDecl none cmd_state - return (next, chkpt_parse_res) + return chkpt_parse_res /-- Extract proofs from multiple declarations -/ unsafe def extractProofsFromDecls (decls : Array DeclInfo) (fileContent : String) : IO (Array DeclInfo) := do @@ -144,61 +150,27 @@ unsafe def extractProofsFromDecls (decls : Array DeclInfo) (fileContent : String let mut prev := 0 let mut cmd_state : Option Command.State := none let mut next := 0 + let mut extra_content : Option String := none for decl in decls do if is_proof_extraction_needed decl then next := decl.startPos.line - 1 - let tup ← parse_between fileContent decl prev next cmd_state - let chkpt_parse_res := tup.2 + let chkpt_parse_res ← parse_between fileContent prev next cmd_state -- IO.println s!"--- Context Before Decl ----\n{contextBeforeDecl}\n--- Context Before Decl ----\n" - let parse_res := chkpt_parse_res.1 - let mut decl_fixed := decl + let parse_res := chkpt_parse_res.parseResult if parse_res.errors.size > 0 then - -- we should reparse with 1 less line to avoid partial lines - let last_line_num := decl.startPos.line - 1 - let new_decl := { decl with startPos := { decl.startPos with line := last_line_num } } - next := last_line_num - 1 - -- IO.println s!"Reparsing context before declaration at line {decl.startPos.line} with one less line" - let tup ← parse_between fileContent decl prev next cmd_state - let chkpt_parse_res := tup.2 - let mut missing_decl_text := get_in_between_content fileContent last_line_num last_line_num - missing_decl_text := missing_decl_text.trim - decl_fixed := { new_decl with text := missing_decl_text ++ "\n" ++ decl.text } - if chkpt_parse_res.parseResult.errors.size > 0 then - IO.println s!"Reparsing failed again for context before declaration at line 
{decl.startPos.line}-{decl.endPos.line}" - IO.println s!"Errors:" - for error in chkpt_parse_res.parseResult.errors do - IO.println s!"Error while parsing context before declaration at line {decl.startPos.line}: - {error.message} at {error.position.line}:{error.position.column}" - IO.println s!"--- Context Before Decl ----\n{get_in_between_content fileContent prev next}\n--- Context Before Decl ----\n" - IO.println s!"Decl text:\n{decl_fixed.text}\n---" - panic! "Failed to parse context before declaration after reparsing" - cmd_state := chkpt_parse_res.2 - prev := next - next := tup.1 + -- supply the extra content to compile from + extra_content := get_in_between_content fileContent prev next + -- IO.println s!"Re-parsing declaration at lines: \n{extra_content.get!}" + -- IO.println s!"\nDeclaration text:\n{decl.text}\n---" + -- DO NOT update cmd_state yet else - cmd_state := chkpt_parse_res.2 - prev := next - next := tup.1 - -- for error in parse_res.errors do - -- IO.println s!"Error while parsing context before declaration at line {decl.startPos.line}: {error.message} at {error.position.line}:{error.position.column}" - -- if parse_res.errors.size > 0 then - -- IO.println s!"--- Context Before Decl ----\n{contextBeforeDecl}\n--- Context Before Decl ----\n" - -- IO.println s!"Failed to parse context before declaration at line {decl.startPos.line}" - -- panic! "Failed to parse context" - -- IO.println s!"Context before decl:\n{contextBeforeDecl}\n---" - -- let cmd_state_error_pair ← get_cmd_state contextBeforeDecl cmd_state - -- cmd_state := cmd_state_error_pair.1 - -- let errors := cmd_state_error_pair.2 - -- for err in errors do - -- IO.println s!"Error while updating cmd_state: {err.message} at {err.position.line}:{err.position.column}" - -- if cmd_state.isNone ∨ errors.size > 0 then - -- IO.println s!"Failed to update cmd_state before processing declaration at line {decl.startPos.line}" - -- panic! "Failed to update cmd_state" - -- else + cmd_state := chkpt_parse_res.chkptState + extra_content := none + prev := next + 1 let cmd_st ← match cmd_state with | some st => pure st | none => panic! 
"Failed to get valid cmd_state before processing declaration" - let processed ← extractProofFromDecl decl_fixed cmd_st + let processed ← extractProofFromDecl decl cmd_st extra_content result := result.push processed else result := result.push decl From 58810a8f5627bc8e115833b6928954f62e8c9838 Mon Sep 17 00:00:00 2001 From: Amitayush Thakur Date: Mon, 17 Nov 2025 10:00:10 +0000 Subject: [PATCH 07/25] Add Ray initialization utilities and configuration for data extraction --- .../configs/mathlib_lean_data_extract.yaml | 13 +++ src/itp_interface/main/init_ray.py | 79 +++++++++++++++++++ src/itp_interface/main/run_tool.py | 2 + .../lean4_local_data_extraction_transform.py | 30 +++---- src/itp_interface/tools/ray_utils.py | 29 +++++++ 5 files changed, 139 insertions(+), 14 deletions(-) create mode 100644 src/itp_interface/main/configs/mathlib_lean_data_extract.yaml create mode 100644 src/itp_interface/main/init_ray.py diff --git a/src/itp_interface/main/configs/mathlib_lean_data_extract.yaml b/src/itp_interface/main/configs/mathlib_lean_data_extract.yaml new file mode 100644 index 0000000..14e39be --- /dev/null +++ b/src/itp_interface/main/configs/mathlib_lean_data_extract.yaml @@ -0,0 +1,13 @@ +defaults: + # - benchmark: simple_benchmark_lean_training_data + # - run_settings: default_lean_data_generation_transforms + # - benchmark: simple_benchmark_1 + # - run_settings: default_lean4_data_generation_transforms + - benchmark: mathlib_benchmark_lean_ext + - run_settings: default_lean4_data_generation_transforms + - env_settings: no_retrieval + - override hydra/job_logging: 'disabled' + +run_settings: + output_dir: .log/data_generation/benchmark/mathlib_benchmark_lean_ext + pool_size: 12 \ No newline at end of file diff --git a/src/itp_interface/main/init_ray.py b/src/itp_interface/main/init_ray.py new file mode 100644 index 0000000..41c72e8 --- /dev/null +++ b/src/itp_interface/main/init_ray.py @@ -0,0 +1,79 @@ +def main(): + # Start the ray cluster + from filelock import FileLock + import json + import os + import ray + import logging + import time + import sys + import argparse + argument_parser = argparse.ArgumentParser() + argument_parser.add_argument("--num_cpus", type=int, default=10) + argument_parser.add_argument("--object_store_memory", type=int, default=150*2**30) + argument_parser.add_argument("--memory", type=int, default=300*2**30) + argument_parser.add_argument("--metrics_report_interval_ms", type=int, default=3*10**8) + args = argument_parser.parse_args() + root_dir = f"{os.path.abspath(__file__).split('itp_interface')[-2]}" + if root_dir not in sys.path: + sys.path.append(root_dir) + os.environ["RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE"] = "1" + os.environ["PYTHONPATH"] = f"{root_dir}:{os.environ.get('PYTHONPATH', '')}" + os.makedirs(".log/locks", exist_ok=True) + os.makedirs(".log/ray", exist_ok=True) + ray_was_started = False + pid = os.getpid() + print("Initializing Ray") + print("PID: ", pid) + ray_session_path = ".log/ray/session_latest" if os.environ.get("RAY_SESSION_PATH") is None else os.environ.get("RAY_SESSION_PATH") + # Try to first acquire the lock + file_path = ".log/locks/ray.lock" + # set RAY_SESSION_PATH environment variable + os.environ["RAY_SESSION_PATH"] = ray_session_path + # set lock file path + os.environ["RAY_LOCK_FILE_PATH"] = file_path + temp_lock = FileLock(file_path) + if os.path.exists(ray_session_path): + # try to acquire the lock for reading + try: + temp_lock.acquire(timeout=10) + temp_lock.release() + except: + with open(ray_session_path, "r") as f: + 
ray_session = f.read() + ray_session = json.loads(ray_session) + ray_address = ray_session["address"] + # ray.init(address=ray_address) + print("Ray was already started") + print("Ray session: ", ray_session) + sys.exit(0) + with FileLock(file_path): + if os.path.exists(ray_session_path): + # Remove the ray_session_path + os.remove(ray_session_path) + os.environ["RAY_INITIALIZED"] = "1" + ray_session = ray.init( + num_cpus=args.num_cpus, + object_store_memory=args.object_store_memory, + _memory=args.memory, + logging_level=logging.CRITICAL, + ignore_reinit_error=False, + log_to_driver=False, + configure_logging=False, + _system_config={"metrics_report_interval_ms": args.metrics_report_interval_ms}) + ray_session = dict(ray_session) + ray_session["main_pid"] = pid + print("Ray session: ", ray_session) + with open(ray_session_path, "w") as f: + f.write(json.dumps(ray_session)) + ray_was_started = True + print("Ray was started") + print("Ray session: ", ray_session) + # Flush the stdout buffer + sys.stdout.flush() + while ray_was_started: + # Keep the ray cluster alive till killed + time.sleep(10000) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/itp_interface/main/run_tool.py b/src/itp_interface/main/run_tool.py index a6ce92d..4778480 100644 --- a/src/itp_interface/main/run_tool.py +++ b/src/itp_interface/main/run_tool.py @@ -607,4 +607,6 @@ def main(cfg): if __name__ == "__main__": # from itp_interface.tools.ray_utils import RayUtils # RayUtils.init_ray(num_of_cpus=20, object_store_memory_in_gb=50, memory_in_gb=1, runtime_env={"working_dir": root_dir, "excludes": [".log", "data"]}) + from itp_interface.tools.ray_utils import RayUtils + RayUtils.init_ray(num_of_cpus=20, object_store_memory_in_gb=100, memory_in_gb=50) main() \ No newline at end of file diff --git a/src/itp_interface/tools/lean4_local_data_extraction_transform.py b/src/itp_interface/tools/lean4_local_data_extraction_transform.py index 673d0fb..210fd87 100644 --- a/src/itp_interface/tools/lean4_local_data_extraction_transform.py +++ b/src/itp_interface/tools/lean4_local_data_extraction_transform.py @@ -23,12 +23,14 @@ def __init__(self, buffer_size : int = 10000, logger = None, max_parallelism : int = 4, - db_path : typing.Optional[str] = None): + db_path : typing.Optional[str] = None, + enable_file_export: bool = True): super().__init__(TrainingDataGenerationType.LOCAL, buffer_size, logger) self.depth = depth self.max_search_results = max_search_results self.max_parallelism = max_parallelism self.db_path = db_path # Store path, don't create connection yet (for Ray actors) + self.enable_file_export = enable_file_export def get_meta_object(self) -> TrainingDataMetadataFormat: return TrainingDataMetadataFormat( @@ -149,19 +151,19 @@ def __call__(self, timestamp = str(int(uuid.uuid1().time_low)) random_id = str(uuid.uuid4()) decl_id = f"{timestamp}_{random_id}" - - new_fda = FileDependencyAnalysis( - file_path=str(fda_rel_path), - module_name=fda_module_name, - imports=fda.imports, - declarations=[]) - line_info = decl.decl_info - if theorems is not None and line_info.name not in theorems: - continue - decl.decl_id = decl_id - new_fda.declarations.append(decl) - training_data.merge(new_fda) - cnt += 1 + if self.enable_file_export: + new_fda = FileDependencyAnalysis( + file_path=str(fda_rel_path), + module_name=fda_module_name, + imports=fda.imports, + declarations=[]) + line_info = decl.decl_info + if theorems is not None and line_info.name not in theorems: + continue + decl.decl_id = decl_id + 
new_fda.declarations.append(decl) + training_data.merge(new_fda) + cnt += 1 last_decl_id = decl_id if last_decl_id: diff --git a/src/itp_interface/tools/ray_utils.py b/src/itp_interface/tools/ray_utils.py index 807ef7d..d462740 100644 --- a/src/itp_interface/tools/ray_utils.py +++ b/src/itp_interface/tools/ray_utils.py @@ -14,12 +14,41 @@ class RayUtils(object): + @staticmethod + def connect_to_ray(): + if os.environ.get("RAY_INITIALIZED", "0") == "0": + return None + root_dir = f"{os.path.abspath(__file__).split('itp_interface')[-2]}" + os.environ["PYTHONPATH"] = f"{root_dir}:{os.environ.get('PYTHONPATH', '')}" + from filelock import FileLock + import json + os.environ["RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE"] = "1" + lock_path = os.environ.get("RAY_LOCK_FILE_PATH", ".log/locks/ray.lock") + ray_session_path = os.environ.get("RAY_SESSION_PATH", ".log/ray/session_latest") + temp_lock = FileLock(lock_path) + try: + temp_lock.acquire(timeout=10) + temp_lock.release() + except: + if os.path.exists(ray_session_path): + with open(ray_session_path, "r") as f: + ray_session = f.read() + ray_session = json.loads(ray_session) + ray_address = ray_session["address"] + obj = ray.init(address=ray_address) + return obj + return None + @staticmethod def init_ray(num_of_cpus: int = 10, object_store_memory_in_gb: float = 25, memory_in_gb: float = 0.5, runtime_env: typing.Dict[str, str] = None): gb = 2**30 object_store_memory = int(object_store_memory_in_gb * gb) memory = int(memory_in_gb * gb) + obj = RayUtils.connect_to_ray() + if obj is not None: + return obj os.environ["RAY_INITIALIZED"] = "1" + os.environ["RAY_OBJECT_STORE_ALLOW_SLOW_STORAGE"] = "1" obj = ray.init(num_cpus=num_of_cpus, object_store_memory=object_store_memory, _memory=memory, ignore_reinit_error=True, runtime_env=runtime_env) return obj From 0a4ce24bc904702b8d0ff221742fff59721a224d Mon Sep 17 00:00:00 2001 From: Amitayush Thakur Date: Mon, 17 Nov 2025 10:00:36 +0000 Subject: [PATCH 08/25] Add error printing function and improve proof extraction logic --- src/itp_interface/tools/tactic_parser.py | 50 +++++++++++++------ .../TacticParser/ProofExtractor.lean | 16 +++--- 2 files changed, 42 insertions(+), 24 deletions(-) diff --git a/src/itp_interface/tools/tactic_parser.py b/src/itp_interface/tools/tactic_parser.py index 66aaae0..9c8a6d1 100644 --- a/src/itp_interface/tools/tactic_parser.py +++ b/src/itp_interface/tools/tactic_parser.py @@ -753,6 +753,14 @@ def print_tactics(tactics: List[LeanLineInfo], logger: Optional[logging.Logger] else: print(msg) +def print_errors(errors: List[ErrorInfo], logger: Optional[logging.Logger] = None): + for error in errors: + msg = f"Error at Line {error.position.line}, Col {error.position.column}: {error.message}" + if logger: + logger.error(msg) + else: + print(msg) + if __name__ == "__main__": logging.basicConfig(level=logging.DEBUG) project_path = str(Path(__file__).parent.parent.parent / "data" / "test" / "lean4_proj") @@ -770,6 +778,16 @@ def print_tactics(tactics: List[LeanLineInfo], logger: Optional[logging.Logger] if errors: print(f"Error: {errors}") + pp = "src/data/test/Mathlib" + + with TacticParser(project_path=pp) as parser: + with open(os.path.join(pp, "Basic.lean")) as f: + lean_code = f.read() + tactics, errors = parser.parse(lean_code, fail_on_error=False) + print_tactics(tactics) + if errors: + print_errors(errors) + with TacticParser() as parser: # Example 1: Simple proof lean_code = "example : True := by trivial" @@ -778,7 +796,7 @@ def print_tactics(tactics: List[LeanLineInfo], logger: 
Optional[logging.Logger] tactics, errors = parser.parse(lean_code) print_tactics(tactics) if errors: - print(f"Error: {errors}") + print_errors(errors) p_path = "/home/amthakur/Projects/copra/data/test/miniF2F-lean4" with TacticParser(project_path=p_path) as parser: @@ -797,7 +815,7 @@ def print_tactics(tactics: List[LeanLineInfo], logger: Optional[logging.Logger] tactics, errors = parser.parse(lean_code, fail_on_error=False) print_tactics(tactics) if errors: - print(f"Error: {errors}") + print_errors(errors) with TacticParser(project_path=p_path) as parser: # Example 1a: Simple proof with multiple tactics @@ -818,7 +836,7 @@ def print_tactics(tactics: List[LeanLineInfo], logger: Optional[logging.Logger] tactics, errors = parser.parse(lean_code, fail_on_error=False) print_tactics(tactics) if errors: - print(f"Error: {errors}") + print_errors(errors) with TacticParser(project_path=p_path) as parser: # Example 1a: Simple proof with multiple tactics @@ -839,7 +857,7 @@ def print_tactics(tactics: List[LeanLineInfo], logger: Optional[logging.Logger] tactics, errors = parser.parse(lean_code, fail_on_error=False) print_tactics(tactics) if errors: - print(f"Error: {errors}") + print_errors(errors) with TacticParser() as parser: # Example 1b: Simple have proofs @@ -856,7 +874,7 @@ def print_tactics(tactics: List[LeanLineInfo], logger: Optional[logging.Logger] tactics, errors = parser.parse(lean_code, fail_on_error=False) print_tactics(tactics) if errors: - print(f"Error: {errors}") + print_errors(errors) @@ -868,7 +886,7 @@ def print_tactics(tactics: List[LeanLineInfo], logger: Optional[logging.Logger] tactics2, errors = parser.parse(lean_code2, fail_on_error=False) print_tactics(tactics2) if errors: - print(f"Error: {errors}") + print_errors(errors) # Check if linarith is parsed correctly lean_code3 = """ @@ -885,7 +903,7 @@ def print_tactics(tactics: List[LeanLineInfo], logger: Optional[logging.Logger] tactics3, errors = parser.parse(lean_code3) print_tactics(tactics3) if errors: - print(f"Error: {errors}") + print_errors(errors) file_path = str(Path(__file__).parent.parent.parent / "data" / "test" / "lean4_proj" / "Lean4Proj" / "Basic.lean") @@ -895,7 +913,7 @@ def print_tactics(tactics: List[LeanLineInfo], logger: Optional[logging.Logger] tactics4, errors = parser.parse_file(file_path) print_tactics(tactics4) if errors: - print(f"Error: {errors}") + print_errors(errors) with TacticParser(project_path=project_path) as parser: # Example 2: Multiline with params @@ -905,7 +923,7 @@ def print_tactics(tactics: List[LeanLineInfo], logger: Optional[logging.Logger] tactics5, errors = parser.parse(lean_code4) print_tactics(tactics5) if errors: - print(f"Error: {errors}") + print_errors(errors) with TacticParser(project_path=project_path) as parser: # Example 6: Parse tactics from file with multiple theorems @@ -913,7 +931,7 @@ def print_tactics(tactics: List[LeanLineInfo], logger: Optional[logging.Logger] tactics6, errors = parser.parse(lean_code3 + "\n" + lean_code4, parse_type=RequestType.PARSE_TACTICS) print_tactics(tactics6) if errors: - print(f"Error: {errors}") + print_errors(errors) with TacticParser(project_path=project_path) as parser: # Example 7: Parse tactics which are wrong @@ -922,7 +940,7 @@ def print_tactics(tactics: List[LeanLineInfo], logger: Optional[logging.Logger] tactics7, errors = parser.parse(lean_code5, fail_on_error=False) print_tactics(tactics7) if errors: - print(f"Error: {errors}") + print_errors(errors) with TacticParser(project_path=project_path) as parser: # Example 8: 
Parse tactics just before `by` @@ -931,7 +949,7 @@ def print_tactics(tactics: List[LeanLineInfo], logger: Optional[logging.Logger] tactics8, errors = parser.parse(lean_code8, fail_on_error=False) print_tactics(tactics8) if errors: - print(f"Error: {errors}") + print_errors(errors) with TacticParser(project_path=project_path) as parser: # Example 9: Parse tactics just before `by` @@ -940,7 +958,7 @@ def print_tactics(tactics: List[LeanLineInfo], logger: Optional[logging.Logger] tactics9, errors = parser.parse(lean_code9, fail_on_error=False) print_tactics(tactics9) if errors: - print(f"Error: {errors}") + print_errors(errors) with TacticParser(project_path=project_path) as parser: # Example 10: Test checkpointing @@ -958,7 +976,7 @@ def print_tactics(tactics: List[LeanLineInfo], logger: Optional[logging.Logger] tactics10, errors = parser.parse(lean_code10, fail_on_error=True, parse_type=RequestType.CHKPT_TACTICS) print_tactics(tactics10) if errors: - print(f"Error: {errors}") + print_errors(errors) # Now just execute from the checkpoint lean_code10b = """ theorem temp2: 1 + 2 = 3 := @@ -970,7 +988,7 @@ def print_tactics(tactics: List[LeanLineInfo], logger: Optional[logging.Logger] print_tactics(tactics10b) if errors: # The error should contain h_temp - print(f"Error: {errors}") + print_errors(errors) print("\nBreaking checkpoint...") new_lean_code10c = lean_code10 + lean_code10b @@ -978,4 +996,4 @@ def print_tactics(tactics: List[LeanLineInfo], logger: Optional[logging.Logger] # ^This will reimport everything all run all theorems from scratch print_tactics(tactics10c) if errors: - print(f"Error: {errors}") \ No newline at end of file + print_errors(errors) \ No newline at end of file diff --git a/src/itp_interface/tools/tactic_parser/TacticParser/ProofExtractor.lean b/src/itp_interface/tools/tactic_parser/TacticParser/ProofExtractor.lean index 6f50b31..368e198 100644 --- a/src/itp_interface/tools/tactic_parser/TacticParser/ProofExtractor.lean +++ b/src/itp_interface/tools/tactic_parser/TacticParser/ProofExtractor.lean @@ -23,8 +23,8 @@ instance : Ord DelimiterCandidate where compare a b := compare a.position b.position /-- Try to parse a text snippet and return true if it parses without errors -/ -def tryParseSuccessfully (text : String) (cmdState : Command.State): IO Bool := do - let chkpt_result ← parseTactics text none (some cmdState) +def tryParseSuccessfully (text : String) (cmdState : Option Command.State): IO Bool := do + let chkpt_result ← parseTactics text none cmdState let parse_res := chkpt_result.1 let new_cmd_state := chkpt_result.2 pure (parse_res.errors.size == 0 ∧ new_cmd_state.isSome) @@ -83,7 +83,7 @@ def get_in_between_content (fileContent : String) (startLine : Nat) (endLine : N /-- Extract proof from a declaration by testing candidate delimiters -/ unsafe def extractProofFromDecl (declInfo : DeclInfo) - (cmdState : Command.State) + (cmdState : Option Command.State) (extra_content: Option String := none) : IO DeclInfo := do -- Only process theorem, lemma, example if !is_proof_extraction_needed declInfo then @@ -157,7 +157,7 @@ unsafe def extractProofsFromDecls (decls : Array DeclInfo) (fileContent : String let chkpt_parse_res ← parse_between fileContent prev next cmd_state -- IO.println s!"--- Context Before Decl ----\n{contextBeforeDecl}\n--- Context Before Decl ----\n" let parse_res := chkpt_parse_res.parseResult - if parse_res.errors.size > 0 then + if parse_res.errors.size > 0 ∨ chkpt_parse_res.chkptState.isNone then -- supply the extra content to compile from 
extra_content := get_in_between_content fileContent prev next -- IO.println s!"Re-parsing declaration at lines: \n{extra_content.get!}" @@ -167,10 +167,10 @@ unsafe def extractProofsFromDecls (decls : Array DeclInfo) (fileContent : String cmd_state := chkpt_parse_res.chkptState extra_content := none prev := next + 1 - let cmd_st ← match cmd_state with - | some st => pure st - | none => panic! "Failed to get valid cmd_state before processing declaration" - let processed ← extractProofFromDecl decl cmd_st extra_content + -- let cmd_st ← match cmd_state with + -- | some st => pure st + -- | none => panic! "Failed to get valid cmd_state before processing declaration" + let processed ← extractProofFromDecl decl cmd_state extra_content result := result.push processed else result := result.push decl From 3d5b1e96ea6392bb2417da2cdb9dc7e75e26ea0e Mon Sep 17 00:00:00 2001 From: Amitayush Thakur Date: Mon, 17 Nov 2025 10:00:53 +0000 Subject: [PATCH 09/25] Update optional dependencies for app to include Streamlit, Plotly, NetworkX, and Pandas --- pyproject.toml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e9aac2c..5e1ec6a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,8 +47,10 @@ dependencies = [ [project.optional-dependencies] app = [ - "flask>=2.3.0", - "flask-cors>=4.0.0" + "streamlit>=1.28.0", + "plotly>=5.17.0", + "networkx>=3.1", + "pandas>=2.0.0" ] [project.urls] From c9975f1a8caf75763be528d90efcc0c70262f8a2 Mon Sep 17 00:00:00 2001 From: Amitayush Thakur Date: Tue, 18 Nov 2025 06:38:13 +0000 Subject: [PATCH 10/25] Added batteries --- .gitmodules | 3 +++ src/data/test/batteries | 1 + 2 files changed, 4 insertions(+) create mode 160000 src/data/test/batteries diff --git a/.gitmodules b/.gitmodules index 65bf71a..1b68b33 100644 --- a/.gitmodules +++ b/.gitmodules @@ -13,3 +13,6 @@ [submodule "src/itp_interface/tools/repl"] path = src/itp_interface/tools/repl url = https://github.com/amit9oct/repl.git +[submodule "src/data/test/batteries"] + path = src/data/test/batteries + url = https://github.com/leanprover-community/batteries.git diff --git a/src/data/test/batteries b/src/data/test/batteries new file mode 160000 index 0000000..8da40b7 --- /dev/null +++ b/src/data/test/batteries @@ -0,0 +1 @@ +Subproject commit 8da40b72fece29b7d3fe3d768bac4c8910ce9bee From 087578d3fe1e9d753585b32c9695e195a21d3bbb Mon Sep 17 00:00:00 2001 From: Amitayush Thakur Date: Tue, 18 Nov 2025 06:40:10 +0000 Subject: [PATCH 11/25] Uncomment unittest.main() in main function and comment out specific Lean4 tests --- src/test/simple_env_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/test/simple_env_test.py b/src/test/simple_env_test.py index 8306fb5..6ca797f 100644 --- a/src/test/simple_env_test.py +++ b/src/test/simple_env_test.py @@ -678,9 +678,9 @@ def test_simple_lean4_multiline_multigoal(self): assert proof_was_finished, "Proof was not finished" def main(): - # unittest.main() + unittest.main() # Run only the Lean 4 tests - t = Lean4Test() + # t = Lean4Test() # t.test_simple_lean4_multiline_multigoal() # t.test_simple_lean4() # t.test_lean4_backtracking() @@ -688,7 +688,7 @@ def main(): # t.test_simple_lean_calc() # t.test_simple_lean_calc_with_validation() # t.test_simple_lean4_with_error() - t.test_simple_lean4_have_test() + # t.test_simple_lean4_have_test() # t.test_simple_lean_enforce_done_test() From b4de7870a714055ab8980d00c323fa9cd039277f Mon Sep 17 00:00:00 2001 From: Amitayush Thakur Date: Wed, 19 Nov 
2025 18:10:34 +0000 Subject: [PATCH 12/25] Simple refactoring. --- src/itp_interface/lean/__init__.py | 0 src/itp_interface/lean/parsing_helpers.py | 300 ++++++++++++++++++ .../simple_lean4_sync_executor.py | 2 +- .../{tools => lean}/tactic_parser.py | 0 .../tactic_parser/TacticParser.lean | 0 .../tactic_parser/TacticParser/Base64.lean | 0 .../TacticParser/DependencyParser.lean | 0 .../TacticParser/DependencyParserMain.lean | 0 .../TacticParser/Example/complex.lean | 0 .../TacticParser/Example/simple.lean | 0 .../TacticParser/LineParser.lean | 0 .../tactic_parser/TacticParser/Main.lean | 0 .../TacticParser/ProofExtractor.lean | 0 .../TacticParser/SyntaxWalker.lean | 0 .../TacticParser/TacticExtractorMain.lean | 0 .../tactic_parser/TacticParser/Types.lean | 0 .../tactic_parser/lake-manifest.json | 0 .../tactic_parser/lakefile.toml | 0 .../tactic_parser/lean-toolchain | 0 .../tactic_parser/test_user_example.lean | 0 src/itp_interface/main/install.py | 2 +- .../tools/dynamic_lean4_proof_exec.py | 2 +- .../lean4_local_data_extraction_transform.py | 49 ++- .../lean4_local_data_generation_transform.py | 2 +- .../tools/proof_exec_callback.py | 2 +- .../tools/run_data_generation_transforms.py | 2 +- .../tools/training_data_format.py | 2 +- src/test/parsing_helpers_test.py | 65 ++++ src/test/simple_env_test.py | 2 +- src/test/test_tactic_parser.py | 2 +- 30 files changed, 416 insertions(+), 16 deletions(-) create mode 100644 src/itp_interface/lean/__init__.py create mode 100644 src/itp_interface/lean/parsing_helpers.py rename src/itp_interface/{tools => lean}/simple_lean4_sync_executor.py (99%) rename src/itp_interface/{tools => lean}/tactic_parser.py (100%) rename src/itp_interface/{tools => lean}/tactic_parser/TacticParser.lean (100%) rename src/itp_interface/{tools => lean}/tactic_parser/TacticParser/Base64.lean (100%) rename src/itp_interface/{tools => lean}/tactic_parser/TacticParser/DependencyParser.lean (100%) rename src/itp_interface/{tools => lean}/tactic_parser/TacticParser/DependencyParserMain.lean (100%) rename src/itp_interface/{tools => lean}/tactic_parser/TacticParser/Example/complex.lean (100%) rename src/itp_interface/{tools => lean}/tactic_parser/TacticParser/Example/simple.lean (100%) rename src/itp_interface/{tools => lean}/tactic_parser/TacticParser/LineParser.lean (100%) rename src/itp_interface/{tools => lean}/tactic_parser/TacticParser/Main.lean (100%) rename src/itp_interface/{tools => lean}/tactic_parser/TacticParser/ProofExtractor.lean (100%) rename src/itp_interface/{tools => lean}/tactic_parser/TacticParser/SyntaxWalker.lean (100%) rename src/itp_interface/{tools => lean}/tactic_parser/TacticParser/TacticExtractorMain.lean (100%) rename src/itp_interface/{tools => lean}/tactic_parser/TacticParser/Types.lean (100%) rename src/itp_interface/{tools => lean}/tactic_parser/lake-manifest.json (100%) rename src/itp_interface/{tools => lean}/tactic_parser/lakefile.toml (100%) rename src/itp_interface/{tools => lean}/tactic_parser/lean-toolchain (100%) rename src/itp_interface/{tools => lean}/tactic_parser/test_user_example.lean (100%) create mode 100644 src/test/parsing_helpers_test.py diff --git a/src/itp_interface/lean/__init__.py b/src/itp_interface/lean/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/itp_interface/lean/parsing_helpers.py b/src/itp_interface/lean/parsing_helpers.py new file mode 100644 index 0000000..8ff13ee --- /dev/null +++ b/src/itp_interface/lean/parsing_helpers.py @@ -0,0 +1,300 @@ +import sys +import re +from enum import Enum 
+from dataclasses import dataclass +from typing import Optional, Set + +class LeanDeclType(Enum): + LEMMA = "lemma" + THEOREM = "theorem" + DEF = "def" + DEFINITION = "definition" + STRUCTURE = "structure" + CLASS = "class" + INDUCTIVE = "inductive" + INSTANCE = "instance" + ABBREV = "abbrev" + ABBREVIATION = "abbreviation" + AXIOM = "axiom" + EXAMPLE = "example" + OPAQUE = "opaque" + CONSTANT = "constant" + MUTUAL = "mutual" + UNKNOWN = "unknown" + +@dataclass +class LeanParseResult: + decl_type: LeanDeclType + text_before: Optional[str] = None + doc_string: Optional[str] = None + text: Optional[str] = None + proof: Optional[str] = None + +class LeanDeclParser: + """ + Parses Lean 4 declarations to separate context, docstrings, + declaration headers, and proofs/bodies. + """ + + # Keywords that mark the start of a declaration + # We generate this set from the Enum values for fast lookup + DECL_KEYWORDS = {m.value for m in LeanDeclType if m != LeanDeclType.UNKNOWN} + + # Types for which we should NOT attempt to extract a proof/body + NO_PROOF_TYPES = { + LeanDeclType.INDUCTIVE, + LeanDeclType.MUTUAL, + LeanDeclType.STRUCTURE, + LeanDeclType.CLASS + } + + def __init__(self, text: str): + self.text = text + self.n = len(text) + self.tokens = [] + self.docstring_range = None # (start, end) + self.docstring_content = None + + # Key Indices and Info + self.decl_start = -1 + self.proof_start = -1 + self.context_end = 0 + self.decl_type: LeanDeclType = LeanDeclType.UNKNOWN + + def parse(self) -> LeanParseResult: + self._tokenize() + self._analyze_structure() + return self._construct_result() + + def _tokenize(self): + """ + Scans text to find tokens, respecting comments, strings, and nesting. + """ + i = 0 + # States + NORMAL = 0 + IN_STRING = 1 + IN_CHAR = 2 + + state = NORMAL + nesting = 0 # () [] {} + + while i < self.n: + # 1. Handle Comments (Line and Block) + if state == NORMAL: + if self.text.startswith("--", i): + # Line comment + end_line = self.text.find('\n', i) + if end_line == -1: end_line = self.n + i = end_line + continue + + if self.text.startswith("/-", i): + # Block comment + is_doc = self.text.startswith("/--", i) + start_idx = i + + # Find end of block comment (handle nesting) + depth = 1 + i += 2 + while i < self.n and depth > 0: + if self.text.startswith("/-", i): + depth += 1 + i += 2 + elif self.text.startswith("-/", i): + depth -= 1 + i += 2 + else: + i += 1 + + if is_doc and self.docstring_range is None: + self.docstring_range = (start_idx, i) + self.docstring_content = self.text[start_idx:i] + continue + + # 2. Handle Strings/Chars + if state == NORMAL: + if self.text[i] == '"': + state = IN_STRING + i += 1 + continue + if self.text[i] == "'": + state = IN_CHAR + i += 1 + continue + + elif state == IN_STRING: + if self.text[i] == '\\': i += 2; continue + if self.text[i] == '"': state = NORMAL; i += 1; continue + i += 1 + continue + + elif state == IN_CHAR: + if self.text[i] == '\\': i += 2; continue + if self.text[i] == "'": state = NORMAL; i += 1; continue + i += 1 + continue + + # 3. 
Handle Structure Tokens in NORMAL state + char = self.text[i] + + # Nesting tracking + if char in "([{": nesting += 1 + elif char in ")]}": nesting = max(0, nesting - 1) + + # Token detection + if nesting == 0: + # Check for 'in' keyword (standalone) + if self._is_keyword_at(i, "in"): + self.tokens.append(("IN", i, i+2)) + i += 2 + continue + + # Check for Declaration Keywords + match_kw = self._match_any_keyword(i, self.DECL_KEYWORDS) + if match_kw: + kw, length = match_kw + self.tokens.append(("KW", i, i+length)) + i += length + continue + + # Check for Attribute Start + if self.text.startswith("@[", i): + self.tokens.append(("ATTR", i, i+2)) + i += 2 + nesting += 1 # The '[' counts as nesting + continue + + # Check for Proof Starters + if self.text.startswith(":=", i): + self.tokens.append(("PROOF", i, i+2)) + i += 2 + continue + if self.text.startswith("where", i) and self._is_word_boundary(i+5): + self.tokens.append(("PROOF", i, i+5)) + i += 5 + continue + if char == '|': + self.tokens.append(("PROOF", i, i+1)) + i += 1 + continue + + i += 1 + + def _analyze_structure(self): + """ + Interpret the token stream to find split points. + """ + candidate_decl = -1 + decl_keyword_str = None + + # Pass 1: Find Declaration Start + for t_type, t_start, t_end in self.tokens: + if t_type == "IN": + candidate_decl = -1 # Reset candidate + decl_keyword_str = None + + elif t_type == "KW": + if candidate_decl == -1: + candidate_decl = t_start + if decl_keyword_str is None: + decl_keyword_str = self.text[t_start:t_end] + + elif t_type == "ATTR": + if candidate_decl == -1: + candidate_decl = t_start + + if candidate_decl != -1: + self.decl_start = candidate_decl + + # Resolve Enum Type + if decl_keyword_str: + try: + self.decl_type = LeanDeclType(decl_keyword_str) + except ValueError: + self.decl_type = LeanDeclType.UNKNOWN + else: + self.decl_type = LeanDeclType.UNKNOWN + + # Pass 2: Find Proof Start + skip_proof = self.decl_type in self.NO_PROOF_TYPES + + if not skip_proof: + for t_type, t_start, t_end in self.tokens: + if t_start > self.decl_start and t_type == "PROOF": + self.proof_start = t_start + break + else: + # Fallback: No declaration found + pass + + def _construct_result(self) -> LeanParseResult: + # Case 1: No declaration found + if self.decl_start == -1: + return LeanParseResult( + decl_type=LeanDeclType.UNKNOWN, + text_before=self.text + ) + + # Case 2: Declaration found + split_idx = self.decl_start + + decl_end = self.n + proof_content = None + if self.proof_start != -1: + decl_end = self.proof_start + proof_content = self.text[self.proof_start:].strip() + + raw_pre = self.text[:split_idx] + raw_decl = self.text[split_idx:decl_end] + doc_content = None + + # Handle Docstring Extraction logic + if self.docstring_range: + ds_start, ds_end = self.docstring_range + doc_content = self.text[ds_start:ds_end] + + if ds_start < split_idx: + # Remove docstring from raw_pre + pre_part1 = self.text[:ds_start] + pre_part2 = self.text[ds_end:split_idx] + raw_pre = pre_part1 + pre_part2 + + # Final cleanup + text_before = raw_pre.strip() or None + text = raw_decl.strip() or None + doc = doc_content or None + proof = proof_content or None + + return LeanParseResult( + decl_type=self.decl_type, + text_before=text_before, + doc_string=doc, + text=text, + proof=proof + ) + + # --- Helpers --- + def _is_keyword_at(self, idx, kw): + if not self.text.startswith(kw, idx): return False + return self._is_word_boundary(idx + len(kw)) + + def _match_any_keyword(self, idx, keywords): + if not 
self.text[idx].isalpha(): return None + j = idx + while j < self.n and (self.text[j].isalnum() or self.text[j] == '_'): + j += 1 + word = self.text[idx:j] + if word in keywords: + return word, len(word) + return None + + def _is_word_boundary(self, idx): + if idx >= self.n: return True + c = self.text[idx] + return not (c.isalnum() or c == '_') + + +def parse_lean_text(text: str) -> LeanParseResult: + parser = LeanDeclParser(text) + return parser.parse() \ No newline at end of file diff --git a/src/itp_interface/tools/simple_lean4_sync_executor.py b/src/itp_interface/lean/simple_lean4_sync_executor.py similarity index 99% rename from src/itp_interface/tools/simple_lean4_sync_executor.py rename to src/itp_interface/lean/simple_lean4_sync_executor.py index 49250fc..b957aa0 100644 --- a/src/itp_interface/tools/simple_lean4_sync_executor.py +++ b/src/itp_interface/lean/simple_lean4_sync_executor.py @@ -10,7 +10,7 @@ import typing import bisect import subprocess -from itp_interface.tools.tactic_parser import ( +from itp_interface.lean.tactic_parser import ( TacticParser, ErrorInfo, LeanLineInfo, diff --git a/src/itp_interface/tools/tactic_parser.py b/src/itp_interface/lean/tactic_parser.py similarity index 100% rename from src/itp_interface/tools/tactic_parser.py rename to src/itp_interface/lean/tactic_parser.py diff --git a/src/itp_interface/tools/tactic_parser/TacticParser.lean b/src/itp_interface/lean/tactic_parser/TacticParser.lean similarity index 100% rename from src/itp_interface/tools/tactic_parser/TacticParser.lean rename to src/itp_interface/lean/tactic_parser/TacticParser.lean diff --git a/src/itp_interface/tools/tactic_parser/TacticParser/Base64.lean b/src/itp_interface/lean/tactic_parser/TacticParser/Base64.lean similarity index 100% rename from src/itp_interface/tools/tactic_parser/TacticParser/Base64.lean rename to src/itp_interface/lean/tactic_parser/TacticParser/Base64.lean diff --git a/src/itp_interface/tools/tactic_parser/TacticParser/DependencyParser.lean b/src/itp_interface/lean/tactic_parser/TacticParser/DependencyParser.lean similarity index 100% rename from src/itp_interface/tools/tactic_parser/TacticParser/DependencyParser.lean rename to src/itp_interface/lean/tactic_parser/TacticParser/DependencyParser.lean diff --git a/src/itp_interface/tools/tactic_parser/TacticParser/DependencyParserMain.lean b/src/itp_interface/lean/tactic_parser/TacticParser/DependencyParserMain.lean similarity index 100% rename from src/itp_interface/tools/tactic_parser/TacticParser/DependencyParserMain.lean rename to src/itp_interface/lean/tactic_parser/TacticParser/DependencyParserMain.lean diff --git a/src/itp_interface/tools/tactic_parser/TacticParser/Example/complex.lean b/src/itp_interface/lean/tactic_parser/TacticParser/Example/complex.lean similarity index 100% rename from src/itp_interface/tools/tactic_parser/TacticParser/Example/complex.lean rename to src/itp_interface/lean/tactic_parser/TacticParser/Example/complex.lean diff --git a/src/itp_interface/tools/tactic_parser/TacticParser/Example/simple.lean b/src/itp_interface/lean/tactic_parser/TacticParser/Example/simple.lean similarity index 100% rename from src/itp_interface/tools/tactic_parser/TacticParser/Example/simple.lean rename to src/itp_interface/lean/tactic_parser/TacticParser/Example/simple.lean diff --git a/src/itp_interface/tools/tactic_parser/TacticParser/LineParser.lean b/src/itp_interface/lean/tactic_parser/TacticParser/LineParser.lean similarity index 100% rename from 
src/itp_interface/tools/tactic_parser/TacticParser/LineParser.lean rename to src/itp_interface/lean/tactic_parser/TacticParser/LineParser.lean diff --git a/src/itp_interface/tools/tactic_parser/TacticParser/Main.lean b/src/itp_interface/lean/tactic_parser/TacticParser/Main.lean similarity index 100% rename from src/itp_interface/tools/tactic_parser/TacticParser/Main.lean rename to src/itp_interface/lean/tactic_parser/TacticParser/Main.lean diff --git a/src/itp_interface/tools/tactic_parser/TacticParser/ProofExtractor.lean b/src/itp_interface/lean/tactic_parser/TacticParser/ProofExtractor.lean similarity index 100% rename from src/itp_interface/tools/tactic_parser/TacticParser/ProofExtractor.lean rename to src/itp_interface/lean/tactic_parser/TacticParser/ProofExtractor.lean diff --git a/src/itp_interface/tools/tactic_parser/TacticParser/SyntaxWalker.lean b/src/itp_interface/lean/tactic_parser/TacticParser/SyntaxWalker.lean similarity index 100% rename from src/itp_interface/tools/tactic_parser/TacticParser/SyntaxWalker.lean rename to src/itp_interface/lean/tactic_parser/TacticParser/SyntaxWalker.lean diff --git a/src/itp_interface/tools/tactic_parser/TacticParser/TacticExtractorMain.lean b/src/itp_interface/lean/tactic_parser/TacticParser/TacticExtractorMain.lean similarity index 100% rename from src/itp_interface/tools/tactic_parser/TacticParser/TacticExtractorMain.lean rename to src/itp_interface/lean/tactic_parser/TacticParser/TacticExtractorMain.lean diff --git a/src/itp_interface/tools/tactic_parser/TacticParser/Types.lean b/src/itp_interface/lean/tactic_parser/TacticParser/Types.lean similarity index 100% rename from src/itp_interface/tools/tactic_parser/TacticParser/Types.lean rename to src/itp_interface/lean/tactic_parser/TacticParser/Types.lean diff --git a/src/itp_interface/tools/tactic_parser/lake-manifest.json b/src/itp_interface/lean/tactic_parser/lake-manifest.json similarity index 100% rename from src/itp_interface/tools/tactic_parser/lake-manifest.json rename to src/itp_interface/lean/tactic_parser/lake-manifest.json diff --git a/src/itp_interface/tools/tactic_parser/lakefile.toml b/src/itp_interface/lean/tactic_parser/lakefile.toml similarity index 100% rename from src/itp_interface/tools/tactic_parser/lakefile.toml rename to src/itp_interface/lean/tactic_parser/lakefile.toml diff --git a/src/itp_interface/tools/tactic_parser/lean-toolchain b/src/itp_interface/lean/tactic_parser/lean-toolchain similarity index 100% rename from src/itp_interface/tools/tactic_parser/lean-toolchain rename to src/itp_interface/lean/tactic_parser/lean-toolchain diff --git a/src/itp_interface/tools/tactic_parser/test_user_example.lean b/src/itp_interface/lean/tactic_parser/test_user_example.lean similarity index 100% rename from src/itp_interface/tools/tactic_parser/test_user_example.lean rename to src/itp_interface/lean/tactic_parser/test_user_example.lean diff --git a/src/itp_interface/main/install.py b/src/itp_interface/main/install.py index e375f57..9bad6ed 100644 --- a/src/itp_interface/main/install.py +++ b/src/itp_interface/main/install.py @@ -3,7 +3,7 @@ import string import logging import traceback -from itp_interface.tools.tactic_parser import build_tactic_parser_if_needed +from itp_interface.lean.tactic_parser import build_tactic_parser_if_needed file_path = os.path.abspath(__file__) diff --git a/src/itp_interface/tools/dynamic_lean4_proof_exec.py b/src/itp_interface/tools/dynamic_lean4_proof_exec.py index 5417e83..86cd9c9 100644 --- 
a/src/itp_interface/tools/dynamic_lean4_proof_exec.py +++ b/src/itp_interface/tools/dynamic_lean4_proof_exec.py @@ -6,7 +6,7 @@ import copy import enum import logging -from itp_interface.tools.simple_lean4_sync_executor import SimpleLean4SyncExecutor +from itp_interface.lean.simple_lean4_sync_executor import SimpleLean4SyncExecutor from itp_interface.tools.training_data_format import Goal, TheoremProvingTrainingDataFormat from itp_interface.tools.lean_parse_utils import LeanLineByLineReader from itp_interface.tools.lean_context_helper import Lean3ContextHelper diff --git a/src/itp_interface/tools/lean4_local_data_extraction_transform.py b/src/itp_interface/tools/lean4_local_data_extraction_transform.py index 210fd87..27e91f6 100644 --- a/src/itp_interface/tools/lean4_local_data_extraction_transform.py +++ b/src/itp_interface/tools/lean4_local_data_extraction_transform.py @@ -9,11 +9,11 @@ import typing import uuid from pathlib import Path -from itp_interface.tools.simple_lean4_sync_executor import SimpleLean4SyncExecutor +from itp_interface.lean.simple_lean4_sync_executor import SimpleLean4SyncExecutor from itp_interface.tools.coq_training_data_generator import GenericTrainingDataGenerationTransform, TrainingDataGenerationType from itp_interface.tools.training_data_format import MergableCollection, TrainingDataMetadataFormat, ExtractionDataCollection from itp_interface.tools.training_data import TrainingData, DataLayoutFormat -from itp_interface.tools.tactic_parser import FileDependencyAnalysis +from itp_interface.lean.tactic_parser import FileDependencyAnalysis, DeclWithDependencies from itp_interface.tools.simple_sqlite import LeanDeclarationDB class Local4DataExtractionTransform(GenericTrainingDataGenerationTransform): @@ -87,6 +87,44 @@ def _get_module_name(self, project_path: str, module_name: str) -> str: module_name = self._remove_lake_package_prefix(module_name) return module_name + def _remap_local_dependency(self, + project_path: str, + file_dep_analyses: typing.List[FileDependencyAnalysis]) -> None: + # Map local dependencies + all_decls = {} + for fda in file_dep_analyses: + for decl in fda.declarations: + name = decl.decl_info.name + all_decls[name] = decl + for fda in file_dep_analyses: + fda_rel_path = self._get_file_path(project_path, fda.file_path) + # Remove the pp_module prefix from the fda module name + fda_module_name = self._get_module_name(project_path, fda.module_name) + for decl in fda.declarations: + for decl_dep in decl.dependencies: + if decl_dep.name in all_decls: + if decl_dep.file_path is None: + decl_dep.file_path = fda_rel_path + if decl_dep.module_name is None: + decl_dep.module_name = fda_module_name + if decl_dep.namespace is None: + decl_dep.namespace = decl.decl_info.namespace + + def _preprocess_declarations(self, + project_path: str, + file_dep_analyses: typing.List[FileDependencyAnalysis]) -> None: + # Preprocess declarations to set file paths and module names + for fda in file_dep_analyses: + for decl in fda.declarations: + self._preprocess_declaration(project_path, decl) + + def _preprocess_declaration( + self, + project_path: str, + decl: DeclWithDependencies) -> None: + pass + + def __call__(self, training_data: TrainingData, project_id : str, @@ -121,6 +159,8 @@ def __call__(self, assert len(file_dep_analyses) == 1, "Expected exactly one FileDependencyAnalysis object" last_decl_id = None + self._remap_local_dependency(project_path, file_dep_analyses) + for fda in file_dep_analyses: fda_rel_path = self._get_file_path(project_path, fda.file_path) 
# Remove the pp_module prefix from the fda module name @@ -132,11 +172,6 @@ def __call__(self, self.logger.info(f"Inserted file and {len(fda.imports)} imports for {fda_rel_path}") for decl in fda.declarations: - # self.logger.info(f"Processing module: {project_path}") - # self.logger.info(f"Relative file path: {fda_rel_path}") - # self.logger.info(f"Original Module name: {fda.module_name}") - # self.logger.info(f"Rel Module name: {fda_module_name}") - # Get or create decl_id from database (or generate new one if no DB) if db: decl_id = db.process_declaration( diff --git a/src/itp_interface/tools/lean4_local_data_generation_transform.py b/src/itp_interface/tools/lean4_local_data_generation_transform.py index 4854b02..8550ff3 100644 --- a/src/itp_interface/tools/lean4_local_data_generation_transform.py +++ b/src/itp_interface/tools/lean4_local_data_generation_transform.py @@ -6,7 +6,7 @@ sys.path.append(root_dir) import typing import uuid -from itp_interface.tools.simple_lean4_sync_executor import SimpleLean4SyncExecutor +from itp_interface.lean.simple_lean4_sync_executor import SimpleLean4SyncExecutor from itp_interface.tools.lean4_context_helper import Lean4ContextHelper from itp_interface.tools.coq_training_data_generator import GenericTrainingDataGenerationTransform, TrainingDataGenerationType from itp_interface.tools.training_data_format import MergableCollection, TrainingDataMetadataFormat, TheoremProvingTrainingDataCollection, TheoremProvingTrainingDataFormat diff --git a/src/itp_interface/tools/proof_exec_callback.py b/src/itp_interface/tools/proof_exec_callback.py index bcb9744..9c3b031 100644 --- a/src/itp_interface/tools/proof_exec_callback.py +++ b/src/itp_interface/tools/proof_exec_callback.py @@ -15,7 +15,7 @@ from itp_interface.tools.coq_executor import CoqExecutor from itp_interface.tools.lean_cmd_executor import Lean3Executor from itp_interface.tools.lean4_sync_executor import Lean4SyncExecutor -from itp_interface.tools.simple_lean4_sync_executor import SimpleLean4SyncExecutor +from itp_interface.lean.simple_lean4_sync_executor import SimpleLean4SyncExecutor from itp_interface.tools.isabelle_executor import IsabelleExecutor from itp_interface.tools.dynamic_coq_proof_exec import DynamicProofExecutor as DynamicCoqProofExecutor from itp_interface.tools.dynamic_lean_proof_exec import DynamicProofExecutor as DynamicLeanProofExecutor diff --git a/src/itp_interface/tools/run_data_generation_transforms.py b/src/itp_interface/tools/run_data_generation_transforms.py index 4701c36..b4b0b48 100644 --- a/src/itp_interface/tools/run_data_generation_transforms.py +++ b/src/itp_interface/tools/run_data_generation_transforms.py @@ -24,7 +24,7 @@ RayUtils = None from itp_interface.tools.coq_executor import CoqExecutor from itp_interface.tools.lean_cmd_executor import Lean3Executor -from itp_interface.tools.simple_lean4_sync_executor import SimpleLean4SyncExecutor +from itp_interface.lean.simple_lean4_sync_executor import SimpleLean4SyncExecutor from itp_interface.tools.isabelle_executor import IsabelleExecutor from itp_interface.tools.coq_local_data_generation_transform import LocalDataGenerationTransform as CoqLocalDataGenerationTransform from itp_interface.tools.lean_local_data_generation_transform import LocalDataGenerationTransform as LeanLocalDataGenerationTransform diff --git a/src/itp_interface/tools/training_data_format.py b/src/itp_interface/tools/training_data_format.py index 0e55073..326aee7 100644 --- a/src/itp_interface/tools/training_data_format.py +++ 
b/src/itp_interface/tools/training_data_format.py @@ -12,7 +12,7 @@ from collections import OrderedDict from typing import List, Optional, Union, runtime_checkable, Protocol from pydantic import BaseModel -from itp_interface.tools.tactic_parser import FileDependencyAnalysis +from itp_interface.lean.tactic_parser import FileDependencyAnalysis @runtime_checkable class TrainingDataFormat(Protocol): diff --git a/src/test/parsing_helpers_test.py b/src/test/parsing_helpers_test.py new file mode 100644 index 0000000..fe73b85 --- /dev/null +++ b/src/test/parsing_helpers_test.py @@ -0,0 +1,65 @@ +import unittest +from itp_interface.lean.parsing_helpers import parse_lean_text, LeanDeclType + + +class ParsingHelpersTest(unittest.TestCase): + def test_lean_declaration_parsing(self): + """Test parsing of various Lean 4 declaration types""" + test_cases = [ + ( + "Simple Theorem", + "@[simp] lemma foo (x : Nat) : x = x := rfl" + ), + ( + "Context and Doc", + "open Algebra.TensorProduct in\n/-- Doc -/\ntheorem left_of_tensor [Module R] : True where\n out := sorry" + ), + ( + "Inductive (No Proof)", + "/-- The base e -/\ninductive ExBase : Type\n| A\n| B" + ), + ( + "Structure (No Proof)", + "structure MyStruct where\n field1 : Nat\n field2 : Int" + ), + ( + "Mutual (No Proof)", + "mutual\n inductive A | a\n inductive B | b\nend" + ), + ( + "Run Cmd (Fallback)", + "open Lean in\nrun_cmd Command.liftTermElabM do\n logInfo \"hi\"" + ) + ] + + print(f"\n{'TYPE':<15} | {'TEXT BEFORE':<20} | {'DOC':<10} | {'TEXT':<20} | {'PROOF':<15}") + print("-" * 105) + + for name, inp in test_cases: + res = parse_lean_text(inp) + + # Access fields via dot notation + tp = res.decl_type.value + tb = (res.text_before or "").replace('\n', '\\n') + ds = (res.doc_string or "") + tx = (res.text or "").replace('\n', '\\n') + pf = (res.proof or "").replace('\n', '\\n') + + if len(tb) > 18: tb = tb[:18] + "..." + if len(ds) > 8: ds = "/--...-/" + if len(tx) > 18: tx = tx[:18] + "..." + if len(pf) > 12: pf = pf[:12] + "..." 
+ + print(f"{tp:<15} | {tb:<20} | {ds:<10} | {tx:<20} | {pf:<15}") + + # Basic assertions to verify parsing works + self.assertIsNotNone(res.decl_type) + self.assertIsInstance(res.decl_type, LeanDeclType) + + +def main(): + unittest.main() + + +if __name__ == '__main__': + main() diff --git a/src/test/simple_env_test.py b/src/test/simple_env_test.py index 6ca797f..b17a857 100644 --- a/src/test/simple_env_test.py +++ b/src/test/simple_env_test.py @@ -1,6 +1,6 @@ import unittest import os -from itp_interface.tools.tactic_parser import build_lean4_project, build_tactic_parser_if_needed +from itp_interface.lean.tactic_parser import build_lean4_project, build_tactic_parser_if_needed def pretty_print(s1, s2, proof_step, done): print(f"Current Goal:") diff --git a/src/test/test_tactic_parser.py b/src/test/test_tactic_parser.py index cb79825..d988769 100644 --- a/src/test/test_tactic_parser.py +++ b/src/test/test_tactic_parser.py @@ -11,7 +11,7 @@ # Add parent directory to path to import tactic_parser sys.path.insert(0, str(Path(__file__).parent.parent / "itp_interface" / "tools")) -from itp_interface.tools.tactic_parser import TacticParser, print_tactics +from itp_interface.lean.tactic_parser import TacticParser, print_tactics project_path = str(Path(__file__).parent.parent / "data" / "test" / "lean4_proj") From 925ece1b6363c88d8411953debfe0a90577f1e35 Mon Sep 17 00:00:00 2001 From: Amitayush Thakur Date: Wed, 19 Nov 2025 18:24:17 +0000 Subject: [PATCH 13/25] Add build step for lean4_proj and implement log cleanup in CI workflows --- .../github-build-actions-python314t.yaml | 49 +++++++++++++++++++ .github/workflows/github-build-actions.yaml | 33 ++++++++++++- 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/.github/workflows/github-build-actions-python314t.yaml b/.github/workflows/github-build-actions-python314t.yaml index 010a72b..09c9c22 100644 --- a/.github/workflows/github-build-actions-python314t.yaml +++ b/.github/workflows/github-build-actions-python314t.yaml @@ -84,6 +84,12 @@ jobs: source $HOME/.elan/env install-itp-interface + - name: Build lean4_proj + shell: bash + run: | + source $HOME/.elan/env + cd src/data/test/lean4_proj && lake exe cache get && lake build + - name: Check and Init opam version run: | opam --version @@ -100,6 +106,17 @@ jobs: - name: List repository files (debug step) run: find . 
-type f + - name: Clean up logs + run: | + rm -rf .log + echo "Cleaned .log directory for fresh parallel execution test" + + - name: Ray Cleanup + shell: bash + run: | + rm -rf /tmp/* --verbose + + - name: Run Simple Env Test shell: bash run: | @@ -109,6 +126,17 @@ jobs: source $HOME/.elan/env python src/test/simple_env_test.py + - name: Clean up logs + run: | + rm -rf .log + echo "Cleaned .log directory for fresh parallel execution test" + + - name: Ray Cleanup + shell: bash + run: | + rm -rf /tmp/* --verbose + + - name: Run Data Gen Test shell: bash run: | @@ -118,6 +146,17 @@ jobs: source $HOME/.elan/env python src/test/simple_data_gen_test.py + - name: Clean up logs + run: | + rm -rf .log + echo "Cleaned .log directory for fresh parallel execution test" + + - name: Ray Cleanup + shell: bash + run: | + rm -rf /tmp/* --verbose + + - name: Run Data Extraction Test shell: bash run: | @@ -126,3 +165,13 @@ jobs: eval $(opam env) source $HOME/.elan/env python src/test/simple_data_extract_test.py + + - name: Ray Cleanup + shell: bash + run: | + rm -rf /tmp/* --verbose + + - name: Clean up logs + run: | + rm -rf .log + echo "Cleaned .log directory for fresh parallel execution test" diff --git a/.github/workflows/github-build-actions.yaml b/.github/workflows/github-build-actions.yaml index 7814b40..e8d5e1a 100644 --- a/.github/workflows/github-build-actions.yaml +++ b/.github/workflows/github-build-actions.yaml @@ -53,6 +53,12 @@ jobs: source $HOME/.elan/env install-itp-interface + - name: Build lean4_proj + shell: bash + run: | + source $HOME/.elan/env + cd src/data/test/lean4_proj && lake exe cache get && lake build + - name: Check and Init opam version run: | opam --version @@ -69,18 +75,33 @@ jobs: - name: List repository files (debug step) run: find . 
-type f + - name: Clean up logs + run: | + rm -rf .log + echo "Cleaned .log directory for fresh parallel execution test" + - name: Run Simple Env Test shell: bash run: | eval $(opam env) source $HOME/.elan/env python src/test/simple_env_test.py - + + - name: Clean up logs + run: | + rm -rf .log + echo "Cleaned .log directory for fresh parallel execution test" + - name: Ray Cleanup shell: bash run: | rm -rf /tmp/* --verbose + - name: Clean up logs + run: | + rm -rf .log + echo "Cleaned .log directory for fresh parallel execution test" + - name: Run Data Gen Test shell: bash run: | @@ -88,6 +109,11 @@ jobs: source $HOME/.elan/env python src/test/simple_data_gen_test.py + - name: Clean up logs + run: | + rm -rf .log + echo "Cleaned .log directory for fresh parallel execution test" + - name: Ray Cleanup shell: bash run: | @@ -100,6 +126,11 @@ jobs: source $HOME/.elan/env python src/test/simple_data_extract_test.py + - name: Clean up logs + run: | + rm -rf .log + echo "Cleaned .log directory for fresh parallel execution test" + - name: Ray Cleanup shell: bash run: | From 911cbceb2c8d3cdeee76d92021977ef53fabf63d Mon Sep 17 00:00:00 2001 From: Amitayush Thakur Date: Wed, 19 Nov 2025 18:28:52 +0000 Subject: [PATCH 14/25] Migrated all recent Lean code changes to src/itp_interface/lean --- .../lean4_local_data_extraction_transform.py | 0 src/itp_interface/lean/tactic_parser.py | 10 ---------- src/itp_interface/main/run_tool.py | 2 +- .../tools/run_data_generation_transforms.py | 2 +- 4 files changed, 2 insertions(+), 12 deletions(-) rename src/itp_interface/{tools => lean}/lean4_local_data_extraction_transform.py (100%) diff --git a/src/itp_interface/tools/lean4_local_data_extraction_transform.py b/src/itp_interface/lean/lean4_local_data_extraction_transform.py similarity index 100% rename from src/itp_interface/tools/lean4_local_data_extraction_transform.py rename to src/itp_interface/lean/lean4_local_data_extraction_transform.py diff --git a/src/itp_interface/lean/tactic_parser.py b/src/itp_interface/lean/tactic_parser.py index 9c8a6d1..58722ea 100644 --- a/src/itp_interface/lean/tactic_parser.py +++ b/src/itp_interface/lean/tactic_parser.py @@ -778,16 +778,6 @@ def print_errors(errors: List[ErrorInfo], logger: Optional[logging.Logger] = Non if errors: print(f"Error: {errors}") - pp = "src/data/test/Mathlib" - - with TacticParser(project_path=pp) as parser: - with open(os.path.join(pp, "Basic.lean")) as f: - lean_code = f.read() - tactics, errors = parser.parse(lean_code, fail_on_error=False) - print_tactics(tactics) - if errors: - print_errors(errors) - with TacticParser() as parser: # Example 1: Simple proof lean_code = "example : True := by trivial" diff --git a/src/itp_interface/main/run_tool.py b/src/itp_interface/main/run_tool.py index 4778480..769de16 100644 --- a/src/itp_interface/main/run_tool.py +++ b/src/itp_interface/main/run_tool.py @@ -34,7 +34,7 @@ from itp_interface.tools.coq_local_data_generation_transform import LocalDataGenerationTransform as CoqLocalDataGenerationTransform from itp_interface.tools.lean_local_data_generation_transform import LocalDataGenerationTransform as LeanLocalDataGenerationTransform from itp_interface.tools.lean4_local_data_generation_transform import Local4DataGenerationTransform -from itp_interface.tools.lean4_local_data_extraction_transform import Local4DataExtractionTransform +from itp_interface.lean.lean4_local_data_extraction_transform import Local4DataExtractionTransform from itp_interface.tools.isabelle_local_data_generation_transform import 
LocalDataGenerationTransform as IsabelleLocalDataGenerationTransform from itp_interface.tools.run_data_generation_transforms import RunDataGenerationTransforms from itp_interface.tools.log_utils import setup_logger diff --git a/src/itp_interface/tools/run_data_generation_transforms.py b/src/itp_interface/tools/run_data_generation_transforms.py index b4b0b48..c9baa8f 100644 --- a/src/itp_interface/tools/run_data_generation_transforms.py +++ b/src/itp_interface/tools/run_data_generation_transforms.py @@ -29,7 +29,7 @@ from itp_interface.tools.coq_local_data_generation_transform import LocalDataGenerationTransform as CoqLocalDataGenerationTransform from itp_interface.tools.lean_local_data_generation_transform import LocalDataGenerationTransform as LeanLocalDataGenerationTransform from itp_interface.tools.lean4_local_data_generation_transform import Local4DataGenerationTransform as Lean4LocalDataGenerationTransform -from itp_interface.tools.lean4_local_data_extraction_transform import Local4DataExtractionTransform as Lean4LocalDataExtractionTransform +from itp_interface.lean.lean4_local_data_extraction_transform import Local4DataExtractionTransform as Lean4LocalDataExtractionTransform from itp_interface.tools.isabelle_local_data_generation_transform import LocalDataGenerationTransform as IsabelleLocalDataGenerationTransform from itp_interface.tools.coq_training_data_generator import GenericTrainingDataGenerationTransform, TrainingDataGenerationType From ecef84c13008f50fe96935b930c707e1724b5bec Mon Sep 17 00:00:00 2001 From: Amitayush Thakur Date: Wed, 19 Nov 2025 18:53:59 +0000 Subject: [PATCH 15/25] Refactor path handling in tactic parser to use 'lean' directory instead of 'tools' --- src/itp_interface/lean/tactic_parser.py | 4 ++-- src/itp_interface/main/install.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/itp_interface/lean/tactic_parser.py b/src/itp_interface/lean/tactic_parser.py index 58722ea..36cb2c3 100644 --- a/src/itp_interface/lean/tactic_parser.py +++ b/src/itp_interface/lean/tactic_parser.py @@ -278,8 +278,8 @@ class RequestType(Enum): def get_path_to_tactic_parser_project() -> str: """Get the path to the tactic parser project directory.""" - tools_dir = os.path.dirname(__file__) - tactic_parser_path = os.path.join(tools_dir, "tactic_parser") + lean_dir = os.path.dirname(__file__) + tactic_parser_path = os.path.join(lean_dir, "tactic_parser") abs_path = os.path.abspath(tactic_parser_path) return abs_path diff --git a/src/itp_interface/main/install.py b/src/itp_interface/main/install.py index 9bad6ed..0c38ed0 100644 --- a/src/itp_interface/main/install.py +++ b/src/itp_interface/main/install.py @@ -20,8 +20,8 @@ def generate_random_string(length, allowed_chars=None): def install_itp_interface(): print("Installing itp_interface") itp_dir = os.path.dirname(os.path.dirname(file_path)) - tools_dir = os.path.join(itp_dir, "tools") - tactic_parser_dir = os.path.join(tools_dir, "tactic_parser") + lean_dir = os.path.join(itp_dir, "lean") + tactic_parser_dir = os.path.join(lean_dir, "tactic_parser") assert os.path.exists(tactic_parser_dir), f"tactic_parser_dir: {tactic_parser_dir} does not exist" assert os.path.exists(os.path.join(tactic_parser_dir, "lean-toolchain")), f"lean-toolchain does not exist in {tactic_parser_dir}, build has failed" print("tactic_parser_dir: ", tactic_parser_dir) From 30008fbbc75ca0af8b1d0398e66bd73e5eb40e1f Mon Sep 17 00:00:00 2001 From: Amitayush Thakur Date: Wed, 19 Nov 2025 20:25:50 +0000 Subject: [PATCH 16/25] Enhance CI 
workflows by adding disk cleanup steps and reorganizing test execution for Lean and Coq; trimmed the GitHub CI --- .../github-build-actions-python314t.yaml | 66 ++++++++++++------- .github/workflows/github-build-actions.yaml | 62 +++++++++++------ 2 files changed, 86 insertions(+), 42 deletions(-) diff --git a/.github/workflows/github-build-actions-python314t.yaml b/.github/workflows/github-build-actions-python314t.yaml index 09c9c22..30d74ca 100644 --- a/.github/workflows/github-build-actions-python314t.yaml +++ b/.github/workflows/github-build-actions-python314t.yaml @@ -16,10 +16,23 @@ jobs: HOME: /root steps: + + - name: Free Disk Space (Ubuntu) + uses: jlumbroso/free-disk-space@main + with: + # 1. Aggressively remove unneeded pre-installed software + tool-cache: true + android: true + dotnet: true + haskell: true + large-packages: true + docker-images: true + swap-storage: true + - name: Checkout repository uses: actions/checkout@v3 with: - submodules: true # Ensure submodules are checked out + submodules: false # Ensure submodules are checked out - name: Install Miniconda shell: bash @@ -90,19 +103,6 @@ jobs: source $HOME/.elan/env cd src/data/test/lean4_proj && lake exe cache get && lake build - - name: Check and Init opam version - run: | - opam --version - opam init --disable-sandboxing --yes - - - name: Install Coq - run: | - opam switch create simple_grp_theory 4.14.2 - opam switch simple_grp_theory - eval $(opam env) - opam repo add coq-released https://coq.inria.fr/opam/released - opam pin add -y coq-lsp 0.1.8+8.18 - - name: List repository files (debug step) run: find . -type f @@ -116,15 +116,13 @@ jobs: run: | rm -rf /tmp/* --verbose - - - name: Run Simple Env Test + - name: Run Simple Env Lean Test shell: bash run: | export PATH="$HOME/miniconda/bin:$PATH" source $HOME/miniconda/bin/activate py314-ft - eval $(opam env) source $HOME/.elan/env - python src/test/simple_env_test.py + python src/test/simple_env_lean_test.py - name: Clean up logs run: | @@ -136,13 +134,11 @@ jobs: run: | rm -rf /tmp/* --verbose - - name: Run Data Gen Test shell: bash run: | export PATH="$HOME/miniconda/bin:$PATH" source $HOME/miniconda/bin/activate py314-ft - eval $(opam env) source $HOME/.elan/env python src/test/simple_data_gen_test.py @@ -156,13 +152,11 @@ jobs: run: | rm -rf /tmp/* --verbose - - name: Run Data Extraction Test shell: bash run: | export PATH="$HOME/miniconda/bin:$PATH" source $HOME/miniconda/bin/activate py314-ft - eval $(opam env) source $HOME/.elan/env python src/test/simple_data_extract_test.py @@ -175,3 +169,31 @@ jobs: run: | rm -rf .log echo "Cleaned .log directory for fresh parallel execution test" + + - name: Clean up .lake in lean4_proj + shell: bash + run: | + rm -rf src/data/test/lean4_proj/.lake + echo "Cleaned .lake directory in lean4_proj for fresh parallel execution test" + + - name: Check and Init opam version + run: | + opam --version + opam init --disable-sandboxing --yes + + - name: Install Coq + run: | + opam switch create simple_grp_theory 4.14.2 + opam switch simple_grp_theory + eval $(opam env) + opam repo add coq-released https://coq.inria.fr/opam/released + opam pin add -y coq-lsp 0.1.8+8.18 + + - name: Run Simple Env Coq Test + shell: bash + run: | + export PATH="$HOME/miniconda/bin:$PATH" + source $HOME/miniconda/bin/activate py314-ft + eval $(opam env) + source $HOME/.elan/env + python src/test/simple_env_coq_test.py diff --git a/.github/workflows/github-build-actions.yaml b/.github/workflows/github-build-actions.yaml index e8d5e1a..bd81767 100644 
--- a/.github/workflows/github-build-actions.yaml +++ b/.github/workflows/github-build-actions.yaml @@ -16,10 +16,22 @@ jobs: HOME: /root steps: + - name: Free Disk Space (Ubuntu) + uses: jlumbroso/free-disk-space@main + with: + # 1. Aggressively remove unneeded pre-installed software + tool-cache: true + android: true + dotnet: true + haskell: true + large-packages: true + docker-images: true + swap-storage: true + - name: Checkout repository uses: actions/checkout@v3 with: - submodules: true # Ensure submodules are checked out + submodules: false # Ensure submodules are checked out - name: Install Python and pip run: | @@ -59,19 +71,6 @@ jobs: source $HOME/.elan/env cd src/data/test/lean4_proj && lake exe cache get && lake build - - name: Check and Init opam version - run: | - opam --version - opam init --disable-sandboxing --yes - - - name: Install Coq - run: | - opam switch create simple_grp_theory 4.14.2 - opam switch simple_grp_theory - eval $(opam env) - opam repo add coq-released https://coq.inria.fr/opam/released - opam pin add -y coq-lsp 0.1.8+8.18 - - name: List repository files (debug step) run: find . -type f @@ -80,12 +79,11 @@ jobs: rm -rf .log echo "Cleaned .log directory for fresh parallel execution test" - - name: Run Simple Env Test + - name: Run Simple Env Lean Test shell: bash run: | - eval $(opam env) source $HOME/.elan/env - python src/test/simple_env_test.py + python src/test/simple_env_lean_test.py - name: Clean up logs run: | @@ -105,7 +103,6 @@ jobs: - name: Run Data Gen Test shell: bash run: | - eval $(opam env) source $HOME/.elan/env python src/test/simple_data_gen_test.py @@ -122,7 +119,6 @@ jobs: - name: Run Data Extraction Test shell: bash run: | - eval $(opam env) source $HOME/.elan/env python src/test/simple_data_extract_test.py @@ -134,4 +130,30 @@ jobs: - name: Ray Cleanup shell: bash run: | - rm -rf /tmp/* --verbose \ No newline at end of file + rm -rf /tmp/* --verbose + + - name: Clean up .lake in lean4_proj + shell: bash + run: | + rm -rf src/data/test/lean4_proj/.lake + echo "Cleaned .lake directory in lean4_proj for fresh parallel execution test" + + - name: Check and Init opam version + run: | + opam --version + opam init --disable-sandboxing --yes + + - name: Install Coq + run: | + opam switch create simple_grp_theory 4.14.2 + opam switch simple_grp_theory + eval $(opam env) + opam repo add coq-released https://coq.inria.fr/opam/released + opam pin add -y coq-lsp 0.1.8+8.18 + + - name: Run Simple Env Coq Test + shell: bash + run: | + eval $(opam env) + source $HOME/.elan/env + python src/test/simple_env_coq_test.py \ No newline at end of file From c01990b7d5efc23e079648b691706f4f2c529488 Mon Sep 17 00:00:00 2001 From: Amitayush Thakur Date: Wed, 19 Nov 2025 20:32:17 +0000 Subject: [PATCH 17/25] Remove disk cleanup step from CI workflows for Python build actions --- .../workflows/github-build-actions-python314t.yaml | 13 ------------- .github/workflows/github-build-actions.yaml | 12 ------------ 2 files changed, 25 deletions(-) diff --git a/.github/workflows/github-build-actions-python314t.yaml b/.github/workflows/github-build-actions-python314t.yaml index 30d74ca..4f90686 100644 --- a/.github/workflows/github-build-actions-python314t.yaml +++ b/.github/workflows/github-build-actions-python314t.yaml @@ -16,19 +16,6 @@ jobs: HOME: /root steps: - - - name: Free Disk Space (Ubuntu) - uses: jlumbroso/free-disk-space@main - with: - # 1. 
Aggressively remove unneeded pre-installed software - tool-cache: true - android: true - dotnet: true - haskell: true - large-packages: true - docker-images: true - swap-storage: true - - name: Checkout repository uses: actions/checkout@v3 with: diff --git a/.github/workflows/github-build-actions.yaml b/.github/workflows/github-build-actions.yaml index bd81767..e63b419 100644 --- a/.github/workflows/github-build-actions.yaml +++ b/.github/workflows/github-build-actions.yaml @@ -16,18 +16,6 @@ jobs: HOME: /root steps: - - name: Free Disk Space (Ubuntu) - uses: jlumbroso/free-disk-space@main - with: - # 1. Aggressively remove unneeded pre-installed software - tool-cache: true - android: true - dotnet: true - haskell: true - large-packages: true - docker-images: true - swap-storage: true - - name: Checkout repository uses: actions/checkout@v3 with: From 136201c7ebdfd93d4cd37f8dcf10fec65c583f87 Mon Sep 17 00:00:00 2001 From: Amitayush Thakur Date: Wed, 19 Nov 2025 20:39:42 +0000 Subject: [PATCH 18/25] Add unit tests for Coq and Lean4 project building and proof execution --- src/test/simple_env_coq_test.py | 131 ++++++++++++++ ...le_env_test.py => simple_env_lean_test.py} | 169 ++++-------------- 2 files changed, 165 insertions(+), 135 deletions(-) create mode 100644 src/test/simple_env_coq_test.py rename src/test/{simple_env_test.py => simple_env_lean_test.py} (82%) diff --git a/src/test/simple_env_coq_test.py b/src/test/simple_env_coq_test.py new file mode 100644 index 0000000..77c5108 --- /dev/null +++ b/src/test/simple_env_coq_test.py @@ -0,0 +1,131 @@ +import unittest +import os + +def pretty_print(s1, s2, proof_step, done): + print(f"Current Goal:") + print('-'*30) + for goal in s1.training_data_format.start_goals: + hyps = '\n'.join([hyp for hyp in goal.hypotheses]) + print(hyps) + print('|- ', end='') + print(goal.goal) + print(f'*'*30) + print(f"="*30) + print(f"Action: {proof_step}") + print(f"="*30) + print(f"Next Goal:") + print('-'*30) + if s2 is not None: + for goal in s2.training_data_format.start_goals: + hyps = '\n'.join([hyp for hyp in goal.hypotheses]) + print(hyps) + print('|- ', end='') + print(goal.goal) + print(f'*'*30) + print(f"="*30) + print(f"DONE: {done}") + print('-'*30) + if s2 is None and done: + print("No more goals. Proof Finished!") + +class CoqHelper(): + def __init__(self): + self.current_switch = None + + def build_coq_project(self, project_folder): + try: + with os.popen("opam switch show") as proc: + self.current_switch = proc.read().strip() + except: + self.current_switch = None + # Check if the switch exists + # opam switch create simple_grp_theory 4.14.2 + if os.system("opam switch simple_grp_theory") != 0: + cmds = [ + 'opam switch create simple_grp_theory 4.14.2', + 'opam switch simple_grp_theory', + 'eval $(opam env)', + 'opam repo add coq-released https://coq.inria.fr/opam/released', + 'opam pin add -y coq-lsp 0.1.8+8.18' + ] + final_cmd = ' && '.join(cmds) + os.system(final_cmd) + # IMPORTANT NOTE: Make sure to switch to the correct switch before running the code. 
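+        # Note: environment changes from `eval $(opam env)` do not persist across os.system calls,
+        # so each build command below re-evaluates the opam environment before invoking make.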
+ os.system("opam switch simple_grp_theory && eval $(opam env)") + # Clean the project + os.system(f"eval $(opam env) && cd {project_folder} && make clean") + # Build the project + with os.popen(f"eval $(opam env) && cd {project_folder} && make") as proc: + print("Building Coq project...") + print('-'*15 + 'Build Logs' + '-'*15) + print(proc.read()) + print('-'*15 + 'End Build Logs' + '-'*15) + + def switch_to_current_switch(self): + if self.current_switch is not None: + try: + proc = os.popen(f"opam switch {self.current_switch} && eval $(opam env)") + print(proc.read()) + finally: + proc.close() + + +class CoqTest(unittest.TestCase): + def test_simple_coq(self): + from itp_interface.rl.proof_state import ProofState + from itp_interface.rl.proof_action import ProofAction + from itp_interface.rl.simple_proof_env import ProofEnv + from itp_interface.tools.proof_exec_callback import ProofExecutorCallback + from itp_interface.rl.simple_proof_env import ProofEnvReRankStrategy + project_folder = "src/data/test/coq/custom_group_theory/theories" + file_path = "src/data/test/coq/custom_group_theory/theories/grpthm.v" + # Build the project + # cd src/data/test/coq/custom_group_theory/theories && make + helper = CoqHelper() + helper.build_coq_project(project_folder) + language = ProofAction.Language.COQ + theorem_name = "algb_identity_sum" + # Theorem algb_identity_sum : + # forall a, algb_add a e = a. + proof_exec_callback = ProofExecutorCallback( + project_folder=project_folder, + file_path=file_path, + language=language, + always_use_retrieval=False, + keep_local_context=True + ) + always_retrieve_thms = False + retrieval_strategy = ProofEnvReRankStrategy.NO_RE_RANK + env = ProofEnv("test_coq", proof_exec_callback, theorem_name, retrieval_strategy=retrieval_strategy, max_proof_depth=10, always_retrieve_thms=always_retrieve_thms) + proof_steps = [ + 'intros.', + 'destruct a.', + '- reflexivity.', + '- reflexivity.' + ] + with env: + for proof_step in proof_steps: + state, _, next_state, _, done, info = env.step(ProofAction( + ProofAction.ActionType.RUN_TACTIC, + language, + tactics=[proof_step])) + if info.error_message is not None: + print(f"Error: {info.error_message}") + # This prints StateChanged, StateUnchanged, Failed, or Done + print(info.progress) + print('-'*30) + if done: + print("Proof Finished!!") + else: + s1 : ProofState = state + s2 : ProofState = next_state + pretty_print(s1, s2, proof_step, done) + helper.switch_to_current_switch() + + +def main(): + unittest.main() + + +if __name__ == '__main__': + main() diff --git a/src/test/simple_env_test.py b/src/test/simple_env_lean_test.py similarity index 82% rename from src/test/simple_env_test.py rename to src/test/simple_env_lean_test.py index b17a857..abc6d40 100644 --- a/src/test/simple_env_test.py +++ b/src/test/simple_env_lean_test.py @@ -29,10 +29,7 @@ def pretty_print(s1, s2, proof_step, done): if s2 is None and done: print("No more goals. 
Proof Finished!") -class Helper(): - def __init__(self): - self.current_switch = None - +class LeanHelper(): def build_lean4_project(self, project_folder): build_tactic_parser_if_needed() # Build the project @@ -41,43 +38,6 @@ def build_lean4_project(self, project_folder): build_lean4_project(project_folder) - def build_coq_project(self, project_folder): - try: - with os.popen("opam switch show") as proc: - self.current_switch = proc.read().strip() - except: - self.current_switch = None - # Check if the switch exists - # opam switch create simple_grp_theory 4.14.2 - if os.system("opam switch simple_grp_theory") != 0: - cmds = [ - 'opam switch create simple_grp_theory 4.14.2', - 'opam switch simple_grp_theory', - 'eval $(opam env)', - 'opam repo add coq-released https://coq.inria.fr/opam/released', - 'opam pin add -y coq-lsp 0.1.8+8.18' - ] - final_cmd = ' && '.join(cmds) - os.system(final_cmd) - # IMPORTANT NOTE: Make sure to switch to the correct switch before running the code. - os.system("opam switch simple_grp_theory && eval $(opam env)") - # Clean the project - os.system(f"eval $(opam env) && cd {project_folder} && make clean") - # Build the project - with os.popen(f"eval $(opam env) && cd {project_folder} && make") as proc: - print("Building Coq project...") - print('-'*15 + 'Build Logs' + '-'*15) - print(proc.read()) - print('-'*15 + 'End Build Logs' + '-'*15) - - def switch_to_current_switch(self): - if self.current_switch is not None: - try: - proc = os.popen(f"opam switch {self.current_switch} && eval $(opam env)") - print(proc.read()) - finally: - proc.close() - class Lean4Test(unittest.TestCase): def test_simple_lean4(self): from itp_interface.rl.proof_state import ProofState @@ -89,7 +49,7 @@ def test_simple_lean4(self): file_path = "src/data/test/lean4_proj/Lean4Proj/Basic.lean" # Build the project # cd src/data/test/lean4_proj && lake build - helper = Helper() + helper = LeanHelper() helper.build_lean4_project(project_folder) language = ProofAction.Language.LEAN4 theorem_name = '{\"namespace\":\"Lean4Proj2\",\"name\":\"test3\"}' @@ -119,8 +79,8 @@ def test_simple_lean4(self): proof_was_finished = False for proof_step in proof_steps: state, _, next_state, _, done, info = env.step(ProofAction( - ProofAction.ActionType.RUN_TACTIC, - language, + ProofAction.ActionType.RUN_TACTIC, + language, tactics=[proof_step])) if info.error_message is not None: print(f"Error: {info.error_message}") @@ -145,7 +105,7 @@ def test_lean4_backtracking(self): project_folder = "src/data/test/lean4_proj" file_path = "src/data/test/lean4_proj/Lean4Proj/Basic.lean" # Build the project - helper = Helper() + helper = LeanHelper() helper.build_lean4_project(project_folder) language = ProofAction.Language.LEAN4 theorem_name = '{\"namespace\":\"Lean4Proj2\",\"name\":\"test3\"}' @@ -176,20 +136,20 @@ def test_lean4_backtracking(self): print(f"Backtracking at step {idx + 1} i.e. 
{proof_step}") state, _, next_state, _, done, info = env.step( ProofAction( - ProofAction.ActionType.BACKTRACK, + ProofAction.ActionType.BACKTRACK, language)) assert next_state == prev_state, "Backtracking failed" # Replay the last action last_proof_step = proof_steps[idx-1] state, _, next_state, _, done, info = env.step( ProofAction( - ProofAction.ActionType.RUN_TACTIC, - language, + ProofAction.ActionType.RUN_TACTIC, + language, tactics=[last_proof_step])) state, _, next_state, _, done, info = env.step( ProofAction( - ProofAction.ActionType.RUN_TACTIC, - language, + ProofAction.ActionType.RUN_TACTIC, + language, tactics=[proof_step])) prev_state = state if done: @@ -197,57 +157,6 @@ def test_lean4_backtracking(self): proof_was_finished = True assert proof_was_finished, "Proof was not finished" - def test_simple_coq(self): - from itp_interface.rl.proof_state import ProofState - from itp_interface.rl.proof_action import ProofAction - from itp_interface.rl.simple_proof_env import ProofEnv - from itp_interface.tools.proof_exec_callback import ProofExecutorCallback - from itp_interface.rl.simple_proof_env import ProofEnvReRankStrategy - project_folder = "src/data/test/coq/custom_group_theory/theories" - file_path = "src/data/test/coq/custom_group_theory/theories/grpthm.v" - # Build the project - # cd src/data/test/coq/custom_group_theory/theories && make - helper = Helper() - helper.build_coq_project(project_folder) - language = ProofAction.Language.COQ - theorem_name = "algb_identity_sum" - # Theorem algb_identity_sum : - # forall a, algb_add a e = a. - proof_exec_callback = ProofExecutorCallback( - project_folder=project_folder, - file_path=file_path, - language=language, - always_use_retrieval=False, - keep_local_context=True - ) - always_retrieve_thms = False - retrieval_strategy = ProofEnvReRankStrategy.NO_RE_RANK - env = ProofEnv("test_coq", proof_exec_callback, theorem_name, retrieval_strategy=retrieval_strategy, max_proof_depth=10, always_retrieve_thms=always_retrieve_thms) - proof_steps = [ - 'intros.', - 'destruct a.', - '- reflexivity.', - '- reflexivity.' 
- ] - with env: - for proof_step in proof_steps: - state, _, next_state, _, done, info = env.step(ProofAction( - ProofAction.ActionType.RUN_TACTIC, - language, - tactics=[proof_step])) - if info.error_message is not None: - print(f"Error: {info.error_message}") - # This prints StateChanged, StateUnchanged, Failed, or Done - print(info.progress) - print('-'*30) - if done: - print("Proof Finished!!") - else: - s1 : ProofState = state - s2 : ProofState = next_state - pretty_print(s1, s2, proof_step, done) - helper.switch_to_current_switch() - def test_simple_lean_calc(self): from itp_interface.rl.proof_state import ProofState from itp_interface.rl.proof_action import ProofAction @@ -258,7 +167,7 @@ def test_simple_lean_calc(self): file_path = "src/data/test/lean4_proj/Lean4Proj/Basic.lean" # Build the project # cd src/data/test/lean4_proj && lake build - helper = Helper() + helper = LeanHelper() helper.build_lean4_project(project_folder) language = ProofAction.Language.LEAN4 theorem_name = "{\"namespace\":\"Lean4Proj1\",\"name\":\"test_calc\"}" @@ -290,8 +199,8 @@ def test_simple_lean_calc(self): proof_was_finished = False for proof_step in proof_steps: state, _, next_state, _, done, info = env.step(ProofAction( - ProofAction.ActionType.RUN_TACTIC, - language, + ProofAction.ActionType.RUN_TACTIC, + language, tactics=[proof_step])) if info.error_message is not None: print(f"Error: {info.error_message}") @@ -319,7 +228,7 @@ def test_simple_lean_calc_with_validation(self): file_path = "src/data/test/lean4_proj/Lean4Proj/Basic.lean" # Build the project # cd src/data/test/lean4_proj && lake build - helper = Helper() + helper = LeanHelper() helper.build_lean4_project(project_folder) language = ProofAction.Language.LEAN4 theorem_name = "{\"namespace\":\"Lean4Proj1\",\"name\":\"test_calc\"}" @@ -351,8 +260,8 @@ def test_simple_lean_calc_with_validation(self): proof_was_finished = False for proof_step in proof_steps: state, _, next_state, _, done, info = env.step(ProofAction( - ProofAction.ActionType.RUN_TACTIC, - language, + ProofAction.ActionType.RUN_TACTIC, + language, tactics=[proof_step])) if info.error_message is not None: print(f"Error: {info.error_message}") @@ -386,7 +295,7 @@ def test_simple_lean_enforce_done_test(self): file_path = "src/data/test/lean4_proj/Lean4Proj/Basic.lean" # Build the project # cd src/data/test/lean4_proj && lake build - helper = Helper() + helper = LeanHelper() helper.build_lean4_project(project_folder) language = ProofAction.Language.LEAN4 theorem_name = "{\"namespace\":\"Lean4Proj1\",\"name\":\"test_calc\"}" @@ -420,8 +329,8 @@ def test_simple_lean_enforce_done_test(self): proof_finished = False for proof_step in proof_steps: state, _, next_state, _, done, info = env.step(ProofAction( - ProofAction.ActionType.RUN_TACTIC, - language, + ProofAction.ActionType.RUN_TACTIC, + language, tactics=[proof_step])) if info.error_message is not None: print(f"Error: {info.error_message}") @@ -450,7 +359,7 @@ def test_simple_lean4_done_test(self): file_path = "src/data/test/lean4_proj/Lean4Proj/Basic.lean" # Build the project # cd src/data/test/lean4_proj && lake build - helper = Helper() + helper = LeanHelper() helper.build_lean4_project(project_folder) language = ProofAction.Language.LEAN4 theorem_name = '{\"namespace\":\"Lean4Proj2\",\"name\":\"test3\"}' @@ -477,8 +386,8 @@ def test_simple_lean4_done_test(self): with env: for proof_step in proof_steps: state, _, next_state, _, done, info = env.step(ProofAction( - ProofAction.ActionType.RUN_TACTIC, - language, + 
ProofAction.ActionType.RUN_TACTIC, + language, tactics=[proof_step])) if info.error_message is not None: print(f"Error: {info.error_message}") @@ -502,7 +411,7 @@ def test_simple_lean4_have_test(self): file_path = "src/data/test/lean4_proj/Lean4Proj/Basic.lean" # Build the project # cd src/data/test/lean4_proj && lake build - helper = Helper() + helper = LeanHelper() helper.build_lean4_project(project_folder) language = ProofAction.Language.LEAN4 theorem_name = '{\"namespace\":\"Lean4Proj2\",\"name\":\"imo_1959_p1\"}' @@ -540,8 +449,8 @@ def test_simple_lean4_have_test(self): env.set_max_proof_step_length(10000) for proof_step in proof_steps: state, m_action, next_state, _, done, info = env.step(ProofAction( - ProofAction.ActionType.RUN_TACTIC, - language, + ProofAction.ActionType.RUN_TACTIC, + language, tactics=[proof_step])) if info.error_message is not None: print(f"Error: {info.error_message}") @@ -560,7 +469,7 @@ def test_simple_lean4_have_test(self): s1 : ProofState = state s2 : ProofState = next_state pretty_print(s1, s2, proof_step, done) - + def test_simple_lean4_with_error(self): from itp_interface.rl.proof_state import ProofState from itp_interface.rl.proof_action import ProofAction @@ -571,7 +480,7 @@ def test_simple_lean4_with_error(self): file_path = "src/data/test/lean4_proj/Lean4Proj/Basic.lean" # Build the project # cd src/data/test/lean4_proj && lake build - helper = Helper() + helper = LeanHelper() helper.build_lean4_project(project_folder) language = ProofAction.Language.LEAN4 theorem_name = '{\"namespace\":\"Lean4Proj2\",\"name\":\"test3\"}' @@ -599,8 +508,8 @@ def test_simple_lean4_with_error(self): with env: for i, proof_step in enumerate(proof_steps): state, _, next_state, _, done, info = env.step(ProofAction( - ProofAction.ActionType.RUN_TACTIC, - language, + ProofAction.ActionType.RUN_TACTIC, + language, tactics=[proof_step])) if info.error_message is not None: print(f"Error: {info.error_message}") @@ -631,7 +540,7 @@ def test_simple_lean4_multiline_multigoal(self): file_path = "src/data/test/lean4_proj/Lean4Proj/Basic.lean" # Build the project # cd src/data/test/lean4_proj && lake build - helper = Helper() + helper = LeanHelper() helper.build_lean4_project(project_folder) language = ProofAction.Language.LEAN4 theorem_name = '{\"namespace\":\"Lean4Proj2\",\"name\":\"complicated_have\"}' @@ -659,8 +568,8 @@ def test_simple_lean4_multiline_multigoal(self): proof_was_finished = False for proof_step in proof_steps: state, action, next_state, _, done, info = env.step(ProofAction( - ProofAction.ActionType.RUN_TACTIC, - language, + ProofAction.ActionType.RUN_TACTIC, + language, tactics=[proof_step])) proof_step = action.kwargs.get('tactics', ['INVALID'])[0] if info.error_message is not None: @@ -677,20 +586,10 @@ def test_simple_lean4_multiline_multigoal(self): pretty_print(s1, s2, proof_step, done) assert proof_was_finished, "Proof was not finished" + def main(): unittest.main() - # Run only the Lean 4 tests - # t = Lean4Test() - # t.test_simple_lean4_multiline_multigoal() - # t.test_simple_lean4() - # t.test_lean4_backtracking() - # t.test_simple_lean4_done_test() - # t.test_simple_lean_calc() - # t.test_simple_lean_calc_with_validation() - # t.test_simple_lean4_with_error() - # t.test_simple_lean4_have_test() - # t.test_simple_lean_enforce_done_test() if __name__ == '__main__': - main() \ No newline at end of file + main() From 6a850cd4702166e359ca794db1a780997df4631d Mon Sep 17 00:00:00 2001 From: Amitayush Thakur Date: Wed, 19 Nov 2025 23:12:39 +0000 Subject: [PATCH 
19/25] Enhance Lean parsing and database error handling - Added methods to check and strip Lean core file/module paths in Local4DataExtractionTransform. - Improved LeanDeclParser to extract declaration names and handle various declaration types. - Updated LeanDeclarationDB to include error handling for database queries. - Modified parsing_helpers_test to include additional test cases for new functionality. --- .../lean4_local_data_extraction_transform.py | 83 ++++++++++++- src/itp_interface/lean/parsing_helpers.py | 115 +++++++++++++++--- src/itp_interface/tools/simple_sqlite.py | 45 +++++-- src/test/parsing_helpers_test.py | 18 +-- 4 files changed, 224 insertions(+), 37 deletions(-) diff --git a/src/itp_interface/lean/lean4_local_data_extraction_transform.py b/src/itp_interface/lean/lean4_local_data_extraction_transform.py index 27e91f6..67bd6d4 100644 --- a/src/itp_interface/lean/lean4_local_data_extraction_transform.py +++ b/src/itp_interface/lean/lean4_local_data_extraction_transform.py @@ -10,6 +10,7 @@ import uuid from pathlib import Path from itp_interface.lean.simple_lean4_sync_executor import SimpleLean4SyncExecutor +from itp_interface.lean.parsing_helpers import LeanDeclParser, LeanParseResult, LeanDeclType from itp_interface.tools.coq_training_data_generator import GenericTrainingDataGenerationTransform, TrainingDataGenerationType from itp_interface.tools.training_data_format import MergableCollection, TrainingDataMetadataFormat, ExtractionDataCollection from itp_interface.tools.training_data import TrainingData, DataLayoutFormat @@ -65,16 +66,71 @@ def _remove_lake_file_prefix(self, path: Path) -> str: return str(new_path) return str(path) + def _check_if_lean_core_file(self, file_path: Path) -> bool: + parts = file_path.parts + # If `leanprover--lean4---v4.24.0/src/lean/` is in the path, it's a core file + for i in range(len(parts) - 4): + if (parts[i].startswith("leanprover--lean4---v") and + parts[i+1] == "src" and + parts[i+2] == "lean"): + return True + return False + + def _check_if_lean_core_module(self, module_name: str) -> bool: + parts = module_name.split('.') + # leanprover--lean4---v4.24.0.src.lean + idx = None + for i in range(len(parts) - 2): + if parts[i].startswith("leanprover--lean4---v"): + idx = i + if idx is not None: + for i in range(idx, len(parts)): + if parts[i] == "src" and i + 1 < len(parts) and parts[i + 1] == "lean": + return True + return False + + def _strip_core_file_path(self, file_path: Path) -> str: + # Take only the parts after `leanprover--lean4---v4.24.0/src/lean/` + parts = file_path.parts + for i in range(len(parts) - 4): + if (parts[i].startswith("leanprover--lean4---v") and + parts[i+1] == "src" and + parts[i+2] == "lean"): + new_parts = parts[i+3:] + new_path = Path(*new_parts) + return str(new_path) + return str(file_path) + + def _strip_core_module_name(self, module_name: str) -> str: + parts = module_name.split('.') + # leanprover--lean4---v4.24.0.src.lean + idx: int | None = None + for i in range(len(parts) - 2): + if parts[i].startswith("leanprover--lean4---v"): + idx = i + if idx is not None: + for i in range(idx, len(parts)): + if parts[i] == "src" and i + 1 < len(parts) and parts[i + 1] == "lean": + new_parts = parts[i + 2:] + return '.'.join(new_parts) + return module_name + def _get_file_path(self, project_path: str, file_path: str) -> str: fp = Path(file_path) pp = Path(project_path) fp_abs = fp.resolve() pp_abs = pp.resolve() + if self._check_if_lean_core_file(fp_abs): + stripped_path = self._strip_core_file_path(fp_abs) + 
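+            # e.g. '~/.elan/toolchains/leanprover--lean4---v4.24.0/src/lean/Init/Prelude.lean' -> 'Init/Prelude.lean'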
return stripped_path relative_path = fp_abs.relative_to(pp_abs) rel_file_path = self._remove_lake_file_prefix(relative_path) return str(rel_file_path) def _get_module_name(self, project_path: str, module_name: str) -> str: + if self._check_if_lean_core_module(module_name): + stripped_module = self._strip_core_module_name(module_name) + return stripped_module pp = Path(project_path) pp_abs = pp.resolve() pp_module = str(pp_abs).replace('/', '.') @@ -122,6 +178,28 @@ def _preprocess_declaration( self, project_path: str, decl: DeclWithDependencies) -> None: + # Filter all unknown types + if decl.decl_info.decl_type == LeanDeclType.UNKNOWN.value: + # Check if we have docstring or not + if decl.decl_info.doc_string is None or decl.decl_info.doc_string.strip() != "": + parser = LeanDeclParser(decl.decl_info.text) + parse_result = parser.parse() + if parse_result.doc_string is not None: + decl.decl_info.doc_string = parse_result.doc_string.strip() + if parse_result.decl_type != LeanDeclType.UNKNOWN: + decl.decl_info.decl_type = str(parse_result.decl_type) + if parse_result.text is not None: + full_text = [] + if parse_result.text_before is not None: + full_text.append(parse_result.text_before.strip()) + full_text.append(parse_result.text.strip()) + text = "\n".join(full_text) + decl.decl_info.text = text + # Update the proof if not already present + if decl.decl_info.proof is None and parse_result.proof is not None: + decl.decl_info.proof = parse_result.proof.strip() + if parse_result.name is not None: + decl.decl_info.name = parse_result.name.strip() pass @@ -159,6 +237,7 @@ def __call__(self, assert len(file_dep_analyses) == 1, "Expected exactly one FileDependencyAnalysis object" last_decl_id = None + self._preprocess_declarations(project_path, file_dep_analyses) self._remap_local_dependency(project_path, file_dep_analyses) for fda in file_dep_analyses: @@ -221,7 +300,9 @@ def __call__(self, # project_dir = 'data/test/lean4_proj/' project_dir = 'data/test/Mathlib' # file_name = 'data/test/lean4_proj/Lean4Proj/Basic.lean' - file_name = 'data/test/Mathlib/.lake/packages/mathlib/Mathlib/Algebra/Divisibility/Basic.lean' + # file_name = 'data/test/Mathlib/.lake/packages/mathlib/Mathlib/Algebra/Divisibility/Basic.lean' + home_path = str(Path.home()) + file_name = f'{home_path}/.elan/toolchains/leanprover--lean4---v4.24.0/src/lean/Init/Prelude.lean' project_id = project_dir #.replace('/', '.') time_str = time.strftime("%Y%m%d-%H%M%S") output_path = f".log/local_data_generation_transform/data/{time_str}" diff --git a/src/itp_interface/lean/parsing_helpers.py b/src/itp_interface/lean/parsing_helpers.py index 8ff13ee..1e63c43 100644 --- a/src/itp_interface/lean/parsing_helpers.py +++ b/src/itp_interface/lean/parsing_helpers.py @@ -22,9 +22,13 @@ class LeanDeclType(Enum): MUTUAL = "mutual" UNKNOWN = "unknown" + def __str__(self) -> str: + return str(self.value).lower() + @dataclass class LeanParseResult: decl_type: LeanDeclType + name: Optional[str] = None text_before: Optional[str] = None doc_string: Optional[str] = None text: Optional[str] = None @@ -37,7 +41,6 @@ class LeanDeclParser: """ # Keywords that mark the start of a declaration - # We generate this set from the Enum values for fast lookup DECL_KEYWORDS = {m.value for m in LeanDeclType if m != LeanDeclType.UNKNOWN} # Types for which we should NOT attempt to extract a proof/body @@ -48,18 +51,24 @@ class LeanDeclParser: LeanDeclType.CLASS } + # Types that typically don't have a name + NO_NAME_TYPES = { + LeanDeclType.EXAMPLE, + 
LeanDeclType.MUTUAL, + LeanDeclType.UNKNOWN + } + def __init__(self, text: str): self.text = text self.n = len(text) self.tokens = [] self.docstring_range = None # (start, end) - self.docstring_content = None # Key Indices and Info self.decl_start = -1 self.proof_start = -1 - self.context_end = 0 self.decl_type: LeanDeclType = LeanDeclType.UNKNOWN + self.decl_name: Optional[str] = None def parse(self) -> LeanParseResult: self._tokenize() @@ -107,9 +116,9 @@ def _tokenize(self): else: i += 1 + # Capture the FIRST docstring found if is_doc and self.docstring_range is None: self.docstring_range = (start_idx, i) - self.docstring_content = self.text[start_idx:i] continue # 2. Handle Strings/Chars @@ -139,10 +148,16 @@ def _tokenize(self): char = self.text[i] # Nesting tracking - if char in "([{": nesting += 1 - elif char in ")]}": nesting = max(0, nesting - 1) + if char in "([{": + nesting += 1 + i += 1 + continue + elif char in ")]}": + nesting = max(0, nesting - 1) + i += 1 + continue - # Token detection + # Token detection (only at top level) if nesting == 0: # Check for 'in' keyword (standalone) if self._is_keyword_at(i, "in"): @@ -187,6 +202,7 @@ def _analyze_structure(self): """ candidate_decl = -1 decl_keyword_str = None + decl_keyword_end = -1 # Pass 1: Find Declaration Start for t_type, t_start, t_end in self.tokens: @@ -199,6 +215,7 @@ def _analyze_structure(self): candidate_decl = t_start if decl_keyword_str is None: decl_keyword_str = self.text[t_start:t_end] + decl_keyword_end = t_end elif t_type == "ATTR": if candidate_decl == -1: @@ -216,6 +233,10 @@ def _analyze_structure(self): else: self.decl_type = LeanDeclType.UNKNOWN + # Extract Name + if self.decl_type not in self.NO_NAME_TYPES and decl_keyword_end != -1: + self.decl_name = self._extract_name_after(decl_keyword_end) + # Pass 2: Find Proof Start skip_proof = self.decl_type in self.NO_PROOF_TYPES @@ -225,9 +246,71 @@ def _analyze_structure(self): self.proof_start = t_start break else: - # Fallback: No declaration found pass + def _extract_name_after(self, idx: int) -> Optional[str]: + """ + Finds the first identifier after the given index, skipping comments and whitespace. + Returns None if it hits a symbol (e.g. '(', '{', ':') before a name. + """ + i = idx + while i < self.n: + c = self.text[i] + + # Skip Whitespace + if c.isspace(): + i += 1 + continue + + # Skip Line Comments + if self.text.startswith("--", i): + i = self.text.find('\n', i) + if i == -1: return None + continue + + # Skip Block Comments + if self.text.startswith("/-", i): + # Quick skip for simple block comments, logic same as tokenizer + depth = 1 + i += 2 + while i < self.n and depth > 0: + if self.text.startswith("/-", i): + depth += 1 + i += 2 + elif self.text.startswith("-/", i): + depth -= 1 + i += 2 + else: + i += 1 + continue + + # Identifier Start Check + # Lean identifiers can be French-quoted «name» or standard + # If it starts with a symbol like (, {, [, :, it's anonymous + if not (c.isalnum() or c == '_' or c == '«'): + return None + + # Extract + start = i + if c == '«': + end = self.text.find('»', start) + if end != -1: + return self.text[start:end+1] + else: + # Malformed? Just return rest of line + return None + else: + # Standard identifier (alphanum + . 
+ _) + while i < self.n: + curr = self.text[i] + if curr.isalnum() or curr == '_' or curr == '.': + i += 1 + else: + break + return self.text[start:i] + + return None + def _construct_result(self) -> LeanParseResult: # Case 1: No declaration found if self.decl_start == -1: @@ -249,7 +332,6 @@ def _construct_result(self) -> LeanParseResult: raw_decl = self.text[split_idx:decl_end] doc_content = None - # Handle Docstring Extraction logic if self.docstring_range: ds_start, ds_end = self.docstring_range doc_content = self.text[ds_start:ds_end] @@ -260,18 +342,13 @@ def _construct_result(self) -> LeanParseResult: pre_part2 = self.text[ds_end:split_idx] raw_pre = pre_part1 + pre_part2 - # Final cleanup - text_before = raw_pre.strip() or None - text = raw_decl.strip() or None - doc = doc_content or None - proof = proof_content or None - return LeanParseResult( decl_type=self.decl_type, - text_before=text_before, - doc_string=doc, - text=text, - proof=proof + name=self.decl_name, + text_before=raw_pre.strip() or None, + doc_string=doc_content or None, + text=raw_decl.strip() or None, + proof=proof_content or None ) # --- Helpers --- diff --git a/src/itp_interface/tools/simple_sqlite.py b/src/itp_interface/tools/simple_sqlite.py index 5eacefc..6c09486 100644 --- a/src/itp_interface/tools/simple_sqlite.py +++ b/src/itp_interface/tools/simple_sqlite.py @@ -697,20 +697,45 @@ def get_statistics(self) -> Dict[str, int]: stats = {} - cursor.execute("SELECT COUNT(*) FROM files") - stats['total_files'] = cursor.fetchone()[0] + try: + cursor.execute("SELECT COUNT(*) FROM files") + result = cursor.fetchone() + stats['total_files'] = result[0] if result else 0 + except Exception as e: + print(f"Warning: Error fetching files count: {e}") + stats['total_files'] = 0 - cursor.execute("SELECT COUNT(*) FROM declarations") - stats['total_declarations'] = cursor.fetchone()[0] + try: + cursor.execute("SELECT COUNT(*) FROM declarations") + result = cursor.fetchone() + stats['total_declarations'] = result[0] if result else 0 + except Exception as e: + print(f"Warning: Error fetching declarations count: {e}") + stats['total_declarations'] = 0 - cursor.execute("SELECT COUNT(*) FROM declaration_dependencies") - stats['total_dependencies'] = cursor.fetchone()[0] + try: + cursor.execute("SELECT COUNT(*) FROM declaration_dependencies") + result = cursor.fetchone() + stats['total_dependencies'] = result[0] if result else 0 + except Exception as e: + print(f"Warning: Error fetching dependencies count: {e}") + stats['total_dependencies'] = 0 - cursor.execute("SELECT COUNT(*) FROM declarations WHERE file_path IS NULL") - stats['unresolved_declarations'] = cursor.fetchone()[0] + try: + cursor.execute("SELECT COUNT(*) FROM declarations WHERE file_path IS NULL") + result = cursor.fetchone() + stats['unresolved_declarations'] = result[0] if result else 0 + except Exception as e: + print(f"Warning: Error fetching unresolved declarations count: {e}") + stats['unresolved_declarations'] = 0 - cursor.execute("SELECT COUNT(*) FROM imports") - stats['total_imports'] = cursor.fetchone()[0] + try: + cursor.execute("SELECT COUNT(*) FROM imports") + result = cursor.fetchone() + stats['total_imports'] = result[0] if result else 0 + except Exception as e: + print(f"Warning: Error fetching imports count: {e}") + stats['total_imports'] = 0 return stats diff --git a/src/test/parsing_helpers_test.py b/src/test/parsing_helpers_test.py index fe73b85..1726c05 100644 --- a/src/test/parsing_helpers_test.py +++ b/src/test/parsing_helpers_test.py @@ -29,28 
+29,32 @@ def test_lean_declaration_parsing(self): ( "Run Cmd (Fallback)", "open Lean in\nrun_cmd Command.liftTermElabM do\n logInfo \"hi\"" + ), + ( + "Inductive Nat (hard case)", + "set_option genCtorIdx false in\n/--\nThe natural numbers, starting at zero.\n\nThis type is special-cased by both the kernel and the compiler, and overridden with an efficient\nimplementation. Both use a fast arbitrary-precision arithmetic library (usually\n[GMP](https://gmplib.org/)); at runtime, `Nat` values that are sufficiently small are unboxed.\n-/\ninductive Nat where\n /--\n Zero, the smallest natural number.\n\n Using `Nat.zero` explicitly should usually be avoided in favor of the literal `0`, which is the\n [simp normal form](lean-manual://section/simp-normal-forms).\n -/\n | zero : Nat\n /--\n The successor of a natural number `n`.\n\n Using `Nat.succ n` should usually be avoided in favor of `n + 1`, which is the [simp normal\n form](lean-manual://section/simp-normal-forms).\n -/\n | succ (n : Nat) : Nat\n" ) ] - print(f"\n{'TYPE':<15} | {'TEXT BEFORE':<20} | {'DOC':<10} | {'TEXT':<20} | {'PROOF':<15}") - print("-" * 105) + print(f"{'TYPE':<12} | {'NAME':<10} | {'TEXT BEFORE':<20} | {'DOC':<10} | {'TEXT':<20} | {'PROOF':<15}") + print("-" * 115) - for name, inp in test_cases: + for test_name, inp in test_cases: res = parse_lean_text(inp) - - # Access fields via dot notation + tp = res.decl_type.value + nm = res.name or "" tb = (res.text_before or "").replace('\n', '\\n') ds = (res.doc_string or "") tx = (res.text or "").replace('\n', '\\n') pf = (res.proof or "").replace('\n', '\\n') - + if len(tb) > 18: tb = tb[:18] + "..." if len(ds) > 8: ds = "/--...-/" if len(tx) > 18: tx = tx[:18] + "..." if len(pf) > 12: pf = pf[:12] + "..." - print(f"{tp:<15} | {tb:<20} | {ds:<10} | {tx:<20} | {pf:<15}") + print(f"{tp:<12} | {nm:<10} | {tb:<20} | {ds:<10} | {tx:<20} | {pf:<15}") # Basic assertions to verify parsing works self.assertIsNotNone(res.decl_type) From debf4f04eb1644c9a3f73067a84c1c273d6c96c9 Mon Sep 17 00:00:00 2001 From: Amitayush Thakur Date: Fri, 21 Nov 2025 01:58:17 +0000 Subject: [PATCH 20/25] Add scipy as an optional dependency; enhance dependency graph functions and UI for better declaration search --- pyproject.toml | 1 + src/app/data_explorer/graph_utils.py | 182 ++++++++++------- src/app/data_explorer/lean_db_explorer.py | 232 +++++++++++++++------- src/app/itp-gui/app.py | 2 +- 4 files changed, 276 insertions(+), 141 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5e1ec6a..0432338 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ dependencies = [ [project.optional-dependencies] app = [ "streamlit>=1.28.0", + "scipy>=1.16.0", "plotly>=5.17.0", "networkx>=3.1", "pandas>=2.0.0" diff --git a/src/app/data_explorer/graph_utils.py b/src/app/data_explorer/graph_utils.py index 5d0ed40..235773f 100644 --- a/src/app/data_explorer/graph_utils.py +++ b/src/app/data_explorer/graph_utils.py @@ -20,20 +20,26 @@ def build_dependency_graph(db: LeanDeclarationDB) -> nx.DiGraph: """ Build a directed graph from the dependency relationships. + Only includes declarations with non-None decl_type (filters out unresolved declarations). + This is a read-only operation - the database is not modified. 
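+    Dependency edges that touch a filtered-out (unresolved) declaration are dropped as well.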
+ Args: db: LeanDeclarationDB instance Returns: - NetworkX directed graph where edges point from dependent to dependency - (A -> B means A depends on B) + NetworkX directed graph where edges point from dependency to dependent + (B -> A means "A depends on B") + - Root nodes (depend on nothing): in_degree == 0 + - Leaf nodes (nothing depends on them): out_degree == 0 """ G = nx.DiGraph() - # Add all declarations as nodes + # Add all declarations as nodes, excluding those with None decl_type cursor = db.conn.cursor() cursor.execute(""" SELECT decl_id, name, namespace, decl_type, file_path FROM declarations + WHERE decl_type IS NOT NULL """) for row in cursor.fetchall(): @@ -44,13 +50,18 @@ def build_dependency_graph(db: LeanDeclarationDB) -> nx.DiGraph: file_path=file_path) # Add all dependency edges + # Edge direction: to_id -> from_id means "from_id depends on to_id" + # This way, root nodes (declarations that depend on nothing) have in_degree == 0 cursor.execute(""" SELECT from_decl_id, to_decl_id FROM declaration_dependencies """) for from_id, to_id in cursor.fetchall(): - G.add_edge(from_id, to_id) + # Only add edge if both nodes exist in the graph (not filtered out) + if to_id in G.nodes and from_id in G.nodes: + # Flip edge direction: dependency -> dependent + G.add_edge(to_id, from_id) return G @@ -91,37 +102,51 @@ def get_dependencies_closure( Returns: Tuple of (list of declaration dicts, subgraph) """ - G = build_dependency_graph(db) - - if decl_id not in G: - return [], nx.DiGraph() - - # Get all reachable nodes (BFS limited by depth) - visited = set() - queue = [(decl_id, 0)] - visited.add(decl_id) - - while queue: - current, depth = queue.pop(0) - if depth >= max_depth: - continue - - for neighbor in G.successors(current): - if neighbor not in visited: - visited.add(neighbor) - queue.append((neighbor, depth + 1)) - - # Create subgraph - subgraph = G.subgraph(visited).copy() - - # Get declaration info for all nodes - decls = [] - for node_id in visited: - decl = db.get_declaration_by_decl_id(node_id) - if decl: - decls.append(decl) - - return decls, subgraph + try: + G = build_dependency_graph(db) + + if decl_id not in G: + return [], nx.DiGraph() + + # Get all dependencies (follow incoming edges since edge direction is flipped) + # Edge B -> A means "A depends on B", so dependencies of A are predecessors + visited = set() + queue = [(decl_id, 0)] + visited.add(decl_id) + + while queue: + current, depth = queue.pop(0) + if depth >= max_depth: + continue + + try: + # Follow incoming edges to find dependencies + for neighbor in G.predecessors(current): + if neighbor not in visited: + visited.add(neighbor) + queue.append((neighbor, depth + 1)) + except Exception as e: + print(f"Warning: Error processing predecessors of {current}: {e}") + continue + + # Create subgraph + subgraph = G.subgraph(visited).copy() + + # Get declaration info for all nodes + decls = [] + for node_id in visited: + try: + decl = db.get_declaration_by_decl_id(node_id) + if decl: + decls.append(decl) + except Exception as e: + print(f"Warning: Error fetching declaration {node_id}: {e}") + continue + + return decls, subgraph + except Exception as e: + print(f"Error in get_dependencies_closure: {e}") + raise def get_dependents_closure( @@ -140,37 +165,51 @@ def get_dependents_closure( Returns: Tuple of (list of declaration dicts, subgraph) """ - G = build_dependency_graph(db) - - if decl_id not in G: - return [], nx.DiGraph() - - # Get all nodes that can reach this node (reverse BFS) - visited = set() - queue = 
[(decl_id, 0)] - visited.add(decl_id) - - while queue: - current, depth = queue.pop(0) - if depth >= max_depth: - continue - - for neighbor in G.predecessors(current): - if neighbor not in visited: - visited.add(neighbor) - queue.append((neighbor, depth + 1)) - - # Create subgraph - subgraph = G.subgraph(visited).copy() - - # Get declaration info for all nodes - decls = [] - for node_id in visited: - decl = db.get_declaration_by_decl_id(node_id) - if decl: - decls.append(decl) - - return decls, subgraph + try: + G = build_dependency_graph(db) + + if decl_id not in G: + return [], nx.DiGraph() + + # Get all dependents (follow outgoing edges since edge direction is flipped) + # Edge B -> A means "A depends on B", so dependents of B are successors + visited = set() + queue = [(decl_id, 0)] + visited.add(decl_id) + + while queue: + current, depth = queue.pop(0) + if depth >= max_depth: + continue + + try: + # Follow outgoing edges to find dependents + for neighbor in G.successors(current): + if neighbor not in visited: + visited.add(neighbor) + queue.append((neighbor, depth + 1)) + except Exception as e: + print(f"Warning: Error processing successors of {current}: {e}") + continue + + # Create subgraph + subgraph = G.subgraph(visited).copy() + + # Get declaration info for all nodes + decls = [] + for node_id in visited: + try: + decl = db.get_declaration_by_decl_id(node_id) + if decl: + decls.append(decl) + except Exception as e: + print(f"Warning: Error fetching declaration {node_id}: {e}") + continue + + return decls, subgraph + except Exception as e: + print(f"Error in get_dependents_closure: {e}") + raise def visualize_graph(G: nx.DiGraph, title: str = "Dependency Graph") -> go.Figure: @@ -255,8 +294,7 @@ def visualize_graph(G: nx.DiGraph, title: str = "Dependency Graph") -> go.Figure # Create figure fig = go.Figure(data=[edge_trace, node_trace], layout=go.Layout( - title=title, - titlefont_size=16, + title=dict(text=title, font=dict(size=16)), showlegend=False, hovermode='closest', margin=dict(b=0, l=0, r=0, t=40), @@ -272,7 +310,7 @@ def get_forest_statistics(G: nx.DiGraph, forest: Set[str]) -> Dict[str, Any]: Get statistics for a forest (connected component). 
Args: - G: NetworkX directed graph + G: NetworkX directed graph (edge B->A means "A depends on B") forest: Set of decl_ids in the forest Returns: @@ -280,10 +318,12 @@ def get_forest_statistics(G: nx.DiGraph, forest: Set[str]) -> Dict[str, Any]: """ subgraph = G.subgraph(forest) - # Find root nodes (nodes with no incoming edges from within forest) + # Find root nodes (declarations that depend on nothing) + # With edge direction B->A meaning "A depends on B", roots have in_degree == 0 roots = [node for node in forest if subgraph.in_degree(node) == 0] - # Find leaf nodes (nodes with no outgoing edges from within forest) + # Find leaf nodes (declarations that nothing depends on) + # With edge direction B->A meaning "A depends on B", leaves have out_degree == 0 leaves = [node for node in forest if subgraph.out_degree(node) == 0] return { diff --git a/src/app/data_explorer/lean_db_explorer.py b/src/app/data_explorer/lean_db_explorer.py index 9c9eae4..675189b 100644 --- a/src/app/data_explorer/lean_db_explorer.py +++ b/src/app/data_explorer/lean_db_explorer.py @@ -158,9 +158,20 @@ def tab_dependencies(db: LeanDeclarationDB): """Dependency Explorer tab interface.""" st.header("🌲 Dependency Explorer") - # Declaration selector + # Declaration selector with option to search by name or decl_id st.markdown("**Select a declaration:**") - decl_name = st.text_input("Declaration name", placeholder="e.g., dvd_trans, Nat.add") + + search_mode = st.radio( + "Search by", + ["Declaration name", "Declaration ID"], + horizontal=True, + help="Search by name or directly by decl_id" + ) + + if search_mode == "Declaration name": + decl_input = st.text_input("Declaration name", placeholder="e.g., dvd_trans, Nat.add") + else: + decl_input = st.text_input("Declaration ID", placeholder="e.g., Nat.add_comm_123abc") col1, col2, col3 = st.columns(3) @@ -172,47 +183,67 @@ def tab_dependencies(db: LeanDeclarationDB): max_depth = st.number_input("Max depth", min_value=1, max_value=20, value=5) if show_deps or show_dependents: - if not decl_name: - st.warning("Please enter a declaration name") + if not decl_input: + st.warning(f"Please enter a declaration {search_mode.lower()}") return - # Find declaration by name - search_df = db_utils.search_declarations(db, name_pattern=decl_name) + # Find declaration by name or ID + decl_id = None + display_name = decl_input - if search_df.empty: - st.error(f"Declaration '{decl_name}' not found") - return + if search_mode == "Declaration name": + search_df = db_utils.search_declarations(db, name_pattern=decl_input) - if len(search_df) > 1: - st.warning(f"Found {len(search_df)} declarations with this name. Using the first one.") - st.dataframe(search_df[['name', 'namespace', 'file_path', 'decl_type']]) + if search_df.empty: + st.error(f"Declaration '{decl_input}' not found") + return - decl_id = search_df.iloc[0]['decl_id'] + if len(search_df) > 1: + st.warning(f"Found {len(search_df)} declarations with this name. 
Using the first one.") + st.dataframe(search_df[['decl_id', 'name', 'namespace', 'file_path', 'decl_type']]) - with st.spinner("Analyzing dependencies..."): - if show_deps: - decls, subgraph = graph_utils.get_dependencies_closure(db, decl_id, max_depth) - title = f"Dependencies of {decl_name}" + decl_id = search_df.iloc[0]['decl_id'] + display_name = search_df.iloc[0]['name'] + else: + # Direct decl_id lookup + decl = db.get_declaration_by_decl_id(decl_input) + if decl: + decl_id = decl_input + display_name = decl.get('name', decl_input) + # Show the declaration info + st.info(f"Found: {display_name} (Type: {decl.get('decl_type', 'unknown')})") else: - decls, subgraph = graph_utils.get_dependents_closure(db, decl_id, max_depth) - title = f"Dependents of {decl_name}" + st.error(f"Declaration with ID '{decl_input}' not found") + return - if decls: - st.success(f"Found {len(decls)} related declarations") - - # Show tabs for table and graph views - tab1, tab2 = st.tabs(["📊 Table View", "📈 Graph View"]) - - with tab1: - df = pd.DataFrame(decls) - st.dataframe(df[['name', 'namespace', 'decl_type', 'file_path', 'line']], - use_container_width=True, height=400) - - with tab2: - fig = graph_utils.visualize_graph(subgraph, title) - st.plotly_chart(fig, use_container_width=True) - else: - st.info("No dependencies found") + try: + with st.spinner("Analyzing dependencies..."): + if show_deps: + decls, subgraph = graph_utils.get_dependencies_closure(db, decl_id, max_depth) + title = f"Dependencies of {display_name}" + else: + decls, subgraph = graph_utils.get_dependents_closure(db, decl_id, max_depth) + title = f"Dependents of {display_name}" + + if decls: + st.success(f"Found {len(decls)} related declarations") + + # Show tabs for table and graph views + tab1, tab2 = st.tabs(["📊 Table View", "📈 Graph View"]) + + with tab1: + df = pd.DataFrame(decls) + st.dataframe(df[['decl_id', 'name', 'namespace', 'decl_type', 'file_path', 'line']], + use_container_width=True, height=400) + + with tab2: + fig = graph_utils.visualize_graph(subgraph, title) + st.plotly_chart(fig, use_container_width=True) + else: + st.info("No dependencies found") + except Exception as e: + st.error(f"Error analyzing dependencies: {str(e)}") + st.exception(e) def tab_forests(db: LeanDeclarationDB): @@ -221,7 +252,8 @@ def tab_forests(db: LeanDeclarationDB): st.markdown(""" A **forest** is a connected component in the dependency graph. Declarations in the same - forest are connected through dependency relationships. + forest are connected through dependency relationships. **Root nodes** are declarations + with no dependencies within the forest (no incoming edges). """) # Initialize session state for forests @@ -263,7 +295,7 @@ def tab_forests(db: LeanDeclarationDB): st.dataframe(df, use_container_width=True) # Show declarations in selected forest - st.markdown("### View Forest Declarations") + st.markdown("### View Forest Details") forest_id = st.number_input( "Forest ID to view", min_value=1, @@ -271,40 +303,102 @@ def tab_forests(db: LeanDeclarationDB): value=1 ) - if st.button("Show Declarations in Forest"): + col1, col2 = st.columns(2) + with col1: + show_all_decls = st.button("Show All Declarations") + with col2: + show_roots = st.button("Show Root Nodes Only") + + if show_all_decls or show_roots: + if G is None: + st.error("Graph not available. 
Please click 'Find All Forests' first.") + return + forest = forests[forest_id - 1] - st.info(f"Forest #{forest_id} contains {len(forest)} declarations") - - # Get all declarations in this forest - forest_decls = [] - for decl_id in forest: - decl = db.get_declaration_by_decl_id(decl_id) - if decl: - # Truncate text and proof for display - text = decl.get('text', '') - proof = decl.get('proof', '') - forest_decls.append({ - 'decl_id': decl['decl_id'], - 'name': decl['name'], - 'namespace': decl.get('namespace', ''), - 'decl_type': decl.get('decl_type', ''), - 'text': text[:100] + '...' if text and len(text) > 100 else text, - 'proof': proof[:100] + '...' if proof and len(proof) > 100 else proof, - 'file_path': decl.get('file_path', ''), - 'line': decl.get('line', '') - }) - - forest_df = pd.DataFrame(forest_decls) - st.dataframe(forest_df, use_container_width=True, height=500) + stats = graph_utils.get_forest_statistics(G, forest) - # Download button - csv = forest_df.to_csv(index=False) - st.download_button( - label="Download Forest Declarations", - data=csv, - file_name=f"forest_{forest_id}_declarations.csv", - mime="text/csv" - ) + if show_roots: + st.markdown(f"#### 🌱 Root Nodes of Forest #{forest_id}") + st.info(f"Found {stats['num_roots']} root nodes (declarations with no dependencies)") + + # Get root node details + root_decls = [] + subgraph = G.subgraph(forest) + root_ids = [node for node in forest if subgraph.in_degree(node) == 0] + + for decl_id in root_ids: + decl = db.get_declaration_by_decl_id(decl_id) + if decl: + # Count direct dependents + num_dependents = subgraph.out_degree(decl_id) + root_decls.append({ + 'decl_id': decl['decl_id'], + 'name': decl['name'], + 'namespace': decl.get('namespace', ''), + 'decl_type': decl.get('decl_type', ''), + 'dependents_count': num_dependents, + 'file_path': decl.get('file_path', ''), + 'line': decl.get('line', '') + }) + + if root_decls: + root_df = pd.DataFrame(root_decls) + # Sort by number of dependents + root_df = root_df.sort_values('dependents_count', ascending=False) + st.dataframe(root_df, use_container_width=True, height=500) + + # Download button + csv = root_df.to_csv(index=False) + st.download_button( + label="Download Root Nodes", + data=csv, + file_name=f"forest_{forest_id}_roots.csv", + mime="text/csv" + ) + else: + st.warning("No root nodes found in this forest.") + + else: # show_all_decls + st.info(f"Forest #{forest_id} contains {len(forest)} declarations") + + # Get all declarations in this forest + forest_decls = [] + subgraph = G.subgraph(forest) + + for decl_id in forest: + decl = db.get_declaration_by_decl_id(decl_id) + if decl: + # Check if it's a root node + is_root = subgraph.in_degree(decl_id) == 0 + # Truncate text and proof for display + text = decl.get('text', '') + proof = decl.get('proof', '') + forest_decls.append({ + 'decl_id': decl['decl_id'], + 'name': decl['name'], + 'namespace': decl.get('namespace', ''), + 'decl_type': decl.get('decl_type', ''), + 'is_root': '🌱' if is_root else '', + 'text': text[:100] + '...' if text and len(text) > 100 else text, + 'proof': proof[:100] + '...' 
if proof and len(proof) > 100 else proof, + 'file_path': decl.get('file_path', ''), + 'line': decl.get('line', '') + }) + + if forest_decls: + forest_df = pd.DataFrame(forest_decls) + st.dataframe(forest_df, use_container_width=True, height=500) + + # Download button + csv = forest_df.to_csv(index=False) + st.download_button( + label="Download Forest Declarations", + data=csv, + file_name=f"forest_{forest_id}_declarations.csv", + mime="text/csv" + ) + else: + st.warning("No declarations found in this forest.") else: st.info("Click 'Find All Forests' to analyze the dependency graph") diff --git a/src/app/itp-gui/app.py b/src/app/itp-gui/app.py index a169744..1f42a92 100644 --- a/src/app/itp-gui/app.py +++ b/src/app/itp-gui/app.py @@ -19,7 +19,7 @@ from typing import Optional, Dict, Any, List import json -from itp_interface.tools.simple_lean4_sync_executor import SimpleLean4SyncExecutor +from itp_interface.lean.simple_lean4_sync_executor import SimpleLean4SyncExecutor from itp_interface.lean_server.lean_context import ProofContext app = Flask(__name__, static_folder='static', template_folder='templates') From d037f4cfbb4adb61a4fdb7e9003440aadd7ced68 Mon Sep 17 00:00:00 2001 From: Amitayush Thakur Date: Fri, 21 Nov 2025 01:58:52 +0000 Subject: [PATCH 21/25] Enhance Lean4 data extraction with dependency mapping and configuration updates; add new YAML configs for benchmarks --- .../lean4_local_data_extraction_transform.py | 69 ++++++++++-- src/itp_interface/lean/parsing_helpers.py | 1 - src/itp_interface/main/config.py | 5 +- .../benchmark/stdlib_benchmark_lean_ext.yaml | 31 ++++++ .../configs/stdlib_lean_data_extract.yaml | 13 +++ src/itp_interface/main/run_tool.py | 44 ++++++-- .../tools/run_data_generation_transforms.py | 30 ++++- src/itp_interface/tools/simple_sqlite.py | 104 +++++++++++------- 8 files changed, 233 insertions(+), 64 deletions(-) create mode 100644 src/itp_interface/main/configs/benchmark/stdlib_benchmark_lean_ext.yaml create mode 100644 src/itp_interface/main/configs/stdlib_lean_data_extract.yaml diff --git a/src/itp_interface/lean/lean4_local_data_extraction_transform.py b/src/itp_interface/lean/lean4_local_data_extraction_transform.py index 67bd6d4..3b6be6d 100644 --- a/src/itp_interface/lean/lean4_local_data_extraction_transform.py +++ b/src/itp_interface/lean/lean4_local_data_extraction_transform.py @@ -8,7 +8,9 @@ sys.path.append(root_dir) import typing import uuid +import json from pathlib import Path +from filelock import FileLock from itp_interface.lean.simple_lean4_sync_executor import SimpleLean4SyncExecutor from itp_interface.lean.parsing_helpers import LeanDeclParser, LeanParseResult, LeanDeclType from itp_interface.tools.coq_training_data_generator import GenericTrainingDataGenerationTransform, TrainingDataGenerationType @@ -25,13 +27,29 @@ def __init__(self, logger = None, max_parallelism : int = 4, db_path : typing.Optional[str] = None, - enable_file_export: bool = True): + enable_file_export: bool = False, + enable_dependency_extraction: bool = False): super().__init__(TrainingDataGenerationType.LOCAL, buffer_size, logger) self.depth = depth self.max_search_results = max_search_results self.max_parallelism = max_parallelism self.db_path = db_path # Store path, don't create connection yet (for Ray actors) self.enable_file_export = enable_file_export + self.enable_dependency_extraction = enable_dependency_extraction + # Everything except UNKNOWN + self.supported_declaration_types = [ str(dt) for dt in LeanDeclType if dt != LeanDeclType.UNKNOWN] + if 
self.db_path is not None:
+            temp_db_path = Path(self.db_path)
+            cache_path = temp_db_path.parent / "cache"
+            os.makedirs(str(cache_path), exist_ok=True)
+            mapping_path = cache_path / f"{temp_db_path.stem}_declaration_mapping.json"
+            self.cache_path = str(cache_path)
+            self.mapping_path = str(mapping_path)
+            self._lock_path = os.path.join(self.cache_path, "lean_declaration_mapping.lock")
+        else:
+            self.cache_path = None
+            self.mapping_path = None
+            self._lock_path = None
 
     def get_meta_object(self) -> TrainingDataMetadataFormat:
         return TrainingDataMetadataFormat(
@@ -147,11 +165,12 @@ def _remap_local_dependency(self,
                                  project_path: str,
                                  file_dep_analyses: typing.List[FileDependencyAnalysis]) -> None:
         # Map local dependencies
-        all_decls = {}
+        all_decls : dict[str, DeclWithDependencies] = {}
         for fda in file_dep_analyses:
             for decl in fda.declarations:
                 name = decl.decl_info.name
-                all_decls[name] = decl
+                if name is not None:
+                    all_decls[name] = decl
         for fda in file_dep_analyses:
             fda_rel_path = self._get_file_path(project_path, fda.file_path)
             # Remove the pp_module prefix from the fda module name
@@ -159,7 +178,10 @@ def _remap_local_dependency(self,
             for decl in fda.declarations:
                 for decl_dep in decl.dependencies:
                     if decl_dep.name in all_decls:
-                        if decl_dep.file_path is None:
+                        local_decl = all_decls[decl_dep.name]
+                        if decl_dep.file_path is None and \
+                            local_decl.decl_info.decl_type in self.supported_declaration_types:
+                            # We need to ensure that it is locally defined.
                             decl_dep.file_path = fda_rel_path
                         if decl_dep.module_name is None:
                             decl_dep.module_name = fda_module_name
@@ -228,10 +250,34 @@ def __call__(self,
         else:
             theorems = set(theorems) if theorems is not None else None
         cnt = 0
-        temp_dir = os.path.join(training_data.folder, "temp")
-        os.makedirs(temp_dir, exist_ok=True)
+        temp_dir = os.path.join(training_data.folder, "temp") if self.cache_path is None else self.cache_path
+        temp_dir = temp_dir.rstrip('/')
+        if self.cache_path is None:
+            os.makedirs(temp_dir, exist_ok=True)
         json_output_path = f"{temp_dir}/{file_namespace.replace('.', '_')}.lean.deps.json"
-        file_dep_analyses = lean_executor.extract_all_theorems_and_definitions(json_output_path=json_output_path)
+        already_mapped = False
+        if self._lock_path is not None:
+            with FileLock(self._lock_path):
+                assert self.mapping_path is not None
+                if not os.path.exists(self.mapping_path):
+                    with open(self.mapping_path, "w") as f:
+                        json.dump({}, f)
+                with open(self.mapping_path, "r") as f:
+                    mapping_data = json.load(f)
+                if json_output_path in mapping_data:
+                    already_mapped = True
+                else:
+                    mapping_data[json_output_path] = True
+                    with open(self.mapping_path, "w") as f:
+                        json.dump(mapping_data, f)
+        if already_mapped and os.path.exists(json_output_path):
+            # Read from existing file
+            self.logger.info(f"Using existing dependency extraction file: {json_output_path}")
+            with open(json_output_path, 'r') as f:
+                data = json.load(f)
+            file_dep_analyses = [FileDependencyAnalysis.model_validate(data)]
+        else:
+            file_dep_analyses = lean_executor.extract_all_theorems_and_definitions(json_output_path=json_output_path)
         self.logger.info(f"Extracted {len(file_dep_analyses)} FileDependencyAnalysis objects from {file_namespace}")
         self.logger.info(f"file_dep_analyses: {file_dep_analyses}")
         assert len(file_dep_analyses) == 1, "Expected exactly one FileDependencyAnalysis object"
@@ -253,15 +299,18 @@ def __call__(self,
             for decl in fda.declarations:
                 # Get or create decl_id from database (or generate new one if no DB)
                 if db:
+                    if decl.decl_info.decl_type not in self.supported_declaration_types:
+                        self.logger.info(f"Skipping declaration '{decl.decl_info.name}' of unsupported type '{decl.decl_info.decl_type}'")
+                        continue
                     decl_id = db.process_declaration(
                         fda_file_path=fda_rel_path,
                         fda_module_name=fda_module_name,
-                        decl=decl
+                        decl=decl,
+                        enable_dependency_extraction=self.enable_dependency_extraction
                     )
                     self.logger.info(f"Processed declaration '{decl.decl_info.name}' with ID: {decl_id}")
                 else:
                     # Fallback: generate unique ID without database
-                    import uuid
                     timestamp = str(int(uuid.uuid1().time_low))
                     random_id = str(uuid.uuid4())
                     decl_id = f"{timestamp}_{random_id}"
@@ -277,7 +326,7 @@ def __call__(self,
                 decl.decl_id = decl_id
                 new_fda.declarations.append(decl)
                 training_data.merge(new_fda)
-                cnt += 1
+                cnt += 1
                 last_decl_id = decl_id
 
         if last_decl_id:
diff --git a/src/itp_interface/lean/parsing_helpers.py b/src/itp_interface/lean/parsing_helpers.py
index 1e63c43..de16b24 100644
--- a/src/itp_interface/lean/parsing_helpers.py
+++ b/src/itp_interface/lean/parsing_helpers.py
@@ -8,7 +8,6 @@ class LeanDeclType(Enum):
     LEMMA = "lemma"
     THEOREM = "theorem"
     DEF = "def"
-    DEFINITION = "definition"
     STRUCTURE = "structure"
     CLASS = "class"
     INDUCTIVE = "inductive"
diff --git a/src/itp_interface/main/config.py b/src/itp_interface/main/config.py
index c4239e1..86e49f2 100644
--- a/src/itp_interface/main/config.py
+++ b/src/itp_interface/main/config.py
@@ -80,6 +80,7 @@ class EvalDataset(object):
     project: str
     files: typing.Union[typing.List[EvalFile], typing.List[ExtractFile]]
    exclude_files: typing.List[str] = field(default_factory=list)
+    include_files: typing.List[str] = field(default_factory=list)
 
 @dataclass_json
 @dataclass
@@ -193,7 +194,9 @@ def parse_config(cfg):
         eval_datasets.append(EvalDataset(
             project=dataset_cfg["project"],
             files=eval_files,
-            exclude_files=dataset_cfg.get("exclude_files", [])))
+            exclude_files=dataset_cfg.get("exclude_files", []),
+            include_files=dataset_cfg.get("include_files", [])
+        ))
     language = ProofAction.Language(benchmark_cfg["language"])
     benchmark = EvalBenchmark(
         name=benchmark_cfg["name"],
diff --git a/src/itp_interface/main/configs/benchmark/stdlib_benchmark_lean_ext.yaml b/src/itp_interface/main/configs/benchmark/stdlib_benchmark_lean_ext.yaml
new file mode 100644
index 0000000..fde9d37
--- /dev/null
+++ b/src/itp_interface/main/configs/benchmark/stdlib_benchmark_lean_ext.yaml
@@ -0,0 +1,31 @@
+name: stdlib_benchmark_lean_ext
+num_files: 1
+language: LEAN4
+few_shot_data_path_for_retrieval:
+few_shot_metadata_filename_for_retrieval:
+dfs_data_path_for_retrieval:
+dfs_metadata_filename_for_retrieval:
+is_extraction_request: true
+datasets:
+  - project: src/data/test/batteries
+    files: []
+    exclude_files:
+      - src/data/test/batteries/.lake
+      - src/data/test/batteries/.vscode
+      - src/data/test/batteries/BatteriesTest
+      - src/data/test/batteries/Batteries
+      - src/data/test/batteries/scripts
+      - src/data/test/batteries/docs
+      - src/data/test/batteries/Shake
+      - src/data/test/batteries/Batteries.lean
+      - src/data/test/batteries/bors.toml
+      - src/data/test/batteries/lake-manifest.toml
+      - src/data/test/batteries/.github
+      - src/data/test/batteries/.docker
+      - src/data/test/batteries/lakefile.toml
+      - src/data/test/batteries/.gitpod.yml
+      - src/data/test/batteries/.github
+      - src/data/test/batteries/lake-manifest.json
+    include_files:
+      - ~/.elan/toolchains/leanprover--lean4---v4.24.0/src/lean/Init
+      - ~/.elan/toolchains/leanprover--lean4---v4.24.0/src/lean/Std
\ No newline at end of file
diff --git a/src/itp_interface/main/configs/stdlib_lean_data_extract.yaml b/src/itp_interface/main/configs/stdlib_lean_data_extract.yaml
new file mode 100644
index 0000000..2269810
--- /dev/null
+++ b/src/itp_interface/main/configs/stdlib_lean_data_extract.yaml
@@ -0,0 +1,13 @@
+defaults:
+  # - benchmark: simple_benchmark_lean_training_data
+  # - run_settings: default_lean_data_generation_transforms
+  # - benchmark: simple_benchmark_1
+  # - run_settings: default_lean4_data_generation_transforms
+  - benchmark: stdlib_benchmark_lean_ext
+  - run_settings: default_lean4_data_generation_transforms
+  - env_settings: no_retrieval
+  - override hydra/job_logging: 'disabled'
+
+run_settings:
+  output_dir: .log/data_generation/benchmark/stdlib_benchmark_lean_ext
+  pool_size: 12
\ No newline at end of file
diff --git a/src/itp_interface/main/run_tool.py b/src/itp_interface/main/run_tool.py
index 769de16..24c0366 100644
--- a/src/itp_interface/main/run_tool.py
+++ b/src/itp_interface/main/run_tool.py
@@ -121,10 +121,23 @@ def _get_all_lemmas_impl(
     logger.info(f"Discovered {len(lemmas_to_prove)} lemmas")
     return lemmas_to_prove
 
-
 def _get_all_lean_files_in_folder_recursively(
-        project_folder: str, exclude_list: typing.List[str]) -> typing.List[str]:
+        project_folder: str, exclude_list: typing.List[str], include_list: typing.List[str]) -> typing.List[str]:
     lean_files = []
+    for include in include_list:
+        # the include list should be a full path
+        include_path = Path(include)
+        include_path = include_path.expanduser().resolve()
+        # Get all files recursively under include_path
+        for root, dirs, files in os.walk(include_path):
+            for file in files:
+                if file.endswith(".lean"):
+                    full_path = os.path.join(root, file)
+                    full_path_obj = Path(full_path)
+                    full_path = full_path_obj.expanduser().resolve()
+                    if str(full_path) not in lean_files:
+                        lean_files.append(str(full_path))
+
     project_folder_path = Path(project_folder)
     for root, dirs, files in os.walk(project_folder):
         for file in files:
@@ -270,13 +283,30 @@ def add_transform(experiment: Experiments, clone_dir: str, resources: list, tran
     elif experiment.benchmark.language == ProofAction.Language.LEAN4:
         if experiment.benchmark.is_extraction_request:
             output_dir_path = os.path.join(experiment.run_settings.output_dir, str_time)
-            os.makedirs(output_dir_path, exist_ok=True)
-            db_path = os.path.join(output_dir_path, "lean4_extraction_db.sqlite")
-            transform = Local4DataExtractionTransform(
+            db_path = None
+            if os.environ.get("EXTRACTION_DB_PATH") is not None:
+                db_path = os.environ.get("EXTRACTION_DB_PATH", "")
+                if not os.path.exists(db_path) and db_path.strip() != "":
+                    db_path = None
+            if db_path is None:
+                os.makedirs(output_dir_path, exist_ok=True)
+                db_path = os.path.join(output_dir_path, "lean4_extraction_db.sqlite")
+            transform1 = Local4DataExtractionTransform(
                 experiment.run_settings.dep_depth,
                 buffer_size=experiment.run_settings.buffer_size,
                 logger=logger,
-                db_path=db_path)
+                db_path=db_path,
+                enable_file_export=False,
+                enable_dependency_extraction=False)
+            transforms.append(transform1) # First just add the definitions
+            transform = Local4DataExtractionTransform(
+                experiment.run_settings.dep_depth,
+                buffer_size=experiment.run_settings.buffer_size,
+                logger=logger,
+                db_path=db_path,
+                enable_file_export=True,
+                enable_dependency_extraction=True
+            ) # This will be later appended to the transforms list
         else:
             transform = Local4DataGenerationTransform(
                 experiment.run_settings.dep_depth,
@@ -335,7 +365,7 @@ def get_decl_lemmas_to_parse(
         if len(dataset.files) == 0:
logger.warning(f"No files specified for Lean4 extraction in dataset {dataset.project}, extracting from all Lean4 files in the project") # List all the files recursively in the project folder - files_in_dataset = _get_all_lean_files_in_folder_recursively(dataset.project, dataset.exclude_files) + files_in_dataset = _get_all_lean_files_in_folder_recursively(dataset.project, dataset.exclude_files, dataset.include_files) logger.info(f"Found {len(files_in_dataset)} Lean4 files in the project {dataset.project}") for file_path in files_in_dataset: file_to_theorems[file_path] = ["*"] diff --git a/src/itp_interface/tools/run_data_generation_transforms.py b/src/itp_interface/tools/run_data_generation_transforms.py index c9baa8f..bacbb25 100644 --- a/src/itp_interface/tools/run_data_generation_transforms.py +++ b/src/itp_interface/tools/run_data_generation_transforms.py @@ -10,6 +10,7 @@ import typing import shutil import gc +from pathlib import Path from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError from itp_interface.tools.training_data import TrainingData, DataLayoutFormat @@ -290,6 +291,7 @@ def run_local_transform(self, pool_size: int , transform: typing.Union[CoqLocalD job_idx = 0 project_names = list(projects.keys()) project_names.sort() + file_index = 0 for project in project_names: # Create temporary directory for each project proj_name = os.path.basename(project) @@ -305,9 +307,26 @@ def run_local_transform(self, pool_size: int , transform: typing.Union[CoqLocalD some_files_processed = True job_more_args = file_args.get(file_path, {}) # Create temporary directory for each file - full_file_path = os.path.join(project_path, file_path) - relative_file_path = file_path - relative_file_path = relative_file_path.replace("/", ".").replace(".v", "").replace(".lean", "").replace(".thy", "") + fp = Path(file_path) + pp = Path(project_path) + if fp.is_absolute(): + # then leave it as is + full_file_path = file_path + else: + # Check if the file path is relative to the project path + fp = fp.resolve() + pp = pp.resolve() + # Now check if fp is relative to pp + fp_is_rel_to_pp = fp.is_relative_to(pp) + if fp_is_rel_to_pp: + full_file_path = str(fp.relative_to(pp)) + else: + # Just make it relative to project path + full_file_path = os.path.join(project_path, file_path) + assert os.path.exists(full_file_path), f"File path {full_file_path} does not exist" + file_index += 1 + + relative_file_path = f"transformer_{file_index}" temp_file_dir = os.path.join(temp_project_dir, relative_file_path) os.makedirs(temp_file_dir, exist_ok=True) log_file = os.path.join(self.logging_dir, f"{relative_file_path}.log") @@ -410,10 +429,13 @@ def _transform_output(results): shutil.rmtree(temp_output_dir) def run_all_local_transforms(self, pool_size: int, projects: typing.Dict[str, typing.Dict[str, str]], use_human_readable: bool, new_output_dir: str, log_error: bool, other_args: typing.Dict[str, typing.Dict[str, dict]] = {}): + os.makedirs(new_output_dir, exist_ok=True) for idx, transform in enumerate(self.transforms): last_transform = idx == len(self.transforms) - 1 save_transform = self.save_intermidiate_transforms or last_transform - self.run_local_transform(pool_size, transform, projects, use_human_readable, new_output_dir, log_error, save_transform, preserve_temp=self.save_intermidiate_transforms, other_args=other_args) + temp_new_output_dir = str(Path(new_output_dir) / str(idx)) + os.makedirs(temp_new_output_dir, exist_ok=True) + self.run_local_transform(pool_size, transform, projects, 
use_human_readable, temp_new_output_dir, log_error, save_transform, preserve_temp=self.save_intermidiate_transforms, other_args=other_args) pass # Create Ray remote version if Ray is available diff --git a/src/itp_interface/tools/simple_sqlite.py b/src/itp_interface/tools/simple_sqlite.py index 6c09486..fd8aa47 100644 --- a/src/itp_interface/tools/simple_sqlite.py +++ b/src/itp_interface/tools/simple_sqlite.py @@ -193,7 +193,8 @@ def get_or_create_decl_id( name: str, namespace: Optional[str] = None, file_path: Optional[str] = None, - module_name: Optional[str] = None + module_name: Optional[str] = None, + assert_exists: bool = False ) -> str: """ Get existing decl_id or create a new one for a declaration. @@ -217,15 +218,21 @@ def get_or_create_decl_id( cursor.execute(""" SELECT decl_id FROM declarations WHERE name = ? - AND (namespace IS ? OR (namespace IS NULL AND ? IS NULL)) - AND (file_path IS ? OR (file_path IS NULL AND ? IS NULL)) - AND (module_name IS ? OR (module_name IS NULL AND ? IS NULL)) - """, (name, namespace, namespace, file_path, file_path, module_name, module_name)) + AND (namespace IS ? OR (namespace IS NULL AND ? IS NULL) OR (namespace IS NOT NULL AND ? IS NULL)) + AND (file_path IS ? OR (file_path IS NULL AND ? IS NULL) OR (file_path IS NOT NULL AND ? IS NULL)) + AND (module_name IS ? OR (module_name IS NULL AND ? IS NULL) OR (module_name IS NOT NULL AND ? IS NULL)) + """, (name, + namespace, namespace, namespace, + file_path, file_path, file_path, + module_name, module_name, module_name)) row = cursor.fetchone() if row: return row[0] # Return existing ID + if assert_exists: + raise ValueError(f"Declaration not found: name={name}, namespace={namespace}, file_path={file_path}, module_name={module_name}") + # Generate new ID and insert minimal record new_decl_id = self._generate_unique_id() @@ -241,10 +248,13 @@ def get_or_create_decl_id( cursor.execute(""" SELECT decl_id FROM declarations WHERE name = ? - AND (namespace IS ? OR (namespace IS NULL AND ? IS NULL)) - AND (file_path IS ? OR (file_path IS NULL AND ? IS NULL)) - AND (module_name IS ? OR (module_name IS NULL AND ? IS NULL)) - """, (name, namespace, namespace, file_path, file_path, module_name, module_name)) + AND (namespace IS ? OR (namespace IS NULL AND ? IS NULL) OR (namespace IS NOT NULL AND ? IS NULL)) + AND (file_path IS ? OR (file_path IS NULL AND ? IS NULL) OR (file_path IS NOT NULL AND ? IS NULL)) + AND (module_name IS ? OR (module_name IS NULL AND ? IS NULL) OR (module_name IS NOT NULL AND ? IS NULL)) + """, (name, + namespace, namespace, namespace, + file_path, file_path, file_path, + module_name, module_name, module_name)) row = cursor.fetchone() if row: return row[0] @@ -355,7 +365,8 @@ def process_declaration( self, fda_file_path: str, fda_module_name: str, - decl + decl, + enable_dependency_extraction: bool = True ) -> str: """ Process a declaration from a FileDependencyAnalysis object. 
@@ -375,44 +386,55 @@ def process_declaration(
         """
         # Get or create ID for this declaration
         decl_id = self.get_or_create_decl_id(
-            name=decl.decl_info.name,
-            namespace=decl.decl_info.namespace,
-            file_path=fda_file_path,
-            module_name=fda_module_name
-        )
-
-        # Update with full declaration info
-        self.upsert_declaration_full_info(
-            decl_id=decl_id,
             name=decl.decl_info.name,
             namespace=decl.decl_info.namespace,
             file_path=fda_file_path,
             module_name=fda_module_name,
-            decl_type=decl.decl_info.decl_type,
-            text=decl.decl_info.text,
-            line=decl.decl_info.line,
-            column=decl.decl_info.column,
-            end_line=decl.decl_info.end_line,
-            end_column=decl.decl_info.end_column,
-            doc_string=decl.decl_info.doc_string,
-            proof=decl.decl_info.proof
+            # If we are extracting dependencies, we expect the declaration to exist
+            assert_exists=enable_dependency_extraction
         )
 
-        # Process dependencies
-        for dep in decl.dependencies:
-            # Get or create ID for the dependency
-            dep_decl_id = self.get_or_create_decl_id(
-                name=dep.name,
-                namespace=dep.namespace,
-                file_path=dep.file_path,
-                module_name=dep.module_name
+        if not enable_dependency_extraction:
+            # Update with full declaration info
+            # This is a new declaration, so we can safely update all info
+            self.upsert_declaration_full_info(
+                decl_id=decl_id,
+                name=decl.decl_info.name,
+                namespace=decl.decl_info.namespace,
+                file_path=fda_file_path,
+                module_name=fda_module_name,
+                decl_type=decl.decl_info.decl_type,
+                text=decl.decl_info.text,
+                line=decl.decl_info.line,
+                column=decl.decl_info.column,
+                end_line=decl.decl_info.end_line,
+                end_column=decl.decl_info.end_column,
+                doc_string=decl.decl_info.doc_string,
+                proof=decl.decl_info.proof
             )
-
-            # Propagate the decl_id back to the dependency object
-            dep.decl_id = dep_decl_id
-
-            # Insert dependency edge
-            self.insert_dependency_edge(decl_id, dep_decl_id)
+        else:
+            # Process dependencies
+            for dep in decl.dependencies:
+                # Get or create ID for the dependency
+                try:
+                    dep_decl_id = self.get_or_create_decl_id(
+                        name=dep.name,
+                        namespace=dep.namespace,
+                        file_path=dep.file_path,
+                        module_name=dep.module_name,
+                        # At this point all dependencies should exist
+                        assert_exists=True
+                    )
+                except ValueError as e:
+                    # This dependency is probably from outside the project
+                    # Let's just skip it
+                    continue
+
+                # Propagate the decl_id back to the dependency object
+                dep.decl_id = dep_decl_id
+
+                # Insert dependency edge
+                self.insert_dependency_edge(decl_id, dep_decl_id)
 
         return decl_id

From 40a2e971f4fc30f311b1c31f93ce00e1cffcd352 Mon Sep 17 00:00:00 2001
From: Amitayush Thakur
Date: Fri, 21 Nov 2025 04:17:29 +0000
Subject: [PATCH 22/25] Reduce pool size from 2 to 1 in Lean data extraction and generation configurations

---
 src/itp_interface/main/configs/simple_lean_data_extract.yaml | 2 +-
 src/itp_interface/main/configs/simple_lean_data_gen.yaml     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/itp_interface/main/configs/simple_lean_data_extract.yaml b/src/itp_interface/main/configs/simple_lean_data_extract.yaml
index 4fed34c..590c3e9 100644
--- a/src/itp_interface/main/configs/simple_lean_data_extract.yaml
+++ b/src/itp_interface/main/configs/simple_lean_data_extract.yaml
@@ -10,4 +10,4 @@ defaults:
 
 run_settings:
   output_dir: .log/data_generation/benchmark/simple_benchmark_lean_ext
-  pool_size: 2
\ No newline at end of file
+  pool_size: 1
\ No newline at end of file
diff --git a/src/itp_interface/main/configs/simple_lean_data_gen.yaml b/src/itp_interface/main/configs/simple_lean_data_gen.yaml
index d93dd0f..882a810 100644
--- a/src/itp_interface/main/configs/simple_lean_data_gen.yaml
+++ b/src/itp_interface/main/configs/simple_lean_data_gen.yaml
@@ -10,4 +10,4 @@ defaults:
 
 run_settings:
   output_dir: .log/data_generation/benchmark/simple_benchmark_lean
-  pool_size: 2
\ No newline at end of file
+  pool_size: 1
\ No newline at end of file

From 7efcab43f0a5b1ca092a5f5b5acfca0c1f682254 Mon Sep 17 00:00:00 2001
From: Amitayush Thakur
Date: Fri, 21 Nov 2025 04:30:27 +0000
Subject: [PATCH 23/25] Update test data paths to include subdirectory for training data

---
 src/test/simple_data_extract_test.py | 2 +-
 src/test/simple_data_gen_test.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/test/simple_data_extract_test.py b/src/test/simple_data_extract_test.py
index 36f1dca..08a0c98 100644
--- a/src/test/simple_data_extract_test.py
+++ b/src/test/simple_data_extract_test.py
@@ -63,7 +63,7 @@ def test_lean_data_extract(self):
         # Print the directory contents
         last_dir_path = os.path.join(".log/data_generation/benchmark/simple_benchmark_lean_ext", last_dir)
         print("Last Directory Contents:", os.listdir(last_dir_path))
-        train_data = os.path.join(last_dir_path, "train")
+        train_data = os.path.join(last_dir_path, "train", "0")
         list_files = os.listdir(train_data)
         print("Train Directory Contents:", list_files)
         data_files = [f for f in list_files if f.endswith(".json") and f.startswith("local_data_")]
diff --git a/src/test/simple_data_gen_test.py b/src/test/simple_data_gen_test.py
index e39ce93..48fd226 100644
--- a/src/test/simple_data_gen_test.py
+++ b/src/test/simple_data_gen_test.py
@@ -62,7 +62,7 @@ def test_proof_step_data_gen(self):
         # Print the directory contents
         last_dir_path = os.path.join(".log/data_generation/benchmark/simple_benchmark_lean", last_dir)
         print("Last Directory Contents:", os.listdir(last_dir_path))
-        train_data = os.path.join(last_dir_path, "train")
+        train_data = os.path.join(last_dir_path, "train", "0")
         list_files = os.listdir(train_data)
         print("Train Directory Contents:", list_files)
         data_files = [f for f in list_files if f.endswith(".json") and f.startswith("local_data_")]

From 2dcb084cf21785f2715ab4dd00fc61c16489dd7b Mon Sep 17 00:00:00 2001
From: Amitayush Thakur
Date: Fri, 21 Nov 2025 04:47:24 +0000
Subject: [PATCH 24/25] Refine file filtering in pretty_print_file_contents and update train data path in test case

---
 src/test/simple_data_extract_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/test/simple_data_extract_test.py b/src/test/simple_data_extract_test.py
index 08a0c98..07aba42 100644
--- a/src/test/simple_data_extract_test.py
+++ b/src/test/simple_data_extract_test.py
@@ -17,7 +17,7 @@ def pretty_print_file_contents(dir_path):
     print(f"Printing all files in the directory: {dir_path}")
     for f in os.listdir(dir_path):
         file_path = os.path.join(dir_path, f)
-        if os.path.isfile(file_path):
+        if os.path.isfile(file_path) and any(file_path.endswith(ext) for ext in [".json", ".yaml", ".yml", ".txt", ".log"]):
             print('-'*50)
             print(f"Contents of {file_path}:")
             with open(file_path, "r") as file:
@@ -63,7 +63,7 @@ def test_lean_data_extract(self):
         # Print the directory contents
         last_dir_path = os.path.join(".log/data_generation/benchmark/simple_benchmark_lean_ext", last_dir)
         print("Last Directory Contents:", os.listdir(last_dir_path))
-        train_data = os.path.join(last_dir_path, "train", "0")
+        train_data = os.path.join(last_dir_path, "train", "1")
         list_files = os.listdir(train_data)
         print("Train Directory Contents:", list_files)
         data_files = [f for f in list_files if f.endswith(".json") and f.startswith("local_data_")]

From d5b60058dc1cd3363c255eab7c2d2b04be217c09 Mon Sep 17 00:00:00 2001
From: Amitayush Thakur
Date: Fri, 21 Nov 2025 05:06:47 +0000
Subject: [PATCH 25/25] Bump version from 1.2.0 to 1.3.0 in pyproject.toml

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0432338..6cc0954 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
 build-backend = "hatchling.build"
 [project]
 name = "itp_interface"
-version = "1.2.0"
+version = "1.3.0"
 authors = [
   { name="Amitayush Thakur", email="amitayush@utexas.edu" },
 ]