20 changes: 10 additions & 10 deletions frame/configs/funsearch-mid.yaml
@@ -15,18 +15,18 @@ defaults:

 programs_database:
   functions_per_prompt: 2
-  num_islands: 2
-  reset_period: 3600 # 1 hour in seconds
-  cluster_sampling_temperature_init: 0.1
-  cluster_sampling_temperature_period: 30000
+  num_islands: 4
+  reset_period: 360000 # 100 hours in seconds
+  cluster_sampling_temperature_init: 0.01
+  cluster_sampling_temperature_period: 1000
   samples_per_prompt: 2
   num_samplers: 2 # Single-threaded setup
   num_evaluators: 2 # Single-threaded setup
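Note: in FunSearch-style program databases, `cluster_sampling_temperature_init` typically sets the softmax temperature over cluster scores, so 0.01 is close to greedy selection. A minimal sketch of that mechanism, assuming this codebase follows the published FunSearch recipe (the helper below is illustrative, not frame's actual API):

```python
import numpy as np

def sample_cluster(scores: list[float], temperature: float) -> int:
    """Pick a cluster index via a softmax over scores; lower temperature
    concentrates sampling on the best-scoring clusters (0.01 is near-greedy)."""
    logits = np.asarray(scores) / max(temperature, 1e-9)  # guard against T == 0
    probs = np.exp(logits - logits.max())                 # numerically stable softmax
    probs /= probs.sum()
    return int(np.random.choice(len(scores), p=probs))

print(sample_cluster([0.2, 0.9, 0.5], temperature=0.01))  # almost always index 1
```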

 # LLM Config (For frame.tools.llm_caller)
 llm:
-  model_name: "gpt-4o-mini" # Less expensive model
-  temperature: 0.8
+  model_name: "gpt-4o" # More capable model
+  temperature: 1.0
   top_p: 0.95
   max_tokens: 1000 # Reduced token count
   # API key should be set via environment variable OPENAI_API_KEY
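Assuming `frame.tools.llm_caller` wraps the standard OpenAI chat API, these fields map onto a call roughly like the following (a sketch, not the module's actual code; the message content is a placeholder):

```python
from openai import OpenAI  # reads OPENAI_API_KEY from the environment

client = OpenAI()
response = client.chat.completions.create(
    model="gpt-4o",      # llm.model_name
    temperature=1.0,     # llm.temperature
    top_p=0.95,          # llm.top_p
    max_tokens=1000,     # llm.max_tokens
    messages=[{"role": "user", "content": "..."}],  # placeholder prompt
)
print(response.choices[0].message.content)
```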
@@ -39,8 +39,8 @@ prompt_yaml_path: ${hydra:runtime.cwd}/frame/configs/prompts/interestingness_prompt.yaml
 # tb_config_path: ??? # REQUIRED: Set via command line: python -m frame.funsearch.main tb_config_path=/path/to/your/tb_config.yaml
 tb_config_path: "${hydra:runtime.cwd}/frame/configs/succ_zero_generate_interestingness_funsearch.yaml" # Provide the default (which is the generate_interestingness exp)
 frame:
-  evaluation_episodes_M: 8 # Reduced episodes count (M)
-  evaluation_timeout_seconds: 30 # Reduced timeout to 5 minutes
+  evaluation_episodes_M: 64 # Episodes count (M)
+  evaluation_timeout_seconds: 120 # 2-minute timeout
   save_eval_visualizations: true # Flag to enable visualizations during evaluation

# Template/Specification File
@@ -60,15 +60,15 @@ iterations: 128
 # --- FunSearch Abstraction Settings ---
 abstraction:
   enabled: true # Master switch: set to false to disable the entire abstraction mechanism
-  frequency: 5 # Run the abstraction step every N main FunSearch iterations (low for testing).
+  frequency: 4 # Run the abstraction step every N main FunSearch iterations (low for testing).
   programs_to_sample: 3 # How many top programs to analyze (low for testing).
   max_abstractions_per_step: 2 # Target new abstractions per step (low for testing).
   # Optional LLM override block for abstraction task
   llm:
     model_name: null # Default: null (use main llm config)
     temperature: null # Default: null (use main llm config)
     top_p: null # Default: null (use main llm config)
-    max_tokens: 4096 # Can override max tokens if needed
+    max_tokens: 8192 # Can override max tokens if needed
   prompt_yaml_path: "${hydra:runtime.cwd}/frame/configs/prompts/abstraction_prompt.yaml" # Specific instructions for generating abstractions
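The null defaults above imply a merge where unset override fields fall back to the main `llm` block. A minimal sketch of that pattern (hypothetical helper, not frame's actual resolution code):

```python
def resolve_llm_config(main_llm: dict, override: dict) -> dict:
    """Return main_llm with any non-None override values applied on top."""
    resolved = dict(main_llm)
    resolved.update({k: v for k, v in override.items() if v is not None})
    return resolved

# Only max_tokens is overridden for the abstraction task; everything else falls back.
main_llm = {"model_name": "gpt-4o", "temperature": 1.0, "top_p": 0.95, "max_tokens": 1000}
override = {"model_name": None, "temperature": None, "top_p": None, "max_tokens": 8192}
assert resolve_llm_config(main_llm, override)["max_tokens"] == 8192
assert resolve_llm_config(main_llm, override)["model_name"] == "gpt-4o"
```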

# Hydra settings
@@ -10,21 +10,21 @@ defaults:
   - _self_

 programs_database:
-  functions_per_prompt: 1
+  functions_per_prompt: 2
   num_islands: 1
   reset_period: 3600 # 1 hour in seconds
-  cluster_sampling_temperature_init: 0.1
+  cluster_sampling_temperature_init: 0.01
   cluster_sampling_temperature_period: 30000
   samples_per_prompt: 2
   num_samplers: 1 # Single-threaded setup
   num_evaluators: 1 # Single-threaded setup

 # LLM Config (For frame.tools.llm_caller)
 llm:
-  model_name: "gpt-4o-mini" # Less expensive model
-  temperature: 0.8
+  model_name: "gpt-4o" # More capable model
+  temperature: 1.0
   top_p: 0.95
-  max_tokens: 1000 # Reduced token count
+  max_tokens: 8192 # Increased token count
   # API key should be set via environment variable OPENAI_API_KEY

 # Path to the interestingness prompt YAML file
@@ -36,7 +36,7 @@ prompt_yaml_path: ${hydra:runtime.cwd}/frame/configs/prompts/interestingness_prompt.yaml
 tb_config_path: "${hydra:runtime.cwd}/frame/configs/succ_zero_generate_interestingness_funsearch.yaml" # Provide the default (which is the generate_interestingness exp)
 frame:
   evaluation_episodes_M: 1 # Reduced episodes count (M)
-  evaluation_timeout_seconds: 300 # Reduced timeout to 5 minutes
+  evaluation_timeout_seconds: 15 # 15-second timeout
   save_eval_visualizations: true # Flag to enable visualizations during evaluation

# Template/Specification File
@@ -54,7 +54,7 @@ evaluations_dir: "${hydra:runtime.output_dir}/evaluations"
 initial_state: "${hydra:runtime.cwd}/frame/configs/theory_building/initial_state/succ_zero_eq.yaml"

 # Iterations of the Funsearch sampling loop
-iterations: 8
+iterations: 2

# --- FunSearch Abstraction Settings ---
abstraction:
2 changes: 1 addition & 1 deletion frame/configs/models/gpt4o-mini.yaml
@@ -1,4 +1,4 @@
-name: gpt-4o-mini
+name: gpt-4o
 provider: openai
 temperature: 0.9
 max_tokens: 8192
8 changes: 6 additions & 2 deletions frame/configs/prompts/abstraction_prompt.yaml
@@ -16,8 +16,12 @@ abstraction_user_prompt_template: |
   Do not return any abstractions which are just the calculate_interestingness function themselves. Do not return any abstractions which do error-handling, or anything else unrelated to calculation of the interestingness.
   Keep the documentation concise: explain the arguments, and if the function outputs more than a float value (like a dictionary), explain what it returns in a condensed manner.
   Do not name any abstractions you make the same name as any of the abstractions already provided, and do not name it `calculate_interestingness`
-  Make sure the abstractions you generate are syntactically correct and do not call variables which don't exist.
+  Importantly, please do not make abstractions which seem to do things already handled by other abstractions! Do not "abstract" the abstractions themselves, and do not make an entire given program an abstraction; you can and should abstract smaller subroutines.
+  Make sure the abstractions you generate are syntactically correct and do not call variables which don't exist. The abstractions you choose do not have to be common to many of the given functions; they should just abstract out useful operations which are not already handled by other abstractions.
+  You can compose abstractions if you think it makes sense to do so, but importantly, make sure to keep the abstractions *general* - do not hard-code values when they could be passed as arguments, for instance.
+  Keep in mind that, in general, the examples of an entity will be tuples of ints. YOU MUST PROVIDE THE IMPLEMENTATION OF THE ABSTRACTIONS YOU RETURN!
+  Also, make the description you provide a clear description of what operation is carried out in the implementation; for example, if a measure involves a logarithm operation to implement a penalty, mention that in the description.

   Here are the functions you are given:
   {program_examples}
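To make the new instructions concrete, the kind of abstraction this prompt asks for is a small, general, documented helper rather than a whole scoring function. An illustrative example (hand-written here, not produced by the system):

```python
import math

def log_size_penalty(items: list, scale: float = 1.0) -> float:
    """Logarithmic penalty that grows with the number of items.

    Args:
        items: Collection to penalize, e.g. an entity's examples
            (each example being a tuple of ints).
        scale: Multiplier on the penalty; passed as an argument rather
            than hard-coded, keeping the abstraction general.

    Returns:
        scale * log(1 + len(items)) as a float.
    """
    return scale * math.log1p(len(items))
```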
26 changes: 16 additions & 10 deletions frame/configs/prompts/interestingness_prompt.yaml
@@ -36,7 +36,8 @@ one_shot_prompt:
   - You **MUST** respond with **only** the complete, syntactically correct Python code for the new function (`calculate_interestingness_vN`).
   - Include the `def calculate_interestingness_vN(...):` signature line and the function body. Add a concise docstring.
   - **DO NOT** include any introductory text, explanations, comments outside the function body, or usage examples in your response.
-  - If you use any of the primitives or abstractions, make sure you use them correctly. Provide the right inputs as described in the documentation given about the primitives!"
+  - If you use any of the primitives or abstractions, make sure you use them correctly. Provide the right inputs as described in the documentation given about the primitives!
+  - The descriptions of the primitives and abstractions indicate what arguments they take. Follow proper Python syntax. Watch out for potential division by zero errors."

   template: |
     def calculate_interestingness(entity_id: str, graph: KnowledgeGraph) -> float:
@@ -70,7 +71,7 @@ funsearch_system_prompt: |

   Your specific task is to **generate a new, potentially improved version** of the `calculate_interestingness` function, named `calculate_interestingness_vN`. You should **analyze all the example functions** provided in the user prompt (`_v0` to `_v(N-1)`) to understand different successful strategies and potentially combine or adapt their ideas.

-  The function you write will receive `entity_id` (string) and `graph` (a `KnowledgeGraph` object) as input. You can use the following methods on the `graph` object to get information about the entity (`entity_id`) or the graph itself:
+  The function you write will receive `entity_id` (string) and `graph` (a `KnowledgeGraph` object) as input. You can use the following methods on the `graph` object to get information about the entity (`entity_id`) or the graph itself; each description explains what arguments the method takes:
   {primitives_section}

   {abstraction_section}
@@ -84,7 +85,12 @@
   - Include the `def calculate_interestingness_vN(...):` signature line and the function body. Add a concise docstring.
   - **DO NOT** include any introductory text, explanations, comments outside the function body, or usage examples in your response.
   - Enclose the entire function definition within a single markdown code block like this:
-  - If you use any of the primitives or abstractions, make sure you use them correctly. Provide the right inputs as described in the documentation above!
+  - If you use any of the primitives or abstractions, make sure you use them correctly by supplying the proper arguments.
+  - Try not to rely on the abstractions alone - use them in a compositional way, where you also implement some of the logic yourself (passing interesting arguments to the abstractions counts).
+  - Try not to copy the examples exactly, but rather use them as inspiration to create a new, better function that *can* be similar.
+  - You do not have to use all primitives, and you do not have to make extremely complex functions if you don't think it necessary.
+  - Watch out for potential division by zero errors.

   ```python
   def calculate_interestingness_vN(entity_id: str, graph: KnowledgeGraph) -> float:
       """A new function version inspired by provided examples."""
@@ -134,13 +140,13 @@ primitive_categories:
     primitives:
       - name: "get_entity_node_type"
        return_type: "float"
-        description: "(entity_id, graph): Returns numeric type (Concept=1, Conj=2, Thm=3)."
+        description: "(entity_id, graph): Returns 'Concept', 'Conjecture', or 'Theorem' depending on the type of the entity."
       - name: "get_concept_category"
         return_type: "float"
-        description: "(entity_id, graph): Returns numeric concept category (Predicate=1, Func=2,...)."
+        description: "(entity_id, graph): Returns 'Predicate', 'Function', or 'Constant' depending on the type of the entity."
       - name: "get_input_arity"
         return_type: "int"
-        description: "(entity_id, graph): Returns input arity."
+        description: "(entity_id, graph): Returns input arity of the entity."
       - name: "get_num_component_types"
         return_type: "int"
         description: "(entity_id, graph): Returns number of component types in examples."
@@ -149,11 +155,11 @@
     description: "These primitives extract information about examples and non-examples of the entity."
     primitives:
       - name: "get_examples"
-        return_type: "List[Example]"
-        description: "(entity_id, graph): Returns list of positive examples."
+        return_type: "List[Tuple[Int]]"
+        description: "(entity_id, graph): Returns list of positive examples; each example is a tuple of ints."
       - name: "get_nonexamples"
-        return_type: "List[Example]"
-        description: "(entity_id, graph): Returns list of negative examples."
+        return_type: "List[Tuple[Int]]"
+        description: "(entity_id, graph): Returns list of negative examples; each example is a tuple of ints."

   "Construction Step Primitives":
     description: "These primitives extract information about how the entity was constructed."
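Putting the documented primitives together, a function of the kind these prompts solicit might look like the sketch below. It is illustrative only: it assumes the primitives are exposed as free functions taking `(entity_id, graph)`, as their descriptions indicate, and includes the division-by-zero guard the prompt now insists on.

```python
def calculate_interestingness_vN(entity_id: str, graph: KnowledgeGraph) -> float:
    """Scores an entity by example support and signature simplicity."""
    examples = get_examples(entity_id, graph)        # List[Tuple[int, ...]]
    nonexamples = get_nonexamples(entity_id, graph)  # List[Tuple[int, ...]]
    total = len(examples) + len(nonexamples)
    if total == 0:
        return 0.0  # guard against division by zero
    support = len(examples) / total                  # fraction of positive examples
    simplicity = 1.0 / (1.0 + get_input_arity(entity_id, graph))
    return 0.7 * support + 0.3 * simplicity
```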
8 changes: 4 additions & 4 deletions frame/configs/succ_zero_base.yaml
@@ -4,7 +4,7 @@
 defaults:
   - theory_building/production_rules: default # Use all available rules
   - theory_building/interestingness: default # Use default interestingness measures
-  - theory_building/initial_state: succ_zero # Default initial state
+  - theory_building/initial_state: succ_zero_eq # Default initial state
   # Note(George; 4/25): This is the initial state that will be used, can be subbed out for: succ_zero, succ_zero_eq and arithmetic_basic
   - interestingness: default # Use default HR interestingness functions
   - _self_
@@ -15,9 +15,9 @@ experiment:
   description: "Discover mathematical concepts starting from just successor and zero"
   num_episodes: 1
   seed: 0
-  max_steps: 80 # Default max steps
-  episode_timeout_seconds: 60 # Timeout for the main process waiting for a worker episode result
-  num_workers: 1 # Number of parallel worker processes
+  max_steps: 1000 # Default max steps
+  episode_timeout_seconds: 120 # Timeout for the main process waiting for a worker episode result
+  num_workers: 64 # Number of parallel worker processes

# Timeout settings
timeouts:
16 changes: 8 additions & 8 deletions frame/configs/succ_zero_generate_interestingness_funsearch.yaml
@@ -24,11 +24,11 @@ initial_state:
 experiment:
   name: "succ_zero_multi_generated_interestingness_eval"
   description: "Generate N interestingness functions and evaluate each M times, starting from successor and zero."
-  max_steps: 80
-  num_episodes: 20
-  num_workers: 8
-  seed: ${oc.env:RANDOM_SEED,12345}
-  episode_timeout_seconds: 15 # Timeout for the main process waiting for a worker episode result
+  max_steps: 1000
+  num_episodes: 64
+  num_workers: 64
+  seed: 0 #${oc.env:RANDOM_SEED,12345}
+  episode_timeout_seconds: 120 # Timeout for the main process waiting for a worker episode result

# --- New settings for N x M Evaluation ---
evaluate_multiple_interestingness: True
@@ -54,13 +54,13 @@ policy:
   type: interestingness_guided
   params:
     concept_selection: INTERESTINGNESS
-    action_selection: RULE_BASED_RANDOM # TODO(George; 4/7): Change this back to SIMULATE_AND_SCORE when that implementation has been verified.
-    top_k_concepts: 5
+    action_selection: SIMULATE_AND_SCORE
+    top_k_concepts: 8
     temperature: 1.0
     generate_interestingness: True
     interestingness_function_path: ""
     action_selection_params:
-      simulation_limit: 20
+      simulation_limit: 3

# Production Rules configuration (Explicitly added for manual loading via OmegaConf.load)
production_rules:
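The N x M scheme named in the experiment description (generate N interestingness functions, evaluate each over M episodes) reduces to a loop like the following. The generation and episode helpers here are toy stand-ins, not frame's actual entry points:

```python
import random
import statistics

def generate_interestingness_function(version: int):
    """Toy stand-in for the LLM generation step."""
    return lambda: random.random() * (version + 1)

def run_episode(fn, seed: int) -> float:
    """Toy stand-in for one TheoryBuilder episode scored by fn."""
    random.seed(seed)
    return fn()

def evaluate_n_by_m(n_functions: int, m_episodes: int) -> dict:
    """Score each of N functions as its mean over M episodes."""
    return {
        f"calculate_interestingness_v{n}": statistics.mean(
            run_episode(generate_interestingness_function(n), seed=m)
            for m in range(m_episodes)
        )
        for n in range(n_functions)
    }

print(evaluate_n_by_m(n_functions=3, m_episodes=4))
```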
@@ -29,7 +29,7 @@ policy:
   params:
     concept_selection: INTERESTINGNESS
     action_selection: SIMULATE_AND_SCORE # RULE_BASED_RANDOM # TODO(George; 4/7): Change this back to SIMULATE_AND_SCORE when that implementation has been verified.
-    top_k_concepts: 5
+    top_k_concepts: 8
     temperature: 0.5
     generate_interestingness: True
     interestingness_function_path: ""
2 changes: 1 addition & 1 deletion frame/configs/succ_zero_standard_experiment.yaml
@@ -20,7 +20,7 @@ policy:
   type: interestingness_guided
   params:
     concept_selection: INTERESTINGNESS
-    top_k_concepts: 5
+    top_k_concepts: 8
     temperature: 0.5
     simulation_limit: 3
     action_selection: SIMULATE_AND_SCORE # RULE_BASED_RANDOM
@@ -9,6 +9,7 @@ import_from: "frame.knowledge_base.initial_concepts"
 concepts:
   - "create_zero_concept"
   - "create_one_concept"
+  - "create_two_concept"
   - "create_addition_concept"
   - "create_multiplication_concept"
   - "create_divides_concept"
33 changes: 28 additions & 5 deletions frame/funsearch/implementation/abstraction_library.py
@@ -2,6 +2,8 @@
 import dataclasses
 from typing import Optional, List, Dict
 import logging
+import textwrap
+import random  # Used to pick which abstractions get their source inlined in the prompt

 logger = logging.getLogger(__name__)
@@ -52,11 +54,32 @@ def format_for_sampler_prompt(self) -> str:
             return " (No abstraction functions available in this island yet.)"

         prompt_lines = ["**Available Abstraction Functions:**"]
-        # Sort alphabetically for consistent prompt appearance
-        sorted_abstractions = sorted(self._abstractions.values(), key=lambda x: x.name)
-        for abs_obj in sorted_abstractions:
-            # Format consistently with DSL primitives if possible
-            prompt_lines.append(f"- `{abs_obj.name}{abs_obj.signature}`: {abs_obj.description}")
+
+        abstractions_list = list(self._abstractions.values())
+        num_abstractions = len(abstractions_list)
+
+        # Randomly pick up to 2 abstractions whose full source will be shown
+        indices_to_show_code = []
+        if num_abstractions > 0:
+            k = min(num_abstractions, 2)
+            indices_to_show_code = random.sample(range(num_abstractions), k)
+
+        # Sort alphabetically for consistent prompt appearance (after the random
+        # selection); keep original indices so the choice above still applies
+        sorted_abstractions_with_original_indices = sorted(
+            [(i, abs_obj) for i, abs_obj in enumerate(abstractions_list)],
+            key=lambda x: x[1].name
+        )
+
+        for original_idx, abs_obj in sorted_abstractions_with_original_indices:
+            prompt_lines.append(f"- `{abs_obj.name}{abs_obj.signature}`: {abs_obj.description}")
+            if original_idx in indices_to_show_code:
+                prompt_lines.append("  ```python")
+                prompt_lines.append(textwrap.indent(abs_obj.code, '  '))
+                prompt_lines.append("  ```")
         return "\n".join(prompt_lines)

def generate_definitions_for_sandbox(self) -> str:
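A self-contained rerun of the new selection logic, useful for sanity-checking the prompt formatting (toy data in place of frame's Abstraction objects):

```python
import random
import textwrap

abstractions = {
    "a_helper": "def a_helper(xs):\n    return len(xs)",
    "b_helper": "def b_helper(x, y):\n    return x / y if y else 0.0",
}

# Mirror of format_for_sampler_prompt: list every abstraction,
# but inline full source for at most 2 randomly chosen entries.
names = list(abstractions)
show_code = set(random.sample(range(len(names)), min(len(names), 2)))

lines = ["**Available Abstraction Functions:**"]
for i, name in sorted(enumerate(names), key=lambda p: p[1]):
    lines.append(f"- `{name}`")
    if i in show_code:
        lines.append(textwrap.indent(abstractions[name], "    "))
print("\n".join(lines))
```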
4 changes: 2 additions & 2 deletions frame/funsearch/main.py
@@ -133,7 +133,7 @@ def __init__(self, cfg: DictConfig):
         # --- Safeguard for excessive parallelism ---
         # Access TheoryBuilder's num_workers from the base config loaded by the sandbox
         tb_base_cfg = self.sandbox.base_cfg
-        tb_num_workers = omegaconf.OmegaConf.select(tb_base_cfg, 'experiment.num_workers', default=1)
+        tb_num_workers = omegaconf.OmegaConf.select(tb_base_cfg, 'experiment.num_workers', default=64)

         # Access num_islands from FunSearch config (after database setup)
         num_islands = self.cfg.programs_database.num_islands
@@ -145,7 +145,7 @@
             f"FunSearch programs_database.num_islands ({num_islands}) = {total_potential_tb_workers}")

         # Exceeding WORKER_LIMIT raises an error.
-        WORKER_LIMIT = 64
+        WORKER_LIMIT = 256
         if total_potential_tb_workers > WORKER_LIMIT:
             error_msg = (
                 f"Potential concurrent TheoryBuilder worker count ({total_potential_tb_workers}) meets or exceeds limit of {WORKER_LIMIT}. "
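The log format string above implies the safeguard multiplies TheoryBuilder's per-run `num_workers` by FunSearch's `num_islands`. A standalone sketch of that check (an interpretation of the snippet, not the verbatim code):

```python
def check_worker_budget(tb_num_workers: int, num_islands: int, limit: int = 256) -> int:
    """Raise if the worst-case concurrent TheoryBuilder worker count exceeds the limit."""
    total = tb_num_workers * num_islands
    if total > limit:
        raise ValueError(
            f"Potential concurrent TheoryBuilder worker count ({total}) "
            f"exceeds limit of {limit}."
        )
    return total

check_worker_budget(tb_num_workers=64, num_islands=4)    # 256 passes: the check is strictly greater-than
# check_worker_budget(tb_num_workers=64, num_islands=8)  # 512 raises ValueError
```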
10 changes: 5 additions & 5 deletions frame/interestingness/learning/algorithms/one_shot_llm.py
@@ -129,9 +129,9 @@ def _load_model_config(self, config_path: str) -> Any:
             # Create a default config
             class DefaultConfig:
                 def __init__(self):
-                    self.name = "gpt-4o-mini"
-                    self.temperature = 0.7
-                    self.max_tokens = 4000
+                    self.name = "gpt-4o"
+                    self.temperature = 1.0
+                    self.max_tokens = 4096
                     self.system_prompt = "You are a mathematical assistant that specializes in creating interestingness functions."

             return DefaultConfig()
@@ -711,8 +711,8 @@ async def main():

     # Create a generator
     generator = OneShotLLMGenerator(
-        model="gpt-4o-mini",
-        temperature=0.7,
+        model="gpt-4o",
+        temperature=1.0,
         logger=logger
     )

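For context, `_load_model_config` appears to fall back to this built-in default when the YAML config cannot be loaded; the standalone sketch below mirrors that pattern (assumed flow, with PyYAML standing in for the actual loader):

```python
from types import SimpleNamespace
import yaml  # PyYAML

def load_model_config(config_path: str) -> SimpleNamespace:
    """Load a model config from YAML, else return built-in defaults."""
    try:
        with open(config_path) as f:
            return SimpleNamespace(**yaml.safe_load(f))
    except (OSError, yaml.YAMLError):
        return SimpleNamespace(
            name="gpt-4o",
            temperature=1.0,
            max_tokens=4096,
            system_prompt=(
                "You are a mathematical assistant that specializes "
                "in creating interestingness functions."
            ),
        )
```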