apistemic · lorey · Sep 7, 2025
diff --git a/apistemic/benchmarks/datasets/companies.py b/apistemic/benchmarks/datasets/companies.py
@@ -1,23 +1,87 @@
+import logging
+
 import pandas as pd
 from sqlalchemy import MetaData
 from sqlalchemy import Table
 from sqlalchemy.orm import Session
 
 from apistemic.benchmarks.datasets.util import get_db_engine
 
+# Module-level cache for company data
+_company_cache = {}
+
+
+def clear_company_cache():
+    """Clear the in-memory company cache."""
+    global _company_cache
+    _company_cache.clear()
+    logging.info("Company cache cleared")
+
+
+def get_company_cache_stats():
+    """Get statistics about the current cache state."""
+    return {
+        "cached_companies": len(_company_cache),
+        "cache_size_mb": sum(len(str(v)) for v in _company_cache.values())
+        / (1024 * 1024),
+    }
+
 
 def fetch_companies_df(company_ids):
+    """Fetch company data with in-memory caching."""
     company_ids = list(map(int, company_ids))  # Ensure IDs are integers
 
-    engine = get_db_engine()
+    # Check cache for existing companies
+    cached_companies = []
+    missing_ids = []
+
+    for company_id in company_ids:
+        if company_id in _company_cache:
+            cached_companies.append(_company_cache[company_id])
+        else:
+            missing_ids.append(company_id)
 
-    # create sqlalchemy query
+    # If all companies are cached, return them
+    if not missing_ids:
+        logging.debug(f"All {len(company_ids)} companies found in cache")
+        return pd.DataFrame(cached_companies)
+
+    # Fetch missing companies from database
+    logging.debug(
+        f"Fetching {len(missing_ids)} missing companies from database "
+        f"(cache has {len(cached_companies)})"
+    )
+
+    engine = get_db_engine()
     metadata = MetaData()
     organizations_table = Table("organizations", metadata, autoload_with=engine)
 
     with Session(engine) as session:
         query = session.query(organizations_table).filter(
-            organizations_table.c.id.in_(company_ids)
+            organizations_table.c.id.in_(missing_ids)
         )
-        df = pd.read_sql_query(query.statement, con=engine)
-        return df
+        df_missing = pd.read_sql_query(query.statement, con=engine)
+
+    # Add newly fetched companies to cache
+    for _, row in df_missing.iterrows():
+        _company_cache[int(row["id"])] = row.to_dict()
+
+    logging.info(
+        f"Added {len(df_missing)} companies to cache. "
+        f"Cache now has {len(_company_cache)} companies"
+    )
+
+    # Combine cached and newly fetched companies
+    all_companies = cached_companies + [
+        row.to_dict() for _, row in df_missing.iterrows()
+    ]
+
+    # Return in original order
+    company_dict = {comp["id"]: comp for comp in all_companies}
+    ordered_companies = [
+        company_dict[company_id]
+        for company_id in company_ids
+        if company_id in company_dict
+    ]
+
+    return pd.DataFrame(ordered_companies)
diff --git a/apistemic/benchmarks/plots.py b/apistemic/benchmarks/plots.py
@@ -184,3 +184,53 @@ def add_watermark() -> None:
         va="bottom",
         color="gray",
     )
+
+
+def create_profile_embedding_box_plot(
+    all_results: dict[str, list[EvaluationMetrics]],
+    figsize: tuple[float, float] = DEFAULT_FIGSIZE,
+) -> None:
+    """Create box plot of R² scores for profile embeddings benchmark."""
+    # Sort models by median R² score in ascending order (lowest bottom, highest top)
+    models = sorted(
+        all_results.keys(),
+        key=lambda x: np.median([metrics.r2 for metrics in all_results[x]]),
+    )
+    r2_scores = []
+
+    for model in models:
+        model_r2_scores = [metrics.r2 for metrics in all_results[model]]
+        r2_scores.append(model_r2_scores)
+
+    plt.style.use("grayscale")
+    plt.figure(figsize=figsize)
+    plt.tight_layout()
+    plt.boxplot(r2_scores, tick_labels=models, patch_artist=False, vert=False)
+
+    today = get_date_str()
+    plt.suptitle(f"LLM Profile Generation & Embedding Similarity ({today})")
+    plt.xlabel("R² Score")
+    plt.ylabel("LLM + Embedder Combination")
+    plt.grid(True, alpha=0.3, axis="x")
+    plt.yticks(rotation=0)
+
+    # Add watermark
+    add_watermark()
+
+    plt.tight_layout()
+
+    # Save the plot with distinct filename
+    plt.savefig(
+        ".data/plots/profile-embedding-r2-boxplot.png", dpi=300, bbox_inches="tight"
+    )
+
+    # Print summary statistics
+    print("\n" + "=" * 60)
+    print("PROFILE EMBEDDING SUMMARY STATISTICS")
+    print("=" * 60)
+    for model, model_r2_scores in zip(models, r2_scores):
+        print(f"\n{model}:")
+        print(f"  Mean R²: {np.mean(model_r2_scores):.4f}")
+        print(f"  Std R²:  {np.std(model_r2_scores):.4f}")
+        print(f"  Min R²:  {np.min(model_r2_scores):.4f}")
+        print(f"  Max R²:  {np.max(model_r2_scores):.4f}")
diff --git a/apistemic/benchmarks/transformers.py b/apistemic/benchmarks/transformers.py
@@ -1,13 +1,157 @@
+import hashlib
+import pickle
+from concurrent.futures import ThreadPoolExecutor
+from urllib.parse import quote_plus
+
+import numpy as np
 import pandas as pd
 from langchain.embeddings import CacheBackedEmbeddings
 from langchain.storage import LocalFileStore
 from langchain_core.embeddings.embeddings import Embeddings
+from langchain_openai import ChatOpenAI
+from minimalkv.fs import FilesystemStore
+from pydantic import BaseModel
+from pydantic import Field
 from sklearn.base import BaseEstimator
 from sklearn.base import TransformerMixin
+from sklearn.metrics.pairwise import cosine_similarity
+from tqdm import tqdm
 
 from apistemic.benchmarks.datasets.companies import fetch_companies_df
 
 
+class CompanyProfile(BaseModel):
+    summary: str = Field(
+        description="One-sentence tagline of the company, e.g. 'app-enabled car "
+        "hauling unicorn', 'company data provider', 'AI-powered leadgen agents."
+    )
+    business_model: str = Field(description="Business model of the company")
+    ideal_customer_profile: str = Field(description="Ideal customer profile")
+    usp: str = Field(
+        description="Unique selling proposition of the company against competitors"
+    )
+
+
+class CompanyProfileTransformer(BaseEstimator, TransformerMixin):
+    """Transformer to generate company profiles using an LLM."""
+
+    def __init__(self, llm=None):
+        self.llm = llm if llm else ChatOpenAI(model="gpt-5-nano")
+        self.structured_llm = None
+        self.store = FilesystemStore(".cache")
+        # Create a unique identifier for this LLM
+        self.llm_identifier = self._get_llm_identifier()
+
+    def _get_llm_identifier(self):
+        """Create a unique identifier for the LLM based on its properties."""
+        # Try to get model name from different LLM types
+        if hasattr(self.llm, "model"):
+            return self.llm.model
+        elif hasattr(self.llm, "model_name"):
+            return self.llm.model_name
+        else:
+            # Fallback to hash of string representation
+            return hashlib.md5(str(self.llm).encode()).hexdigest()[:8]
+
+    def fit(self, X, y=None):
+        """Initialize the structured LLM."""
+        self.structured_llm = self.llm.with_structured_output(CompanyProfile)
+
+        # Set fitted attributes for sklearn's check_is_fitted
+        self.n_features_in_ = len(X) if hasattr(X, "__len__") else 1
+        return self
+
+    def transform(self, X):
+        """Generate company profiles for each company."""
+        if not isinstance(X, pd.DataFrame):
+            raise ValueError("Input must be pd.DataFrame")
+
+        df = X.copy()
+
+        def generate_profile_with_cache(company_name):
+            """Generate profile with caching based on company name and LLM."""
+            cache_key = f"profile_{company_name}_{self.llm_identifier}"
+            cache_key = quote_plus(cache_key, safe="")
+
+            try:
+                cached_result = self.store.get(cache_key)
+                return pickle.loads(cached_result)
+            except KeyError:
+                # Not in cache, generate profile
+                prompt = (
+                    "You're an expert company analyst."
+                    "You will get a company name from me."
+                    "Your task it then to create a company profile with the "
+                    "given fields."
+                    f"The company name is: {company_name}."
+                    "Don't mention the company name in any of the answers."
+                )
+                result = self.structured_llm.invoke(prompt)
+                profile_dict = result.model_dump()
+
+                # Cache the result
+                self.store.put(cache_key, pickle.dumps(profile_dict))
+                return profile_dict
+
+        # Generate profiles for each company
+        profiles = []
+        with ThreadPoolExecutor(max_workers=16) as tpe:
+            list(
+                tqdm(
+                    tpe.map(generate_profile_with_cache, df["name"].unique().tolist()),
+                    total=len(df),
+                    desc=f"Generating company profiles ({self.llm_identifier})",
+                )
+            )
+
+        # Convert profiles to DataFrame and merge with original
+        profiles = [generate_profile_with_cache(name) for name in df["name"].values]
+        profiles_df = pd.DataFrame(profiles)
+        for col in profiles_df.columns:
+            df[f"profile_{col}"] = profiles_df[col].values
+
+        return df
+
+
+class CompanyProfileEmbeddingTransformer(BaseEstimator, TransformerMixin):
+    """Transformer to embed company profile attributes."""
+
+    def __init__(self, embedder: Embeddings):
+        self.embedder = embedder
+
+    def fit(self, X, y=None):
+        """Fit the transformer."""
+        self.n_features_in_ = (
+            X.shape[1]
+            if hasattr(X, "shape")
+            else len(X.columns)
+            if hasattr(X, "columns")
+            else 1
+        )
+        return self
+
+    def transform(self, X):
+        """Embed profile attributes."""
+        if not isinstance(X, pd.DataFrame):
+            raise ValueError("Input must be pd.DataFrame")
+
+        df = X.copy()
+
+        # Cache embeddings
+        store = LocalFileStore("./cache/")
+        cached_embedder = CacheBackedEmbeddings.from_bytes_store(
+            self.embedder, store, namespace=self.embedder.model, key_encoder="sha256"
+        )
+
+        # Embed each profile attribute
+        profile_cols = [col for col in df.columns if col.startswith("profile_")]
+        for col in profile_cols:
+            embedding_col = f"embedding_{col}"
+            df[embedding_col] = cached_embedder.embed_documents(df[col].values.tolist())
+
+        return df
+
+
 class LoadOrganizationTransformer(BaseEstimator, TransformerMixin):
     def fit(self, X, y=None):
         """
@@ -151,3 +295,66 @@ def transform(self, X):
         diff_cols = [f"embedding_diff_{i}" for i in range(len(embedding_diff.columns))]
         embedding_diff.columns = diff_cols
         return embedding_diff
+
+
+class ProfileEmbeddingCosineSimilarityTransformer(BaseEstimator, TransformerMixin):
+    """Compute cosine similarity between profile embeddings of company pairs."""
+
+    def fit(self, X, y=None):
+        """Fit the transformer."""
+        self.n_features_in_ = (
+            X.shape[1]
+            if hasattr(X, "shape")
+            else len(X.columns)
+            if hasattr(X, "columns")
+            else 1
+        )
+        return self
+
+    def transform(self, X):
+        """Compute cosine similarities for each profile attribute."""
+        df = X.copy()
+
+        # Find all embedding columns
+        from_embedding_cols = [
+            col for col in df.columns if col.startswith("from_embedding_profile_")
+        ]
+        to_embedding_cols = [
+            col for col in df.columns if col.startswith("to_embedding_profile_")
+        ]
+
+        if not from_embedding_cols or not to_embedding_cols:
+            raise ValueError(
+                "No profile embedding columns found in the input DataFrame"
+            )
+
+        # Compute cosine similarity for each profile attribute
+        similarity_features = []
+
+        for from_col in from_embedding_cols:
+            # Extract attribute name (e.g., "business_model" from
+            # "from_embedding_profile_business_model")
+            attribute = from_col.replace("from_embedding_profile_", "")
+            to_col = f"to_embedding_profile_{attribute}"
+
+            if to_col in to_embedding_cols:
+                # Compute cosine similarity for this attribute pair
+                similarities = []
+                for idx, row in df.iterrows():
+                    from_emb = np.array(row[from_col]).reshape(1, -1)
+                    to_emb = np.array(row[to_col]).reshape(1, -1)
+                    sim = cosine_similarity(from_emb, to_emb)[0, 0]
+                    similarities.append(sim)
+
+                # Add as a feature column
+                similarity_features.append(
+                    pd.Series(
+                        similarities, name=f"cosine_sim_{attribute}", index=df.index
+                    )
+                )
+
+        # Return DataFrame with all similarity features
+        if similarity_features:
+            return pd.concat(similarity_features, axis=1)
+        else:
+            raise ValueError("No matching profile embedding pairs found")