Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 69 additions & 5 deletions apistemic/benchmarks/datasets/companies.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,87 @@
import logging

import pandas as pd
from sqlalchemy import MetaData
from sqlalchemy import Table
from sqlalchemy.orm import Session

from apistemic.benchmarks.datasets.util import get_db_engine

# Module-level cache for company data
_company_cache = {}


def clear_company_cache():
"""Clear the in-memory company cache."""
global _company_cache
_company_cache.clear()
logging.info("Company cache cleared")


def get_company_cache_stats():
"""Get statistics about the current cache state."""
return {
"cached_companies": len(_company_cache),
"cache_size_mb": sum(len(str(v)) for v in _company_cache.values())
/ (1024 * 1024),
}


def fetch_companies_df(company_ids):
"""Fetch company data with in-memory caching."""
company_ids = list(map(int, company_ids)) # Ensure IDs are integers

engine = get_db_engine()
# Check cache for existing companies
cached_companies = []
missing_ids = []

for company_id in company_ids:
if company_id in _company_cache:
cached_companies.append(_company_cache[company_id])
else:
missing_ids.append(company_id)

# create sqlalchemy query
# If all companies are cached, return them
if not missing_ids:
logging.debug(f"All {len(company_ids)} companies found in cache")
return pd.DataFrame(cached_companies)

# Fetch missing companies from database
logging.debug(
f"Fetching {len(missing_ids)} missing companies from database "
f"(cache has {len(cached_companies)})"
)

engine = get_db_engine()
metadata = MetaData()
organizations_table = Table("organizations", metadata, autoload_with=engine)

with Session(engine) as session:
query = session.query(organizations_table).filter(
organizations_table.c.id.in_(company_ids)
organizations_table.c.id.in_(missing_ids)
)
df = pd.read_sql_query(query.statement, con=engine)
return df
df_missing = pd.read_sql_query(query.statement, con=engine)

# Add newly fetched companies to cache
for _, row in df_missing.iterrows():
_company_cache[int(row["id"])] = row.to_dict()

logging.info(
f"Added {len(df_missing)} companies to cache. "
f"Cache now has {len(_company_cache)} companies"
)

# Combine cached and newly fetched companies
all_companies = cached_companies + [
row.to_dict() for _, row in df_missing.iterrows()
]

# Return in original order
company_dict = {comp["id"]: comp for comp in all_companies}
ordered_companies = [
company_dict[company_id]
for company_id in company_ids
if company_id in company_dict
]

return pd.DataFrame(ordered_companies)
50 changes: 50 additions & 0 deletions apistemic/benchmarks/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,3 +184,53 @@ def add_watermark() -> None:
va="bottom",
color="gray",
)


def create_profile_embedding_box_plot(
all_results: dict[str, list[EvaluationMetrics]],
figsize: tuple[float, float] = DEFAULT_FIGSIZE,
) -> None:
"""Create box plot of R² scores for profile embeddings benchmark."""
# Sort models by median R² score in ascending order (lowest bottom, highest top)
models = sorted(
all_results.keys(),
key=lambda x: np.median([metrics.r2 for metrics in all_results[x]]),
)
r2_scores = []

for model in models:
model_r2_scores = [metrics.r2 for metrics in all_results[model]]
r2_scores.append(model_r2_scores)

plt.style.use("grayscale")
plt.figure(figsize=figsize)
plt.tight_layout()
plt.boxplot(r2_scores, tick_labels=models, patch_artist=False, vert=False)

today = get_date_str()
plt.suptitle(f"LLM Profile Generation & Embedding Similarity ({today})")
plt.xlabel("R² Score")
plt.ylabel("LLM + Embedder Combination")
plt.grid(True, alpha=0.3, axis="x")
plt.yticks(rotation=0)

# Add watermark
add_watermark()

plt.tight_layout()

# Save the plot with distinct filename
plt.savefig(
".data/plots/profile-embedding-r2-boxplot.png", dpi=300, bbox_inches="tight"
)

# Print summary statistics
print("\n" + "=" * 60)
print("PROFILE EMBEDDING SUMMARY STATISTICS")
print("=" * 60)
for model, model_r2_scores in zip(models, r2_scores):
print(f"\n{model}:")
print(f" Mean R²: {np.mean(model_r2_scores):.4f}")
print(f" Std R²: {np.std(model_r2_scores):.4f}")
print(f" Min R²: {np.min(model_r2_scores):.4f}")
print(f" Max R²: {np.max(model_r2_scores):.4f}")
207 changes: 207 additions & 0 deletions apistemic/benchmarks/transformers.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,157 @@
import hashlib
import pickle
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain_core.embeddings.embeddings import Embeddings
from langchain_openai import ChatOpenAI
from minimalkv.fs import FilesystemStore
from pydantic import BaseModel
from pydantic import Field
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

from apistemic.benchmarks.datasets.companies import fetch_companies_df


class CompanyProfile(BaseModel):
summary: str = Field(
description="One-sentence tagline of the company, e.g. 'app-enabled car "
"hauling unicorn', 'company data provider', 'AI-powered leadgen agents."
)
business_model: str = Field(description="Business model of the company")
ideal_customer_profile: str = Field(description="Ideal customer profile")
usp: str = Field(
description="Unique selling proposition of the company against competitors"
)


class CompanyProfileTransformer(BaseEstimator, TransformerMixin):
"""Transformer to generate company profiles using an LLM."""

def __init__(self, llm=None):
self.llm = llm if llm else ChatOpenAI(model="gpt-5-nano")
self.structured_llm = None
self.store = FilesystemStore(".cache")
# Create a unique identifier for this LLM
self.llm_identifier = self._get_llm_identifier()

def _get_llm_identifier(self):
"""Create a unique identifier for the LLM based on its properties."""
# Try to get model name from different LLM types
if hasattr(self.llm, "model"):
return self.llm.model
elif hasattr(self.llm, "model_name"):
return self.llm.model_name
else:
# Fallback to hash of string representation
return hashlib.md5(str(self.llm).encode()).hexdigest()[:8]

def fit(self, X, y=None):
"""Initialize the structured LLM."""
self.structured_llm = self.llm.with_structured_output(CompanyProfile)

# Set fitted attributes for sklearn's check_is_fitted
self.n_features_in_ = len(X) if hasattr(X, "__len__") else 1
return self

def transform(self, X):
"""Generate company profiles for each company."""
if not isinstance(X, pd.DataFrame):
raise ValueError("Input must be pd.DataFrame")

df = X.copy()

def generate_profile_with_cache(company_name):
"""Generate profile with caching based on company name and LLM."""
cache_key = f"profile_{company_name}_{self.llm_identifier}"
cache_key = quote_plus(cache_key, safe="")

try:
cached_result = self.store.get(cache_key)
return pickle.loads(cached_result)
except KeyError:
# Not in cache, generate profile
prompt = (
"You're an expert company analyst."
"You will get a company name from me."
"Your task it then to create a company profile with the "
"given fields."
f"The company name is: {company_name}."
"Don't mention the company name in any of the answers."
)
result = self.structured_llm.invoke(prompt)
profile_dict = result.model_dump()

# Cache the result
self.store.put(cache_key, pickle.dumps(profile_dict))
return profile_dict

# Generate profiles for each company
profiles = []
with ThreadPoolExecutor(max_workers=16) as tpe:
list(
tqdm(
tpe.map(generate_profile_with_cache, df["name"].unique().tolist()),
total=len(df),
desc=f"Generating company profiles ({self.llm_identifier})",
)
)

# Convert profiles to DataFrame and merge with original
profiles = [generate_profile_with_cache(name) for name in df["name"].values]
profiles_df = pd.DataFrame(profiles)
for col in profiles_df.columns:
df[f"profile_{col}"] = profiles_df[col].values

return df


class CompanyProfileEmbeddingTransformer(BaseEstimator, TransformerMixin):
"""Transformer to embed company profile attributes."""

def __init__(self, embedder: Embeddings):
self.embedder = embedder

def fit(self, X, y=None):
"""Fit the transformer."""
self.n_features_in_ = (
X.shape[1]
if hasattr(X, "shape")
else len(X.columns)
if hasattr(X, "columns")
else 1
)
return self

def transform(self, X):
"""Embed profile attributes."""
if not isinstance(X, pd.DataFrame):
raise ValueError("Input must be pd.DataFrame")

df = X.copy()

# Cache embeddings
store = LocalFileStore("./cache/")
cached_embedder = CacheBackedEmbeddings.from_bytes_store(
self.embedder, store, namespace=self.embedder.model, key_encoder="sha256"
)

# Embed each profile attribute
profile_cols = [col for col in df.columns if col.startswith("profile_")]
for col in profile_cols:
embedding_col = f"embedding_{col}"
df[embedding_col] = cached_embedder.embed_documents(df[col].values.tolist())

return df


class LoadOrganizationTransformer(BaseEstimator, TransformerMixin):
def fit(self, X, y=None):
"""
Expand Down Expand Up @@ -151,3 +295,66 @@ def transform(self, X):
diff_cols = [f"embedding_diff_{i}" for i in range(len(embedding_diff.columns))]
embedding_diff.columns = diff_cols
return embedding_diff


class ProfileEmbeddingCosineSimilarityTransformer(BaseEstimator, TransformerMixin):
"""Compute cosine similarity between profile embeddings of company pairs."""

def fit(self, X, y=None):
"""Fit the transformer."""
self.n_features_in_ = (
X.shape[1]
if hasattr(X, "shape")
else len(X.columns)
if hasattr(X, "columns")
else 1
)
return self

def transform(self, X):
"""Compute cosine similarities for each profile attribute."""
df = X.copy()

# Find all embedding columns
from_embedding_cols = [
col for col in df.columns if col.startswith("from_embedding_profile_")
]
to_embedding_cols = [
col for col in df.columns if col.startswith("to_embedding_profile_")
]

if not from_embedding_cols or not to_embedding_cols:
raise ValueError(
"No profile embedding columns found in the input DataFrame"
)

# Compute cosine similarity for each profile attribute
similarity_features = []

for from_col in from_embedding_cols:
# Extract attribute name (e.g., "business_model" from
# "from_embedding_profile_business_model")
attribute = from_col.replace("from_embedding_profile_", "")
to_col = f"to_embedding_profile_{attribute}"

if to_col in to_embedding_cols:
# Compute cosine similarity for this attribute pair
similarities = []
for idx, row in df.iterrows():
from_emb = np.array(row[from_col]).reshape(1, -1)
to_emb = np.array(row[to_col]).reshape(1, -1)
sim = cosine_similarity(from_emb, to_emb)[0, 0]
similarities.append(sim)

# Add as a feature column
similarity_features.append(
pd.Series(
similarities, name=f"cosine_sim_{attribute}", index=df.index
)
)

# Return DataFrame with all similarity features
if similarity_features:
return pd.concat(similarity_features, axis=1)
else:
raise ValueError("No matching profile embedding pairs found")
Loading