Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
18 changes: 18 additions & 0 deletions apistemic/benchmarks/datasets/recommendations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import pandas as pd

from apistemic.benchmarks.datasets.util import get_db_engine


def fetch_recommendations() -> pd.DataFrame:
    """Load recent, user-created domain-list entries joined with their domains.

    Returns a DataFrame with one row per entry created within the last year.
    """
    query_parts = [
        "SELECT *",
        " FROM domainlistentries dle",
        " JOIN domains d ON dle.domain_id = d.id",
        # restrict to recent ratings
        " WHERE dle.created_at > NOW() - '1 year'::interval",
        # drop entries not attributable to a user
        " AND dle.user_id IS NOT NULL",
    ]
    return pd.read_sql_query("".join(query_parts), con=get_db_engine())
47 changes: 47 additions & 0 deletions apistemic/benchmarks/llms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import os

from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings


def get_embedding_llms():
    """Return the list of embedding models benchmarked, Google first then OpenAI."""
    gemini_key = os.environ["GEMINI_API_KEY"]
    embedders = [
        GoogleGenerativeAIEmbeddings(
            model="gemini-embedding-001", google_api_key=gemini_key
        )
    ]
    openai_model_names = (
        "text-embedding-ada-002",
        "text-embedding-3-small",
        "text-embedding-3-large",
    )
    embedders.extend(OpenAIEmbeddings(model=name) for name in openai_model_names)
    return embedders


def get_chat_llms_by_key():
    """Return chat models keyed by a "<provider>__<short-name>" identifier."""
    gemini_key = os.environ["GEMINI_API_KEY"]
    llms = {}

    # Google models all share the same API key.
    for name in ("gemini-2.5-flash-lite", "gemini-2.5-flash", "gemini-2.5-pro"):
        llms[f"google__{name}"] = ChatGoogleGenerativeAI(
            model=name, google_api_key=gemini_key
        )

    # Anthropic models are pinned to dated snapshots and given a request timeout.
    anthropic_snapshots = {
        "anthropic__claude-opus-4-1": "claude-opus-4-1-20250805",
        "anthropic__claude-sonnet-4": "claude-sonnet-4-20250514",
        "anthropic__claude-3-5-haiku": "claude-3-5-haiku-20241022",
    }
    for key, snapshot in anthropic_snapshots.items():
        llms[key] = ChatAnthropic(model=snapshot, timeout=30)

    for name in ("gpt-5", "gpt-5-mini", "gpt-5-nano"):
        llms[f"openai__{name}"] = ChatOpenAI(model=name)

    return llms
56 changes: 56 additions & 0 deletions apistemic/benchmarks/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,62 @@ def get_date_str() -> str:
return date.today().strftime("%B %Y")


def create_recommendations_box_plot(
    all_results: dict[str, list[EvaluationMetrics]],
    figsize: tuple[float, float] = DEFAULT_FIGSIZE,
) -> None:
    """Create box plot of R² scores by embedding model for recommendations.

    Saves the figure to ``.data/plots/recommendations-r2-scores-boxplot.png``
    and prints per-model summary statistics to stdout.

    Args:
        all_results: mapping of embedding-model name to its evaluation metrics.
        figsize: figure size in inches as (width, height).
    """
    # Extract the R² samples once (the original recomputed them inside the
    # sort key and again in a second loop), then sort models by median score
    # ascending so the lowest-scoring model sits at the bottom of the plot.
    scores_by_model = {
        model: [metrics.r2 for metrics in metrics_list]
        for model, metrics_list in all_results.items()
    }
    models = sorted(scores_by_model, key=lambda m: np.median(scores_by_model[m]))
    r2_scores = [scores_by_model[model] for model in models]

    plt.style.use("grayscale")
    plt.figure(figsize=figsize)
    # NOTE: the redundant tight_layout() call that preceded plotting was
    # removed — layout is finalized below, after all artists are added.
    plt.boxplot(r2_scores, tick_labels=models, patch_artist=False, vert=False)

    today = get_date_str()
    plt.suptitle(
        "LLM Domain Knowledge:"
        " Can Domain Name Embeddings Predict User Preference?"
        f" ({today})"
    )
    plt.xlabel("R² Score")
    plt.ylabel("Embedding Model")
    plt.grid(True, alpha=0.3, axis="x")
    plt.yticks(rotation=0)

    # Add watermark
    add_watermark()

    plt.tight_layout()

    # Save the plot
    plt.savefig(
        ".data/plots/recommendations-r2-scores-boxplot.png",
        dpi=300,
        bbox_inches="tight",
    )

    # Print summary statistics
    print("\n" + "=" * 60)
    print("RECOMMENDATIONS SUMMARY STATISTICS")
    print("=" * 60)
    for model, model_r2_scores in zip(models, r2_scores):
        print(f"\n{model}:")
        print(f"  Mean R²: {np.mean(model_r2_scores):.4f}")
        print(f"  Std R²: {np.std(model_r2_scores):.4f}")
        print(f"  Min R²: {np.min(model_r2_scores):.4f}")
        print(f"  Max R²: {np.max(model_r2_scores):.4f}")


def add_watermark() -> None:
"""Add Apistemic watermark to current plot."""
plt.text(
Expand Down
82 changes: 82 additions & 0 deletions apistemic/benchmarks/transformers.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import numpy as np
import pandas as pd
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
Expand Down Expand Up @@ -81,6 +82,51 @@ def transform(self, X):
return df


class DomainEmbeddingTransformer(BaseEstimator, TransformerMixin):
    """Embed the ``domain`` column of a DataFrame using a cached embedder."""

    def __init__(self, embedder: Embeddings):
        self.embedder = embedder

    def fit(self, X, y=None):
        """Record the input width so sklearn's check_is_fitted passes."""
        if hasattr(X, "shape"):
            self.n_features_in_ = X.shape[1]
        elif hasattr(X, "columns"):
            self.n_features_in_ = len(X.columns)
        else:
            self.n_features_in_ = 1
        return self

    def transform(self, X):
        """Return embedding vectors for the ``domain`` column of X."""
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input must be pd.DataFrame")

        # Disk-backed cache so repeated runs reuse previously computed
        # embeddings; namespaced per embedding model to avoid collisions.
        cached_embedder = CacheBackedEmbeddings.from_bytes_store(
            self.embedder,
            LocalFileStore("./cache/"),
            namespace=self.embedder.model,
            key_encoder="sha256",
        )
        domains = X["domain"].values.tolist()
        return cached_embedder.embed_documents(domains)


class DomainEmbeddingExtractorTransformer(BaseEstimator, TransformerMixin):
    """Pull precomputed embedding vectors out of a DataFrame column."""

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X):
        """Stack the ``embedding_domain`` column into a 2-D numpy array."""
        vectors = list(X["embedding_domain"])
        return np.array(vectors)


class CompanyTupleTransformer(BaseEstimator, TransformerMixin):
# company pipeline turns ids to features
company_pipeline: TransformerMixin
Expand Down Expand Up @@ -151,3 +197,39 @@ def transform(self, X):
diff_cols = [f"embedding_diff_{i}" for i in range(len(embedding_diff.columns))]
embedding_diff.columns = diff_cols
return embedding_diff


class OneHotEncodingTransformer(BaseEstimator, TransformerMixin):
    """Create one-hot encoded features from a specific column.

    Output is a dense array (``sparse_output=False``); categories unseen
    during fit are encoded as all-zeros (``handle_unknown="ignore"``).
    """

    def __init__(self, column_name):
        self.column_name = column_name
        from sklearn.preprocessing import OneHotEncoder

        self.encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

    def _check_input(self, X):
        """Raise ValueError unless X is a DataFrame containing the target column."""
        # Shared by fit() and transform(), which previously duplicated
        # these checks verbatim.
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input must be pd.DataFrame")
        if self.column_name not in X.columns:
            raise ValueError(f"Column '{self.column_name}' not found in DataFrame")

    def fit(self, X, y=None):
        """Fit the one-hot encoder on the configured column."""
        self._check_input(X)
        self.encoder.fit(X[[self.column_name]])

        # Set fitted attribute for sklearn's check_is_fitted. X is guaranteed
        # to be a DataFrame here, so `shape` is always present (the previous
        # hasattr fallback to 1 was dead code).
        self.n_features_in_ = X.shape[1]
        return self

    def transform(self, X):
        """Return the dense one-hot encoding of the configured column."""
        self._check_input(X)
        return self.encoder.transform(X[[self.column_name]])
Loading