Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 13 additions & 9 deletions apps/autointerp/neuronpedia_autointerp/routes/explain/default.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,39 @@
import traceback

import torch
from delphi.clients import OpenRouter
from delphi.explainers import DefaultExplainer
from delphi.explainers.explainer import ExplainerResult
from delphi.latents.latents import Example, Latent, LatentRecord
from fastapi import HTTPException
from neuronpedia_autointerp_client.models.explain_default_post200_response import (
ExplainDefaultPost200Response,
)
from neuronpedia_autointerp_client.models.explain_default_post_request import (
ExplainDefaultPostRequest,
)
from sae_auto_interp.clients import OpenRouter
from sae_auto_interp.explainers import DefaultExplainer
from sae_auto_interp.explainers.explainer import ExplainerResult
from sae_auto_interp.features import Example, Feature, FeatureRecord


async def explain_default(request: ExplainDefaultPostRequest):
"""
Generate an explanation for a given set of activations.
"""
try:
feature = Feature("feature", 0)
feature = Latent("feature", 0)
examples = []
for activation in request.activations:
example = Example(activation.tokens, torch.tensor(activation.values)) # type: ignore
example = Example(
tokens=activation.tokens, # type: ignore
activations=torch.tensor(activation.values),
str_tokens=activation.tokens,
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This does not seem correct. We expect str_tokens to be the tokens decoded into strings. Do you have access to those in your request?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

activation.tokens is a list of strings. I think what's confusing is that we're able to pass activation.tokens to Example.tokens despite https://github.com/EleutherAI/delphi/blob/db49cb78120c1926a4a3c4928c76ece6be64dcb3/delphi/latents/latents.py#L66-L73. But https://github.com/EleutherAI/delphi/blob/db49cb78120c1926a4a3c4928c76ece6be64dcb3/delphi/scorers/embedding/embedding.py#L111-L121 shows that Example.tokens can be either a list of integers or a list of strings.

Do we want to keep both Example.tokens and Example.str_tokens in delphi but make changes so tokens can only be a list of strings?

If so, I can update the requests in this repo, in a separate PR, and make the changes to delphi as well.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, this is definitely a mistake on our part that we let slip through. Example.tokens should only ever be a list of integers, and Example.str_tokens the corresponding strings. In this case, since you don't care about the integers, could you just pass a dummy list?

)
examples.append(example)
feature_record = FeatureRecord(feature)
feature_record = LatentRecord(feature)
feature_record.train = examples

client = OpenRouter(api_key=request.openrouter_key, model=request.model)
explainer = DefaultExplainer(client, tokenizer=None, threshold=0.6)
result: ExplainerResult = await explainer.__call__(feature_record) # type: ignore
explainer = DefaultExplainer(client, threshold=0.6, activations=False)
result: ExplainerResult = await explainer.__call__(feature_record)

return ExplainDefaultPost200Response(explanation=result.explanation)

Expand Down
42 changes: 27 additions & 15 deletions apps/autointerp/neuronpedia_autointerp/routes/score/embedding.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,25 @@
import traceback

import torch
from fastapi import HTTPException
from neuronpedia_autointerp.utils import (
convert_embedding_output_to_score_embedding_output,
per_feature_scores_embedding,
from delphi.latents.latents import (
Example,
Latent,
LatentRecord,
)
from delphi.scorers import EmbeddingScorer
from delphi.scorers.scorer import ScorerResult
from fastapi import HTTPException
from neuronpedia_autointerp_client.models.score_embedding_post200_response import (
ScoreEmbeddingPost200Response,
)
from neuronpedia_autointerp_client.models.score_embedding_post_request import (
ScoreEmbeddingPostRequest,
)
from sae_auto_interp.features import Example, Feature, FeatureRecord
from sae_auto_interp.scorers import EmbeddingScorer
from sae_auto_interp.scorers.scorer import ScorerResult

from neuronpedia_autointerp.utils import (
convert_embedding_output_to_score_embedding_output,
per_feature_scores_embedding,
)


async def generate_score_embedding(request: ScoreEmbeddingPostRequest, model): # type: ignore
Expand All @@ -30,25 +35,32 @@ async def generate_score_embedding(request: ScoreEmbeddingPostRequest, model):
Returns a score based on embedding similarity and a detailed breakdown of the scoring.
"""
try:
feature = Feature("feature", 0)
feature = Latent("feature", 0)
activating_examples = []
non_activating_examples = []

for activation in request.activations:
example = Example(activation.tokens, torch.tensor(activation.values)) # type: ignore
example = Example(
tokens=activation.tokens, # type: ignore
activations=torch.tensor(activation.values),
)
if sum(activation.values) > 0:
activating_examples.append(example)
activating_examples.append(
[
example
] # TODO: remove brackets once https://github.com/EleutherAI/delphi/issues/132 is fixed
)
else:
non_activating_examples.append(example)

feature_record = FeatureRecord(feature)
feature_record.test = [activating_examples]
feature_record.extra_examples = non_activating_examples # type: ignore
feature_record.random_examples = non_activating_examples # type: ignore
feature_record = LatentRecord(feature)
feature_record.test = activating_examples
feature_record.not_active = non_activating_examples
feature_record.extra_examples = non_activating_examples
feature_record.explanation = request.explanation # type: ignore

scorer = EmbeddingScorer(model)
result: ScorerResult = await scorer.__call__(feature_record) # type: ignore
result: ScorerResult = await scorer.__call__(feature_record)
score = per_feature_scores_embedding(result.score)
breakdown = [
convert_embedding_output_to_score_embedding_output(item)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
import traceback

import torch
from delphi.clients import OpenRouter
from delphi.latents.latents import (
ActivatingExample,
Latent,
LatentRecord,
NonActivatingExample,
)
from delphi.scorers import DetectionScorer, FuzzingScorer
from delphi.scorers.scorer import ScorerResult
from fastapi import HTTPException
from neuronpedia_autointerp_client.models.np_score_fuzz_detection_type import (
NPScoreFuzzDetectionType,
Expand All @@ -11,10 +20,6 @@
from neuronpedia_autointerp_client.models.score_fuzz_detection_post_request import (
ScoreFuzzDetectionPostRequest,
)
from sae_auto_interp.clients import OpenRouter
from sae_auto_interp.features import Example, Feature, FeatureRecord
from sae_auto_interp.scorers import DetectionScorer, FuzzingScorer
from sae_auto_interp.scorers.scorer import ScorerResult

from neuronpedia_autointerp.utils import (
convert_classifier_output_to_score_classifier_output,
Expand All @@ -39,45 +44,54 @@ async def generate_score_fuzz_detection(request: ScoreFuzzDetectionPostRequest):
We currently show 5 examples at a time (batch_size=5).
"""
try:
feature = Feature("feature", 0)
feature = Latent("feature", 0)
activating_examples = []
non_activating_examples = []

for activation in request.activations:
example = Example(activation.tokens, torch.tensor(activation.values)) # type: ignore
if sum(activation.values) > 0:
example = ActivatingExample(
tokens=activation.tokens, # type: ignore
activations=torch.tensor(activation.values),
str_tokens=activation.tokens,
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same thing here: str_tokens should be the decoded token strings.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See my similar reply in #113 (comment).

quantile=1,
)
activating_examples.append(example)
else:
example = NonActivatingExample(
tokens=activation.tokens, # type: ignore
activations=torch.tensor(activation.values),
str_tokens=activation.tokens,
distance=-1,
)
non_activating_examples.append(example)

feature_record = FeatureRecord(feature)
feature_record.test = [activating_examples]
feature_record.extra_examples = non_activating_examples # type: ignore
feature_record.random_examples = non_activating_examples # type: ignore
feature_record.explanation = request.explanation # type: ignore
feature_record = LatentRecord(feature)
feature_record.test = activating_examples
feature_record.not_active = non_activating_examples
feature_record.extra_examples = non_activating_examples
feature_record.explanation = request.explanation

client = OpenRouter(api_key=request.openrouter_key, model=request.model)

if request.type == NPScoreFuzzDetectionType.FUZZ:
scorer = FuzzingScorer(
client,
tokenizer=None, # type: ignore
batch_size=5,
verbose=False,
log_prob=False,
)
elif request.type == NPScoreFuzzDetectionType.DETECTION:
scorer = DetectionScorer(
client,
tokenizer=None, # type: ignore
batch_size=5,
verbose=False,
log_prob=False,
)
else:
raise HTTPException(status_code=400, detail="Invalid scoring type")

result: ScorerResult = await scorer.__call__(feature_record) # type: ignore
result: ScorerResult = await scorer.__call__(feature_record)
score = per_feature_scores_fuzz_detection(result.score)

breakdown = [
Expand Down
22 changes: 7 additions & 15 deletions apps/autointerp/neuronpedia_autointerp/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,25 +14,23 @@

def per_feature_scores_embedding(score_data: list[dict[Any, Any]]) -> float:
data_df = pd.DataFrame(score_data)
data_df["ground_truth"] = data_df["distance"] > 0
auc_score = float(roc_auc_score(data_df["ground_truth"], data_df["similarity"]))
data_df["activating"] = data_df["distance"] > 0
auc_score = float(roc_auc_score(data_df["activating"], data_df["similarity"]))
return auc_score # noqa: RET504


def calculate_balanced_accuracy(dataframe: pd.DataFrame) -> float:
tp = len(
dataframe[(dataframe["ground_truth"] == True) & (dataframe["correct"] == True)]
dataframe[(dataframe["activating"] == True) & (dataframe["correct"] == True)]
)
tn = len(
dataframe[(dataframe["ground_truth"] == False) & (dataframe["correct"] == True)]
dataframe[(dataframe["activating"] == False) & (dataframe["correct"] == True)]
)
fp = len(
dataframe[
(dataframe["ground_truth"] == False) & (dataframe["correct"] == False)
]
dataframe[(dataframe["activating"] == False) & (dataframe["correct"] == False)]
)
fn = len(
dataframe[(dataframe["ground_truth"] == True) & (dataframe["correct"] == False)]
dataframe[(dataframe["activating"] == True) & (dataframe["correct"] == False)]
)
recall = 0 if tp + fn == 0 else tp / (tp + fn)
return 0 if tn + fp == 0 else (recall + tn / (tn + fp)) / 2
Expand All @@ -50,18 +48,12 @@ def per_feature_scores_fuzz_detection(
def convert_classifier_output_to_score_classifier_output(
classifier_output: ScoreFuzzDetectionPost200ResponseBreakdownInner,
) -> ScoreFuzzDetectionPost200ResponseBreakdownInner:
# if prediction is -1, count it as false (it's an error state)
# https://github.com/EleutherAI/sae-auto-interp/issues/46
# TODO: fix this in sae-auto-interp - it should be a boolean as specified in: https://github.com/EleutherAI/sae-auto-interp/blob/3659ff3bfefbe2628d37484e5bcc0087a5b10a27/sae_auto_interp/scorers/classifier/sample.py#L19
if classifier_output.prediction == -1:
classifier_output.prediction = False
return ScoreFuzzDetectionPost200ResponseBreakdownInner(
str_tokens=classifier_output.str_tokens,
activations=classifier_output.activations,
distance=classifier_output.distance,
ground_truth=classifier_output.ground_truth,
activating=classifier_output.activating,
prediction=bool(classifier_output.prediction),
highlighted=classifier_output.highlighted,
probability=classifier_output.probability,
correct=classifier_output.correct,
)
Expand Down
Loading