Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,30 +96,30 @@ For each behavior, we can evaluate the model on the following test sets:

```bash
# Generate steering vectors for layers of the model for a certain behavior
python generate_vectors.py --layers $(seq 0 31) --save_activations --model_size "7b" --behaviors sycophancy
./generate_vectors.py --layers $(seq 0 31) --save_activations --model_size "7b" --behaviors sycophancy

# Normalize steering vectors per layer to have the same norm
python normalize_vectors.py
./normalize_vectors.py

# Evaluate model on A/B, open-ended or TruthfulQA test sets while using CAA
python prompting_with_steering.py --behaviors sycophancy --layers $(seq 0 31) --multipliers -1 0 1 --type ab --model_size "7b"
python prompting_with_steering.py --behaviors sycophancy --layers 13 --multipliers -2 -1.5 -1 -0.5 0 0.5 1 1.5 2 --type ab --model_size "7b" --system_prompt pos
./prompting_with_steering.py --behaviors sycophancy --layers $(seq 0 31) --multipliers -1 0 1 --type ab --model_size "7b"
./prompting_with_steering.py --behaviors sycophancy --layers 13 --multipliers -2 -1.5 -1 -0.5 0 0.5 1 1.5 2 --type ab --model_size "7b" --system_prompt pos

# Plot PCA of contrastive activations
python plot_activations.py --behaviors sycophancy --layers $(seq 0 31) --model_size "7b"
./plot_activations.py --behaviors sycophancy --layers $(seq 0 31) --model_size "7b"

# Plot results of CAA steering effect
python plot_results.py --layers $(seq 0 31) --multipliers 1 --type ab
python plot_results.py --layers $(seq 0 31) --multipliers -1 0 1 --behaviors sycophancy --type ab
./plot_results.py --layers $(seq 0 31) --multipliers 1 --type ab
./plot_results.py --layers $(seq 0 31) --multipliers -1 0 1 --behaviors sycophancy --type ab

# Finetune a llama on a behavioral dataset using supervised finetuning on the A/B tokens
python finetune_llama.py --behavior sycophancy --direction pos
./finetune_llama.py --behavior sycophancy --direction pos

# Plot similarities of steering vectors
python analyze_vectors.py
./analyze_vectors.py

# Use GPT-4 to score open-ended responses
python scoring.py
./scoring.py
```

## Running tests
Expand Down
1 change: 1 addition & 0 deletions analyze_vectors.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python
"""
Usage: python analyze_vectors.py
"""
Expand Down
1 change: 1 addition & 0 deletions finetune_llama.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python
"""
Finetune the Llama model on the answer behavior dataset.

Expand Down
1 change: 1 addition & 0 deletions generate_vectors.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python
"""
Generates steering vectors for each layer of the model by averaging the activations of all the positive and negative examples.

Expand Down
3 changes: 2 additions & 1 deletion normalize_vectors.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python
from behaviors import ALL_BEHAVIORS, get_vector_path
from utils.helpers import get_model_path
import torch as t
Expand Down Expand Up @@ -36,4 +37,4 @@ def normalize_vectors(model_size: str, is_base: bool, n_layers: int):
if __name__ == "__main__":
normalize_vectors("7b", True, 32)
normalize_vectors("7b", False, 32)
normalize_vectors("13b", False, 36)
normalize_vectors("13b", False, 36)
1 change: 1 addition & 0 deletions plot_activations.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python
"""
Script to plot PCA of contrastive activations

Expand Down
1 change: 1 addition & 0 deletions plot_results.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python
"""
Plot results from behavioral evaluations under steering.

Expand Down
1 change: 1 addition & 0 deletions process_raw_datasets.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python
import os
import json
import random
Expand Down
1 change: 1 addition & 0 deletions prompting_with_steering.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python
"""
Use CAA to steer the model

Expand Down
3 changes: 2 additions & 1 deletion scoring.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python
import sys
import os
current = os.path.dirname(os.path.realpath(__file__))
Expand Down Expand Up @@ -100,4 +101,4 @@ def print_avg_score_util(file, score_key="score"):

if __name__ == "__main__":
# scoring()
scoring(behaviors=["hallucination", "myopic-reward"], overwrite=True)
scoring(behaviors=["hallucination", "myopic-reward"], overwrite=True)