nrimsky · twilligon · Feb 14, 2024
diff --git a/README.md b/README.md
@@ -96,30 +96,30 @@ For each behavior, we can evaluate the model on the following test sets:
 
 ```bash
 # Generate steering vectors for layers of the model for a certain behavior
-python generate_vectors.py --layers $(seq 0 31) --save_activations --model_size "7b" --behaviors sycophancy
+./generate_vectors.py --layers $(seq 0 31) --save_activations --model_size "7b" --behaviors sycophancy
 
 # Normalize steering vectors per layer to have the same norm
-python normalize_vectors.py
+./normalize_vectors.py
 
 # Evaluate model on A/B, open-ended or TruthfulQA test sets while using CAA
-python prompting_with_steering.py --behaviors sycophancy --layers $(seq 0 31) --multipliers -1 0 1 --type ab --model_size "7b"
-python prompting_with_steering.py --behaviors sycophancy --layers 13 --multipliers -2 -1.5 -1 -0.5 0 0.5 1 1.5 2 --type ab --model_size "7b" --system_prompt pos
+./prompting_with_steering.py --behaviors sycophancy --layers $(seq 0 31) --multipliers -1 0 1 --type ab --model_size "7b"
+./prompting_with_steering.py --behaviors sycophancy --layers 13 --multipliers -2 -1.5 -1 -0.5 0 0.5 1 1.5 2 --type ab --model_size "7b" --system_prompt pos
 
-# Plot PCA of constrastive activations
+# Plot PCA of contrastive activations
-python plot_activations.py --behaviors sycophancy --layers $(seq 0 31) --model_size "7b"
+./plot_activations.py --behaviors sycophancy --layers $(seq 0 31) --model_size "7b"
 
 # Plot results of CAA steering effect
-python plot_results.py --layers $(seq 0 31) --multipliers 1 --type ab
-python plot_results.py --layers $(seq 0 31) --multipliers -1 0 1 --behaviors sycophancy --type ab
+./plot_results.py --layers $(seq 0 31) --multipliers 1 --type ab
+./plot_results.py --layers $(seq 0 31) --multipliers -1 0 1 --behaviors sycophancy --type ab
 
 # Finetune a llama on a behavioral dataset using supervised finetuning on the A/B tokens
-python finetune_llama.py --behavior sycophancy --direction pos
+./finetune_llama.py --behavior sycophancy --direction pos
 
-# Plot similarites of steering vectors
+# Plot similarities of steering vectors
-python analyze_vectors.py
+./analyze_vectors.py
 
 # Use GPT-4 to score open-ended responses
-python scoring.py
+./scoring.py
 ```
 
 ## Running tests

diff --git a/analyze_vectors.py b/analyze_vectors.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 """
 Usage: python analyze_vectors.py
 """

diff --git a/finetune_llama.py b/finetune_llama.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 """
 Finetune the Llama model on the answer behavior dataset.
 

diff --git a/generate_vectors.py b/generate_vectors.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 """
 Generates steering vectors for each layer of the model by averaging the activations of all the positive and negative examples.
 

diff --git a/normalize_vectors.py b/normalize_vectors.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 from behaviors import ALL_BEHAVIORS, get_vector_path
 from utils.helpers import get_model_path
 import torch as t
@@ -36,4 +37,4 @@ def normalize_vectors(model_size: str, is_base: bool, n_layers: int):
 if __name__ == "__main__":
     normalize_vectors("7b", True, 32)
     normalize_vectors("7b", False, 32)
-    normalize_vectors("13b", False, 36)
+    normalize_vectors("13b", False, 36)
diff --git a/plot_activations.py b/plot_activations.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 """
-Script to plot PCA of constrastive activations
+Script to plot PCA of contrastive activations
 

diff --git a/plot_results.py b/plot_results.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 """
 Plot results from behavioral evaluations under steering.
 

diff --git a/process_raw_datasets.py b/process_raw_datasets.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 import os
 import json
 import random

diff --git a/prompting_with_steering.py b/prompting_with_steering.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 """
 Use CAA to steer the model
 

diff --git a/scoring.py b/scoring.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 import sys
 import os
 current = os.path.dirname(os.path.realpath(__file__))
@@ -100,4 +101,4 @@ def print_avg_score_util(file, score_key="score"):
 
 if __name__ == "__main__":
     # scoring()
-    scoring(behaviors=["hallucination", "myopic-reward"], overwrite=True)
+    scoring(behaviors=["hallucination", "myopic-reward"], overwrite=True)