diff --git a/README.md b/README.md index 33ec270e8..f58f48b37 100644 --- a/README.md +++ b/README.md @@ -96,30 +96,30 @@ For each behavior, we can evaluate the model on the following test sets: ```bash # Generate steering vectors for layers of the model for a certain behavior -python generate_vectors.py --layers $(seq 0 31) --save_activations --model_size "7b" --behaviors sycophancy +./generate_vectors.py --layers $(seq 0 31) --save_activations --model_size "7b" --behaviors sycophancy # Normalize steering vectors per layer to have the same norm -python normalize_vectors.py +./normalize_vectors.py # Evaluate model on A/B, open-ended or TruthfulQA test sets while using CAA -python prompting_with_steering.py --behaviors sycophancy --layers $(seq 0 31) --multipliers -1 0 1 --type ab --model_size "7b" -python prompting_with_steering.py --behaviors sycophancy --layers 13 --multipliers -2 -1.5 -1 -0.5 0 0.5 1 1.5 2 --type ab --model_size "7b" --system_prompt pos +./prompting_with_steering.py --behaviors sycophancy --layers $(seq 0 31) --multipliers -1 0 1 --type ab --model_size "7b" +./prompting_with_steering.py --behaviors sycophancy --layers 13 --multipliers -2 -1.5 -1 -0.5 0 0.5 1 1.5 2 --type ab --model_size "7b" --system_prompt pos # Plot PCA of constrastive activations -python plot_activations.py --behaviors sycophancy --layers $(seq 0 31) --model_size "7b" +./plot_activations.py --behaviors sycophancy --layers $(seq 0 31) --model_size "7b" # Plot results of CAA steering effect -python plot_results.py --layers $(seq 0 31) --multipliers 1 --type ab -python plot_results.py --layers $(seq 0 31) --multipliers -1 0 1 --behaviors sycophancy --type ab +./plot_results.py --layers $(seq 0 31) --multipliers 1 --type ab +./plot_results.py --layers $(seq 0 31) --multipliers -1 0 1 --behaviors sycophancy --type ab # Finetune a llama on a behavioral dataset using supervised finetuning on the A/B tokens -python finetune_llama.py --behavior sycophancy --direction pos +./finetune_llama.py --behavior sycophancy --direction pos # Plot similarites of steering vectors -python analyze_vectors.py +./analyze_vectors.py # Use GPT-4 to score open-ended responses -python scoring.py +./scoring.py ``` ## Running tests diff --git a/analyze_vectors.py b/analyze_vectors.py old mode 100644 new mode 100755 index 231d8dee2..e7ad66a51 --- a/analyze_vectors.py +++ b/analyze_vectors.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python """ Usage: python analyze_vectors.py """ diff --git a/finetune_llama.py b/finetune_llama.py old mode 100644 new mode 100755 index d83d26f67..6ae2fe951 --- a/finetune_llama.py +++ b/finetune_llama.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python """ Finetune the Llama model on the answer behavior dataset. diff --git a/generate_vectors.py b/generate_vectors.py old mode 100644 new mode 100755 index 24a33a7f9..1d3c34111 --- a/generate_vectors.py +++ b/generate_vectors.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python """ Generates steering vectors for each layer of the model by averaging the activations of all the positive and negative examples. diff --git a/normalize_vectors.py b/normalize_vectors.py old mode 100644 new mode 100755 index 8c1fe502b..12daeb241 --- a/normalize_vectors.py +++ b/normalize_vectors.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python from behaviors import ALL_BEHAVIORS, get_vector_path from utils.helpers import get_model_path import torch as t @@ -36,4 +37,4 @@ def normalize_vectors(model_size: str, is_base: bool, n_layers: int): if __name__ == "__main__": normalize_vectors("7b", True, 32) normalize_vectors("7b", False, 32) - normalize_vectors("13b", False, 36) \ No newline at end of file + normalize_vectors("13b", False, 36) diff --git a/plot_activations.py b/plot_activations.py old mode 100644 new mode 100755 index 22d4f2db3..9d0e1ef46 --- a/plot_activations.py +++ b/plot_activations.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python """ Script to plot PCA of constrastive activations diff --git a/plot_results.py b/plot_results.py old mode 100644 new mode 100755 index 143a7fc3a..10020c4be --- a/plot_results.py +++ b/plot_results.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python """ Plot results from behavioral evaluations under steering. diff --git a/process_raw_datasets.py b/process_raw_datasets.py old mode 100644 new mode 100755 index 7e5e749b3..e9d7d8fa1 --- a/process_raw_datasets.py +++ b/process_raw_datasets.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python import os import json import random diff --git a/prompting_with_steering.py b/prompting_with_steering.py old mode 100644 new mode 100755 index 229ec2f8d..9315fc578 --- a/prompting_with_steering.py +++ b/prompting_with_steering.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python """ Use CAA to steer the model diff --git a/scoring.py b/scoring.py old mode 100644 new mode 100755 index 38b0bb8bc..d15cec67a --- a/scoring.py +++ b/scoring.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python import sys import os current = os.path.dirname(os.path.realpath(__file__)) @@ -100,4 +101,4 @@ def print_avg_score_util(file, score_key="score"): if __name__ == "__main__": # scoring() - scoring(behaviors=["hallucination", "myopic-reward"], overwrite=True) \ No newline at end of file + scoring(behaviors=["hallucination", "myopic-reward"], overwrite=True)