FEAT: Added Red Team Social Bias dataset #714

Open

wants to merge 11 commits into main
1 change: 1 addition & 0 deletions doc/_toc.yml
@@ -41,6 +41,7 @@ chapters:
- file: code/orchestrators/benchmark_orchestrator
- file: code/orchestrators/flip_orchestrator
- file: code/orchestrators/fuzzing_jailbreak_templates
- file: code/orchestrators/red_team_social_bias_prompt_orchestrator
- file: code/orchestrators/HITL_Scoring_Orchestrator
- file: code/orchestrators/librAI_do_not_answer
- file: code/orchestrators/many_shot_jailbreak
2 changes: 2 additions & 0 deletions pyrit/datasets/__init__.py
@@ -14,6 +14,7 @@
fetch_many_shot_jailbreaking_dataset,
fetch_pku_safe_rlhf_dataset,
    fetch_red_team_social_bias_prompts_dataset,
    fetch_seclists_bias_testing_dataset,
fetch_tdc23_redteaming_dataset,
fetch_wmdp_dataset,
fetch_xstest_dataset,
@@ -31,6 +32,7 @@
"fetch_adv_bench_dataset",
"fetch_wmdp_dataset",
"fetch_forbidden_questions_dataset",
"fetch_red_team_social_bias_prompts_dataset",
"fetch_llm_latent_adversarial_training_harmful_dataset",
"fetch_tdc23_redteaming_dataset",
"fetch_librAI_do_not_answer_dataset",
96 changes: 96 additions & 0 deletions pyrit/datasets/fetch_example_datasets.py
@@ -494,6 +494,102 @@ def fetch_decoding_trust_stereotypes_dataset(
return seed_prompt_dataset


def fetch_red_team_social_bias_prompts_dataset() -> SeedPromptDataset:
"""
Fetch Red Team Social Bias Prompts and create a SeedPromptDataset.

Relevant Columns:
- "organization"
- "categorization"
- "prompt_type"
- "prompt_instruction"
- "prompt"
- "explanation"
- "ai_response"

Returns:
SeedPromptDataset: A SeedPromptDataset containing the examples.

    Note:
        For more information and access to the original dataset and related materials, visit:
        https://svannie678.github.io/svannie678-red_team_repo_social_bias. This dataset
        contains aggregated and unified existing red-teaming prompts designed to identify
        stereotypes, discrimination, hate speech, and other representation harms in
        text-based Large Language Models (LLMs). Written by Simone van Taylor.
"""
data = load_dataset("svannie678/red_team_repo_social_bias_prompts", "default")

# Define common metadata that will be used across all seed prompts
common_metadata = {
"dataset_name": "svannie678/red_team_repo_social_bias_prompts",
"authors": ["Simone van Taylor"],
"description": """This dataset contains aggregated and unified existing
red-teaming prompts designed to identify stereotypes,
discrimination, hate speech, and other representation
harms in text-based Large Language Models (LLMs).""",
"source": "https://huggingface.co/datasets/svannie678/red_team_repo_social_bias_prompts"
}

# Initialize an empty list to store the seed prompts
seed_prompts = []

for group_id, item in enumerate(data["train"]):

        # The dataset contains three prompt types: "Single Prompt", "Multi Turn",
        # and "Multi Turn, Single Prompt". Only "Multi Turn" entries are split into
        # per-turn prompts below; "Multi Turn, Single Prompt" entries are difficult
        # to parse reliably, so they are handled as single prompts.
prompt_type = item.get("prompt_type")

if prompt_type is None:
continue

# Dictionary of metadata for the current prompt
prompt_metadata = {
**common_metadata,
"harm_categories": item.get("categorization", []),
"metadata": {
"organization": item.get("organization", ""),
"prompt_type": prompt_type,
"prompt_instruction": item.get("prompt_instruction", ""),
"explanation": item.get("explanation", ""),
"ai_response": item.get("ai_response", ""),
},
}

if prompt_type in ["Multi Turn"]:
try:
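                # Each parsed turn is expected to be a dict like
                # {"role": "user", "body": "..."} (an assumption based on how
                # this field is parsed below)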
                # Parse the serialized conversation and keep only the user turns,
                # dropping "assistant" and "system" turns
                turns = ast.literal_eval(item.get("prompt", ""))
                user_prompts = [turn["body"] for turn in turns if turn["role"].startswith("user")]

for i, user_prompt in enumerate(user_prompts):
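                    # A shared prompt_group_id ties the turns of one conversation
                    # together; sequence preserves their order within it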
seed_prompts.append(
SeedPrompt(
value=user_prompt,
data_type="text",
prompt_group_id=group_id,
sequence=i,
**prompt_metadata
)
)
            except Exception as e:
                print(f"Error processing multi-turn prompt (group {group_id}): {e}")
else:
            # Strip leftover instruction-template markers ("### Instruction:",
            # "### Response:") from non-multi-turn prompts
            cleaned_value = item.get("prompt", "").replace("### Response:", "").replace("### Instruction:", "").strip()

seed_prompts.append(
SeedPrompt(
value=cleaned_value,
data_type="text",
**prompt_metadata
)
)

seed_prompt_dataset = SeedPromptDataset(prompts=seed_prompts)
return seed_prompt_dataset


def fetch_adv_bench_dataset(
source: str = (
"https://raw.githubusercontent.com/llm-attacks/llm-attacks/main/data/advbench/" "harmful_behaviors.csv"
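A minimal usage sketch of the new fetch function, assuming this branch is installed. The `prompts`, `value`, and `metadata` attribute names are assumptions mirroring the `SeedPromptDataset` and `SeedPrompt` constructor keywords used in this PR:

```python
from pyrit.datasets import fetch_red_team_social_bias_prompts_dataset

# Fetch the dataset; multi-turn conversations are split into per-turn
# user prompts that share a prompt_group_id
dataset = fetch_red_team_social_bias_prompts_dataset()
print(f"Fetched {len(dataset.prompts)} seed prompts")

# Inspect a few prompts and their per-item metadata (attribute names
# assumed to mirror the constructor keywords above)
for prompt in dataset.prompts[:3]:
    print(prompt.value)
    print(prompt.metadata)
```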