FEAT: Added Red Team Social Bias dataset #714

Open

wants to merge 11 commits into main
1 change: 1 addition & 0 deletions doc/_toc.yml
@@ -41,6 +41,7 @@ chapters:
- file: code/orchestrators/benchmark_orchestrator
- file: code/orchestrators/flip_orchestrator
- file: code/orchestrators/fuzzing_jailbreak_templates
- file: code/orchestrators/red_team_social_bias_prompt_orchestrator
- file: code/orchestrators/HITL_Scoring_Orchestrator
- file: code/orchestrators/librAI_do_not_answer
- file: code/orchestrators/many_shot_jailbreak
2 changes: 2 additions & 0 deletions pyrit/datasets/__init__.py
@@ -14,6 +14,7 @@
fetch_many_shot_jailbreaking_dataset,
fetch_pku_safe_rlhf_dataset,
    fetch_red_team_social_bias_prompts_dataset,
    fetch_seclists_bias_testing_dataset,
fetch_tdc23_redteaming_dataset,
fetch_wmdp_dataset,
fetch_xstest_dataset,
@@ -31,6 +32,7 @@
"fetch_adv_bench_dataset",
"fetch_wmdp_dataset",
"fetch_forbidden_questions_dataset",
"fetch_red_team_social_bias_prompts_dataset",
"fetch_llm_latent_adversarial_training_harmful_dataset",
"fetch_tdc23_redteaming_dataset",
"fetch_librAI_do_not_answer_dataset",
96 changes: 96 additions & 0 deletions pyrit/datasets/fetch_example_datasets.py
@@ -494,6 +494,102 @@ def fetch_decoding_trust_stereotypes_dataset(
return seed_prompt_dataset


def fetch_red_team_social_bias_prompts_dataset() -> SeedPromptDataset:
"""
Fetch Red Team Social Bias Prompts and create a SeedPromptDataset.

Relevant Columns:
- "organization"
- "categorization"
- "prompt_type"
- "prompt_instruction"
- "prompt"
- "explanation"
- "ai_response"

Returns:
SeedPromptDataset: A SeedPromptDataset containing the examples.

    Note:
        For more information and access to the original dataset and related materials, visit:
        https://svannie678.github.io/svannie678-red_team_repo_social_bias. This dataset
        contains aggregated and unified existing red-teaming prompts designed to identify
        stereotypes, discrimination, hate speech, and other representation harms in
        text-based Large Language Models (LLMs). Written by Simone van Taylor.
"""
data = load_dataset("svannie678/red_team_repo_social_bias_prompts", "default")

# Define common metadata that will be used across all seed prompts
common_metadata = {
"dataset_name": "svannie678/red_team_repo_social_bias_prompts",
"authors": ["Simone van Taylor"],
"description": """This dataset contains aggregated and unified existing
red-teaming prompts designed to identify stereotypes,
discrimination, hate speech, and other representation
harms in text-based Large Language Models (LLMs).""",
"source": "https://huggingface.co/datasets/svannie678/red_team_repo_social_bias_prompts"
}

# Initialize an empty list to store the seed prompts
seed_prompts = []

for group_id, item in enumerate(data["train"]):

        # The dataset contains three prompt types: "Single Prompt", "Multi Turn",
        # and "Multi Turn, Single Prompt". Only "Multi Turn" entries are split into
        # per-turn prompts below; "Multi Turn, Single Prompt" entries are difficult
        # to parse reliably, so they are handled as single prompts.
prompt_type = item.get("prompt_type")

if prompt_type is None:
continue

# Dictionary of metadata for the current prompt
prompt_metadata = {
**common_metadata,
"harm_categories": item.get("categorization", []),
"metadata": {
"organization": item.get("organization", ""),
"prompt_type": prompt_type,
"prompt_instruction": item.get("prompt_instruction", ""),
"explanation": item.get("explanation", ""),
"ai_response": item.get("ai_response", ""),
},
}

if prompt_type in ["Multi Turn"]:
try:
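                # Each parsed turn is expected to be a dict like
                # {"role": "user", "body": "..."} (an assumption based on how
                # this field is parsed below)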
                # Parse the serialized conversation and keep only the user turns,
                # dropping "assistant" and "system" turns
                turns = ast.literal_eval(item.get("prompt", ""))
                user_prompts = [turn["body"] for turn in turns if turn["role"].startswith("user")]

for i, user_prompt in enumerate(user_prompts):
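                    # A shared prompt_group_id ties the turns of one conversation
                    # together; sequence preserves their order within it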
seed_prompts.append(
SeedPrompt(
value=user_prompt,
data_type="text",
prompt_group_id=group_id,
sequence=i,
**prompt_metadata
)
)
            except Exception as e:
                print(f"Error processing multi-turn prompt (group {group_id}): {e}")
else:
            # Strip leftover instruction-template markers ("### Instruction:",
            # "### Response:") from non-multi-turn prompts
            cleaned_value = item.get("prompt", "").replace("### Response:", "").replace("### Instruction:", "").strip()

seed_prompts.append(
SeedPrompt(
value=cleaned_value,
data_type="text",
**prompt_metadata
)
)

seed_prompt_dataset = SeedPromptDataset(prompts=seed_prompts)
return seed_prompt_dataset


def fetch_adv_bench_dataset(
source: str = (
"https://raw.githubusercontent.com/llm-attacks/llm-attacks/main/data/advbench/" "harmful_behaviors.csv"
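A minimal usage sketch of the new fetch function, assuming this branch is installed. The `prompts`, `value`, and `metadata` attribute names are assumptions mirroring the `SeedPromptDataset` and `SeedPrompt` constructor keywords used in this PR:

```python
from pyrit.datasets import fetch_red_team_social_bias_prompts_dataset

# Fetch the dataset; multi-turn conversations are split into per-turn
# user prompts that share a prompt_group_id
dataset = fetch_red_team_social_bias_prompts_dataset()
print(f"Fetched {len(dataset.prompts)} seed prompts")

# Inspect a few prompts and their per-item metadata (attribute names
# assumed to mirror the constructor keywords above)
for prompt in dataset.prompts[:3]:
    print(prompt.value)
    print(prompt.metadata)
```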