diff --git a/pyrit/datasets/__init__.py b/pyrit/datasets/__init__.py
index d2a41cb1e..0f82269d0 100644
--- a/pyrit/datasets/__init__.py
+++ b/pyrit/datasets/__init__.py
@@ -14,6 +14,7 @@
     fetch_many_shot_jailbreaking_dataset,
     fetch_pku_safe_rlhf_dataset,
     fetch_seclists_bias_testing_dataset,
+    fetch_red_team_social_bias_prompts_dataset,
     fetch_tdc23_redteaming_dataset,
     fetch_wmdp_dataset,
     fetch_xstest_dataset,
@@ -31,6 +32,7 @@
     "fetch_adv_bench_dataset",
     "fetch_wmdp_dataset",
     "fetch_forbidden_questions_dataset",
+    "fetch_red_team_social_bias_prompts_dataset",
     "fetch_llm_latent_adversarial_training_harmful_dataset",
     "fetch_tdc23_redteaming_dataset",
     "fetch_librAI_do_not_answer_dataset",
diff --git a/pyrit/datasets/fetch_example_datasets.py b/pyrit/datasets/fetch_example_datasets.py
index 19780862b..943358076 100644
--- a/pyrit/datasets/fetch_example_datasets.py
+++ b/pyrit/datasets/fetch_example_datasets.py
@@ -494,6 +494,102 @@ def fetch_decoding_trust_stereotypes_dataset(
     return seed_prompt_dataset
 
 
+def fetch_red_team_social_bias_prompts_dataset() -> SeedPromptDataset:
+    """
+    Fetch Red Team Social Bias Prompts and create a SeedPromptDataset.
+
+    Relevant Columns:
+    - "organization"
+    - "categorization"
+    - "prompt_type"
+    - "prompt_instruction"
+    - "prompt"
+    - "explanation"
+    - "ai_response"
+
+    Returns:
+        SeedPromptDataset: A SeedPromptDataset containing the examples.
+
+    Note:
+        For more information and access to the original dataset and related materials, visit:
+        https://svannie678.github.io/svannie678-red_team_repo_social_bias. This dataset
+        contains aggregated and unified existing red-teaming prompts designed to identify
+        stereotypes, discrimination, hate speech, and other representation harms in text-based
+        Large Language Models (LLMs).
+        Written by Simone Van Taylor.
+ """ + data = load_dataset("svannie678/red_team_repo_social_bias_prompts", "default") + + # Define common metadata that will be used across all seed prompts + common_metadata = { + "dataset_name": "svannie678/red_team_repo_social_bias_prompts", + "authors": ["Simone van Taylor"], + "description": """This dataset contains aggregated and unified existing + red-teaming prompts designed to identify stereotypes, + discrimination, hate speech, and other representation + harms in text-based Large Language Models (LLMs).""", + "source": "https://huggingface.co/datasets/svannie678/red_team_repo_social_bias_prompts" + } + + # Initialize an empty list to store the seed prompts + seed_prompts = [] + + for group_id, item in enumerate(data["train"]): + + # This dataset contains 3 prompt types: "Single Prompt", "Multi Turn" and "Multi Turn, Single Prompt" + # We're only checking for "Multi Turn" because the "Multi Turn, Single Prompt" + # prompt types inside this dataset will be difficult to parse + prompt_type = item.get("prompt_type") + + if prompt_type is None: + continue + + # Dictionary of metadata for the current prompt + prompt_metadata = { + **common_metadata, + "harm_categories": [item["categorization"]] if not isinstance(item.get("categorization"), list) else item.get("categorization", []), + "metadata": { + "organization": item.get("organization", ""), + "prompt_type": prompt_type, + "prompt_instruction": item.get("prompt_instruction", ""), + "explanation": item.get("explanation", ""), + "ai_response": item.get("ai_response", ""), + }, + } + + if prompt_type in ["Multi Turn"]: + try: + # Safely parse the user prompts, remove the unwanted ones such as "assistant" and "system" + user_prompts = [turn["body"] for turn in ast.literal_eval(item.get("prompt", "")) if turn["role"].startswith("user")] + + for i, user_prompt in enumerate(user_prompts): + seed_prompts.append( + SeedPrompt( + value=user_prompt, + data_type="text", + prompt_group_id=group_id, + sequence=i, + **prompt_metadata + ) + ) + except Exception as e: + print(f"Error processing Multi-Turn Prompt: {e}") + else: + # Clean up non-"Multi Turn" prompts that contain unwanted lines of text + cleaned_value = item.get("prompt", "").replace("### Response:", "").replace("### Instruction:", "").strip() + + seed_prompts.append( + SeedPrompt( + value=cleaned_value, + data_type="text", + **prompt_metadata + ) + ) + + seed_prompt_dataset = SeedPromptDataset(prompts=seed_prompts) + return seed_prompt_dataset + + def fetch_adv_bench_dataset( source: str = ( "https://raw.githubusercontent.com/llm-attacks/llm-attacks/main/data/advbench/" "harmful_behaviors.csv"