[DRAFT] FEAT support adversarial_chat and scoring in scanner to enable automated multi-turn-orchestrators #706

Open · wants to merge 4 commits into base: main
56 changes: 48 additions & 8 deletions pyrit/cli/__main__.py
@@ -77,11 +77,11 @@ async def validate_config_and_run_async(config: Dict[str, Any], memory_labels: O
     objective_target = validate_target(config, target_key="objective_target")
     prompt_converters: list[PromptConverter] = []
     # prompt_converters = validate_converters(config)
-    scorer = None
-    # TODO: need to find a solution for single/multiple scorers and scoring_targets
-    # scorers = validate_scorers(config)
     adversarial_chat = None
-    # adversarial_chat = validate_adversarial_chat(config)
+    if "adversarial_chat" in config:
+        adversarial_chat = validate_target(config, target_key="adversarial_chat")
+    scoring_target = validate_scoring_target(config, adversarial_chat=adversarial_chat)
+    objective_scorer = validate_objective_scorer(config, scoring_target=scoring_target)

     orchestrators = []
     for scenario_config in scenarios:
@@ -91,7 +91,8 @@ async def validate_config_and_run_async(config: Dict[str, Any], memory_labels: O
                 objective_target=objective_target,
                 adversarial_chat=adversarial_chat,
                 prompt_converters=prompt_converters,
-                scorer=scorer,
+                scoring_target=scoring_target,
+                objective_scorer=objective_scorer,
             )
         )

@@ -130,7 +131,8 @@ def validate_scenario(
     objective_target: PromptTarget,
     adversarial_chat: Optional[PromptChatTarget] = None,
     prompt_converters: Optional[List[PromptConverter]] = None,
-    scorer: Optional[Scorer] = None,
+    scoring_target: Optional[PromptChatTarget] = None,
+    objective_scorer: Optional[Scorer] = None,
 ) -> Orchestrator:
     if "type" not in scenario_config:
         raise KeyError("Scenario must contain a 'type' key.")
@@ -150,7 +152,7 @@ def validate_scenario(
     # Some orchestrator arguments have their own configuration since they
     # are more complex. They are passed in as args to this function.
-    complex_arg_names = ["objective_target", "adversarial_chat", "prompt_converters", "scorer"]
+    complex_arg_names = ["objective_target", "adversarial_chat", "prompt_converters", "scoring_target", "objective_scorer"]
     for complex_arg_name in complex_arg_names:
         if complex_arg_name in scenario_args:
             raise ValueError(
@@ -167,7 +169,7 @@ def validate_scenario(
         orchestrator = orchestrator_class(**scenario_args)
     except Exception as ex:
-        raise ValueError(f"Failed to validate scenario {scenario_type}") from ex
+        raise ValueError(f"Failed to validate scenario {scenario_type}: {ex}") from ex
     return orchestrator


@@ -207,6 +209,44 @@ def validate_target(config: Dict[str, Any], target_key: str) -> PromptTarget:
     return target


+def validate_scoring_target(config: Dict[str, Any], adversarial_chat: Optional[PromptChatTarget]) -> PromptChatTarget | None:
+    if "scoring" not in config:
+        return None
+    scoring_config = config["scoring"]
+
+    # If a scoring_target has been configured, use it.
+    # Otherwise, use the adversarial_chat target for scoring.
+    if "scoring_target" in scoring_config:
+        return validate_target(scoring_config, target_key="scoring_target")
+    return adversarial_chat
+
+
+def validate_objective_scorer(config: Dict[str, Any], scoring_target: Optional[PromptChatTarget]) -> Scorer | None:
+    if "scoring" not in config:
+        return None
+    scoring_config = config["scoring"]
+    if "objective_scorer" not in scoring_config:
+        return None
+
+    scorer_args = deepcopy(scoring_config["objective_scorer"])
+
+    if "type" not in scorer_args:
+        raise KeyError("Scorer definition must contain a 'type' key.")
+
+    scorer_type = scorer_args.pop("type")
[Contributor] NIT: this pop is clean, as opposed to the del in validate_target above; can we change validate_target to match this?
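For reference, a quick illustrative sketch (not part of the diff) of the two styles this comment contrasts:

```python
scorer_args = {"type": "SelfAskRefusalScorer"}

# Style used in validate_objective_scorer: read and remove the key in one step.
scorer_type = scorer_args.pop("type")

# Style currently in validate_target: read, then delete in a second statement.
scorer_args["type"] = scorer_type  # put the key back for the second example
scorer_type = scorer_args["type"]
del scorer_args["type"]
```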


+    try:
+        scorer_module = import_module("pyrit.score")
+        scorer_class = getattr(scorer_module, scorer_type)
+    except Exception as ex:
+        raise RuntimeError(f"Failed to import scorer {scorer_type} from pyrit.score") from ex
+
+    if scoring_target and "chat_target" in inspect.signature(scorer_class.__init__).parameters:
+        scorer_args["chat_target"] = scoring_target
+
+    return scorer_class(**scorer_args)


 def main(args=None):
     parsed_args = parse_args(args)
     config_file = parsed_args.config_file
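As an aside, a minimal, self-contained sketch of the reflection check that validate_objective_scorer relies on: the scoring target is only injected when the scorer's constructor actually declares a chat_target parameter. The classes below are stand-ins, not PyRIT types:

```python
import inspect

class ChatScorer:
    """Stand-in for a scorer that needs a chat target (e.g. SelfAskRefusalScorer)."""
    def __init__(self, chat_target=None):
        self.chat_target = chat_target

class PatternScorer:
    """Stand-in for a scorer that works without a model call."""
    def __init__(self, pattern=""):
        self.pattern = pattern

def build_scorer(scorer_class, scorer_args, scoring_target):
    # Same check as in the PR: inject the target only if the constructor accepts it.
    if scoring_target and "chat_target" in inspect.signature(scorer_class.__init__).parameters:
        scorer_args["chat_target"] = scoring_target
    return scorer_class(**scorer_args)

target = object()  # stand-in for a PromptChatTarget
assert build_scorer(ChatScorer, {}, target).chat_target is target
assert build_scorer(PatternScorer, {"pattern": "sorry"}, target).pattern == "sorry"
```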
33 changes: 33 additions & 0 deletions scanner_configurations/basic_multi_turn_attack.yaml
@@ -0,0 +1,33 @@
datasets:
  - ./pyrit/datasets/seed_prompts/illegal.prompt
scenarios:
  - type: "RedTeamingOrchestrator"
  # - type: "CrescendoOrchestrator"
  # - type: "TreeOfAttacksWithPruningOrchestrator"
  #   depth: 2
objective_target:
  type: "OpenAIChatTarget" # "AzureMLChatTarget" | "HuggingFaceEndpointTarget" | ...
  # endpoint_env_variable: # in case one wants to use multiple you need to specify the env vars - not yet supported
  # api_key_env_variable:
  # any arg for targets can be listed here:
  # deployment_name_env_variable:
  # headers:
  is_azure_target: true
# converters:
#   - type: "Base64Converter"
#   - type: "LeetspeakConverter"
adversarial_chat:
  type: "OpenAIChatTarget"
  is_azure_target: true
scoring:
[Contributor] Sorry if this is too broad or has been answered already, but if in the future we want to run RTO and Crescendo each with a different scorer, how could we do that?

[Contributor Author] In different files...
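To illustrate the reply, separate config files could each pin their own scorer. The file names, the second scorer type, and its arguments are hypothetical examples:

```yaml
# rto_scan.yaml
scenarios:
  - type: "RedTeamingOrchestrator"
scoring:
  objective_scorer:
    type: "SelfAskRefusalScorer"
---
# crescendo_scan.yaml
scenarios:
  - type: "CrescendoOrchestrator"
scoring:
  objective_scorer:
    type: "SelfAskLikertScorer"
    # plus any scorer-specific args, e.g. likert_scale_path
```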

  # scoring_target is optional. If a target is required but not provided, the adversarial_chat will be used for scoring
  # scoring_target:
  #   type: "OpenAIChatTarget"
  objective_scorer:
    type: "SelfAskRefusalScorer"
memory_labels:
  operator: roakey
  operation: op_trash_panda
execution_settings:
  type: local # or "azureml"
  # parallel_nodes: 4 # how many scenarios to execute in parallel
11 changes: 3 additions & 8 deletions scanner_configurations/prompt_send.yaml
@@ -2,8 +2,6 @@ datasets:
   - ./pyrit/datasets/seed_prompts/illegal.prompt
 scenarios:
   - type: "PromptSendingOrchestrator"
-  # - type: "CrescendoOrchestrator"
-  # - type: "TreeOfAttackWithPruningOrchestrator"
 objective_target:
   type: "OpenAIChatTarget" # "AzureMLChatTarget" | "HuggingFaceEndpointTarget" | ...
   # endpoint_env_variable: # in case one wants to use multiple you need to specify the env vars - not yet supported
@@ -15,14 +13,11 @@ objective_target:
 # converters:
 #   - type: "Base64Converter"
 #   - type: "LeetspeakConverter"
-# adversarial_chat:
-#   type: "AzureMLChatTarget"
-#   ...
 # scoring:
-#   scorer: ...
+#   objective_scorer: ...
[Contributor Author] For PSO we can technically accept "scorers" (plural). Options I considered so far:

- just take 1 scorer (the current version) and perhaps allow additional_scorers, which can take a list
- take a list of scorers, and only the first one is passed to orchestrators that need a single scorer

Thoughts?

[Contributor] I like the first option a bit better. Seems like most of the time (maybe I'm wrong!) people are just using one scorer, so it could be good to optimize for that experience. And then additional_scorers would be a nice, clear place to add others for anyone experimenting with more than one.

[Contributor Author] I agree, mostly because it's the happy path and it makes it clear which scorer is the important one.
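A sketch of what the first option might look like in config; the additional_scorers key is hypothetical (not implemented in this PR), and the scorer arguments shown are illustrative:

```yaml
scoring:
  objective_scorer:          # the single "important" scorer (the happy path)
    type: "SelfAskRefusalScorer"
  additional_scorers:        # hypothetical: extras for orchestrators that accept a list
    - type: "SubStringScorer"
      substring: "I cannot help"
      category: "refusal"
```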

 memory_labels:
-  operator: romanlutz
-  operation: scanner_setup_jan25
+  operator: roakey
+  operation: op_trash_panda
 execution_settings:
   type: local # or "azureml"
   # parallel_nodes: 4 # how many scenarios to execute in parallel
28 changes: 28 additions & 0 deletions tests/unit/cli/mixed_multiple_orchestrators_args_success.yaml
@@ -0,0 +1,28 @@
datasets:
  - ./pyrit/datasets/seed_prompts/illegal.prompt
scenarios:
  - type: "PromptSendingOrchestrator"
  - type: "RedTeamingOrchestrator"
  - type: "CrescendoOrchestrator"
    max_turns: 5
    max_backtracks: 3
  - type: "CrescendoOrchestrator"
    max_turns: 10
    max_backtracks: 10
  - type: "TreeOfAttacksWithPruningOrchestrator"
    depth: 10
    width: 7
    branching_factor: 8
  - type: "TreeOfAttacksWithPruningOrchestrator"
    depth: 5
    width: 3
    branching_factor: 6
objective_target:
  type: "OpenAIChatTarget"
adversarial_chat:
  type: "OpenAIChatTarget"
scoring:
  scoring_target:
    type: "OpenAIChatTarget"
  objective_scorer:
    type: "SelfAskRefusalScorer"
13 changes: 13 additions & 0 deletions tests/unit/cli/multi_turn_crescendo_args_success.yaml
@@ -0,0 +1,13 @@
datasets:
  - ./pyrit/datasets/seed_prompts/illegal.prompt
scenarios:
  - type: "CrescendoOrchestrator"
    max_turns: 8
    max_backtracks: 4
objective_target:
  type: "OpenAIChatTarget"
adversarial_chat:
  type: "OpenAIChatTarget"
scoring:
  scoring_target:
    type: "OpenAIChatTarget"
11 changes: 11 additions & 0 deletions tests/unit/cli/multi_turn_crescendo_success.yaml
@@ -0,0 +1,11 @@
datasets:
  - ./pyrit/datasets/seed_prompts/illegal.prompt
scenarios:
  - type: "CrescendoOrchestrator"
objective_target:
  type: "OpenAIChatTarget"
adversarial_chat:
  type: "OpenAIChatTarget"
scoring:
  scoring_target:
    type: "OpenAIChatTarget"
13 changes: 13 additions & 0 deletions tests/unit/cli/multi_turn_crescendo_wrong_arg.yaml
@@ -0,0 +1,13 @@
datasets:
  - ./pyrit/datasets/seed_prompts/illegal.prompt
scenarios:
  - type: "CrescendoOrchestrator"
    max_turns: 8
    wrong_arg: "wrong"
objective_target:
  type: "OpenAIChatTarget"
adversarial_chat:
  type: "OpenAIChatTarget"
scoring:
  scoring_target:
    type: "OpenAIChatTarget"
27 changes: 27 additions & 0 deletions tests/unit/cli/multi_turn_multiple_orchestrators_args_success.yaml
@@ -0,0 +1,27 @@
datasets:
  - ./pyrit/datasets/seed_prompts/illegal.prompt
scenarios:
  - type: "RedTeamingOrchestrator"
  - type: "CrescendoOrchestrator"
    max_turns: 5
    max_backtracks: 3
  - type: "CrescendoOrchestrator"
    max_turns: 10
    max_backtracks: 10
  - type: "TreeOfAttacksWithPruningOrchestrator"
    depth: 10
    width: 7
    branching_factor: 8
  - type: "TreeOfAttacksWithPruningOrchestrator"
    depth: 5
    width: 3
    branching_factor: 6
objective_target:
  type: "OpenAIChatTarget"
adversarial_chat:
  type: "OpenAIChatTarget"
scoring:
  scoring_target:
    type: "OpenAIChatTarget"
  objective_scorer:
    type: "SelfAskRefusalScorer"
11 changes: 11 additions & 0 deletions tests/unit/cli/multi_turn_rto_args_success.yaml
@@ -0,0 +1,11 @@
datasets:
  - ./pyrit/datasets/seed_prompts/illegal.prompt
scenarios:
  - type: "RedTeamingOrchestrator"
objective_target:
  type: "OpenAIChatTarget"
adversarial_chat:
  type: "OpenAIChatTarget"
scoring:
  objective_scorer:
    type: "SelfAskRefusalScorer"
11 changes: 11 additions & 0 deletions tests/unit/cli/multi_turn_rto_success.yaml
@@ -0,0 +1,11 @@
datasets:
  - ./pyrit/datasets/seed_prompts/illegal.prompt
scenarios:
  - type: "RedTeamingOrchestrator"
objective_target:
  type: "OpenAIChatTarget"
adversarial_chat:
  type: "OpenAIChatTarget"
scoring:
  objective_scorer:
    type: "SelfAskRefusalScorer"
12 changes: 12 additions & 0 deletions tests/unit/cli/multi_turn_rto_wrong_arg.yaml
@@ -0,0 +1,12 @@
datasets:
  - ./pyrit/datasets/seed_prompts/illegal.prompt
scenarios:
  - type: "RedTeamingOrchestrator"
    wrong_arg: "wrong"
objective_target:
  type: "OpenAIChatTarget"
adversarial_chat:
  type: "OpenAIChatTarget"
scoring:
  objective_scorer:
    type: "SelfAskRefusalScorer"
14 changes: 14 additions & 0 deletions tests/unit/cli/multi_turn_tap_args_success.yaml
@@ -0,0 +1,14 @@
datasets:
  - ./pyrit/datasets/seed_prompts/illegal.prompt
scenarios:
  - type: "TreeOfAttacksWithPruningOrchestrator"
    depth: 10
    width: 7
    branching_factor: 8
objective_target:
  type: "OpenAIChatTarget"
adversarial_chat:
  type: "OpenAIChatTarget"
scoring:
  scoring_target:
    type: "OpenAIChatTarget"
11 changes: 11 additions & 0 deletions tests/unit/cli/multi_turn_tap_success.yaml
@@ -0,0 +1,11 @@
datasets:
  - ./pyrit/datasets/seed_prompts/illegal.prompt
scenarios:
  - type: "TreeOfAttacksWithPruningOrchestrator"
objective_target:
  type: "OpenAIChatTarget"
adversarial_chat:
  type: "OpenAIChatTarget"
scoring:
  scoring_target:
    type: "OpenAIChatTarget"
14 changes: 14 additions & 0 deletions tests/unit/cli/multi_turn_tap_wrong_arg.yaml
@@ -0,0 +1,14 @@
datasets:
  - ./pyrit/datasets/seed_prompts/illegal.prompt
scenarios:
  - type: "TreeOfAttacksWithPruningOrchestrator"
    depth: 10
    width: 7
    wrong_arg: "wrong"
objective_target:
  type: "OpenAIChatTarget"
adversarial_chat:
  type: "OpenAIChatTarget"
scoring:
  scoring_target:
    type: "OpenAIChatTarget"
32 changes: 32 additions & 0 deletions tests/unit/cli/multi_turn_template.yaml
@@ -0,0 +1,32 @@
datasets:
  - ./pyrit/datasets/seed_prompts/illegal.prompt
scenarios:
  - type: "RedTeamingOrchestrator"
  # - type: "CrescendoOrchestrator"
  # - type: "TreeOfAttacksWithPruningOrchestrator"
  #   depth: 2
objective_target:
  type: "OpenAIChatTarget" # "AzureMLChatTarget" | "HuggingFaceEndpointTarget" | ...
  # endpoint_env_variable: # in case one wants to use multiple you need to specify the env vars - not yet supported
  # api_key_env_variable:
  # any arg for targets can be listed here:
  # deployment_name_env_variable:
  # headers:
# converters:
#   - type: "Base64Converter"
#   - type: "LeetspeakConverter"
adversarial_chat:
  type: "OpenAIChatTarget"
  is_azure_target: true
scoring:
  # scoring_target is optional. If a target is required but not provided, the adversarial_chat will be used for scoring
  # scoring_target:
  #   type: "OpenAIChatTarget"
  objective_scorer:
    type: "SelfAskRefusalScorer"
memory_labels:
  operator: roakey
  operation: op_trash_panda
execution_settings:
  type: local # or "azureml"
  # parallel_nodes: 4 # how many scenarios to execute in parallel
2 changes: 1 addition & 1 deletion tests/unit/cli/prompt_send_no_objective_target.yaml
@@ -1,4 +1,4 @@
 datasets:
   - ./pyrit/datasets/seed_prompts/illegal.prompt
 scenarios:
-  - type: send_prompts
+  - type: "PromptSendingOrchestrator"
2 changes: 1 addition & 1 deletion tests/unit/cli/prompt_send_no_objective_target_type.yaml
@@ -1,5 +1,5 @@
 datasets:
   - ./pyrit/datasets/seed_prompts/illegal.prompt
 scenarios:
-  - type: send_prompts
+  - type: "PromptSendingOrchestrator"
 objective_target: