Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT: Supporting the new Tongue Tied Gandalf levels #356

Closed
wants to merge 20 commits into from
Closed
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
a241984
Adding support for first two levels of Gandalf Tongue Tied
Sep 3, 2024
e3deeb2
Merge branch 'main' into tongue-tied/two-levels
Sep 3, 2024
0e1f325
adding tests for tongue tied scorer and removing duplicate target
Sep 3, 2024
ad84fac
removing stray breakpoint
Sep 3, 2024
f236ba8
FEAT Add SQL Entra Auth for Azure SQL Server (#330)
elgertam Sep 4, 2024
3d61482
[MAINT] Fix typos in OllamaChatTarget (#357)
riedgar-ms Sep 4, 2024
b4121e7
Reuse original Gandalf target and use built-in scorer for Tongue Tied…
s-zanella Sep 5, 2024
8b575b2
Fix typo
s-zanella Sep 5, 2024
ed68c14
Use correct target for the attacker. Use method `_create_normalizer_r…
s-zanella Sep 5, 2024
89ac0af
Remove unused `threshold` parameter in `TrueFalseInverterScorer`
s-zanella Sep 5, 2024
10b5e30
Tongue Tied Gandalf notebook (WIP): ad-hoc orchestrator, built-in sco…
s-zanella Sep 5, 2024
7518705
resolve conflict from origin
Sep 5, 2024
87c1eef
Merge branch 'main' into tongue-tied/two-levels
s-zanella Sep 5, 2024
7bbe9ed
Prompt that solves level 2
s-zanella Sep 5, 2024
9b01377
gandalf_tongue_tied_scorer -> GandalfTongueTiedScorer
s-zanella Sep 5, 2024
ba9a36a
Line endings
s-zanella Sep 5, 2024
87fcb31
Add pct file for notebook. Prompt that solves level 3
s-zanella Sep 6, 2024
54267c1
Merge branch 'main' into tongue-tied/two-levels
s-zanella Sep 6, 2024
1611538
Merge branch 'main' into tongue-tied/two-levels
s-zanella Sep 11, 2024
bc7fa49
Simplify answer parsing; fix mypy type errors; trim whitespace
s-zanella Sep 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added assets/gandalf-pipeline.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added assets/gandalf-tongue-tied-level-1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
479 changes: 479 additions & 0 deletions doc/code/targets/gandalf_tongue_tied.ipynb

Large diffs are not rendered by default.

20 changes: 3 additions & 17 deletions pyrit/orchestrator/pair_orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,16 +142,8 @@ async def _get_attacker_response_and_store(
)
# Send a new request to the attacker
attacker_response = await self._prompt_normalizer.send_prompt_async(
normalizer_request=NormalizerRequest(
request_pieces=[
NormalizerRequestPiece(
request_converters=self._prompt_converters,
prompt_value=target_response,
prompt_data_type="text",
)
]
),
target=self._prompt_target,
normalizer_request=self._create_normalizer_request(prompt_text=target_response),
target=self._adversarial_target,
conversation_id=self._last_attacker_conversation_id,
labels=self._global_memory_labels,
orchestrator_identifier=self.get_identifier(),
Expand All @@ -173,13 +165,7 @@ async def _get_target_response_and_store(
"""
curr_conversation_id = conversation_id or str(uuid.uuid4())
target_response = await self._prompt_normalizer.send_prompt_async(
normalizer_request=NormalizerRequest(
request_pieces=[
NormalizerRequestPiece(
request_converters=self._prompt_converters, prompt_value=text, prompt_data_type="text"
)
]
),
normalizer_request=self._create_normalizer_request(prompt_text=text),
target=self._prompt_target,
conversation_id=curr_conversation_id,
labels=self._global_memory_labels,
Expand Down
2 changes: 1 addition & 1 deletion pyrit/orchestrator/prompt_sending_orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
class PromptSendingOrchestrator(Orchestrator):
"""
This orchestrator takes a set of prompts, converts them using the list of PromptConverters,
sends them to a target, and scores the resonses with scorers (if provided).
sends them to a target, and scores the responses with scorers (if provided).
"""

def __init__(
Expand Down
26 changes: 5 additions & 21 deletions pyrit/prompt_target/gandalf_target.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ class GandalfLevel(enum.Enum):
LEVEL_8 = "gandalf-the-white"
LEVEL_9 = "adventure-1"
LEVEL_10 = "adventure-2"
TONGUE_TIED_LEVEL_1 = "adventure-8"
TONGUE_TIED_LEVEL_2 = "adventure-9"
TONGUE_TIED_LEVEL_3 = "adventure-10"
TONGUE_TIED_LEVEL_4 = "adventure-11"
TONGUE_TIED_LEVEL_5 = "adventure-12"


class GandalfTarget(PromptTarget):
Expand Down Expand Up @@ -60,27 +65,6 @@ def _validate_request(self, *, prompt_request: PromptRequestResponse) -> None:
if prompt_request.request_pieces[0].converted_value_data_type != "text":
raise ValueError("This target only supports text prompt input.")

async def check_password(self, password: str) -> bool:
"""
Checks if the password is correct

True means the password is correct, False means it is not
"""
payload: dict[str, object] = {
"defender": self._defender,
"password": password,
}

resp = await net_utility.make_request_and_raise_if_error_async(
endpoint_uri=self._endpoint, method="POST", request_body=payload, post_type="data"
)

if not resp.text:
raise ValueError("The chat returned an empty response.")

json_response = resp.json()
return json_response["success"]

async def _complete_text_async(self, text: str) -> str:
payload: dict[str, object] = {
"defender": self._defender,
Expand Down
87 changes: 44 additions & 43 deletions pyrit/score/__init__.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,44 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from pyrit.models import Score, ScoreType
from pyrit.score.scorer import Scorer

from pyrit.score.azure_content_filter_scorer import AzureContentFilterScorer
from pyrit.score.float_scale_threshold_scorer import FloatScaleThresholdScorer
from pyrit.score.gandalf_scorer import GandalfScorer
from pyrit.score.human_in_the_loop_scorer import HumanInTheLoopScorer
from pyrit.score.markdown_injection import MarkdownInjectionScorer
from pyrit.score.prompt_shield_scorer import PromptShieldScorer
from pyrit.score.self_ask_category_scorer import SelfAskCategoryScorer, ContentClassifierPaths
from pyrit.score.self_ask_likert_scorer import SelfAskLikertScorer, LikertScalePaths
from pyrit.score.self_ask_scale_scorer import SelfAskScaleScorer, ScalePaths
from pyrit.score.self_ask_true_false_scorer import SelfAskTrueFalseScorer, TrueFalseQuestionPaths
from pyrit.score.substring_scorer import SubStringScorer
from pyrit.score.true_false_inverter_scorer import TrueFalseInverterScorer


__all__ = [
"AzureContentFilterScorer",
"ContentClassifierPaths",
"FloatScaleThresholdScorer",
"GandalfScorer",
"HumanInTheLoopScorer",
"LikertScalePaths",
"MarkdownInjectionScorer",
"MetaScorerQuestionPaths",
"ObjectiveQuestionPaths",
"PromptShieldScorer",
"ScalePaths",
"Score",
"ScoreType",
"Scorer",
"SelfAskCategoryScorer",
"SelfAskLikertScorer",
"SelfAskScaleScorer",
"SelfAskTrueFalseScorer",
"SubStringScorer",
"TrueFalseInverterScorer",
"TrueFalseQuestionPaths",
]
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from pyrit.models import Score, ScoreType
from pyrit.score.scorer import Scorer

from pyrit.score.azure_content_filter_scorer import AzureContentFilterScorer
from pyrit.score.float_scale_threshold_scorer import FloatScaleThresholdScorer
from pyrit.score.gandalf_scorer import GandalfScorer, GandalfTongueTiedScorer
from pyrit.score.human_in_the_loop_scorer import HumanInTheLoopScorer
from pyrit.score.markdown_injection import MarkdownInjectionScorer
from pyrit.score.prompt_shield_scorer import PromptShieldScorer
from pyrit.score.self_ask_category_scorer import SelfAskCategoryScorer, ContentClassifierPaths
from pyrit.score.self_ask_likert_scorer import SelfAskLikertScorer, LikertScalePaths
from pyrit.score.self_ask_scale_scorer import SelfAskScaleScorer, ScalePaths
from pyrit.score.self_ask_true_false_scorer import SelfAskTrueFalseScorer, TrueFalseQuestionPaths
from pyrit.score.substring_scorer import SubStringScorer
from pyrit.score.true_false_inverter_scorer import TrueFalseInverterScorer


__all__ = [
"AzureContentFilterScorer",
"ContentClassifierPaths",
"FloatScaleThresholdScorer",
"GandalfScorer",
"GandalfTongueTiedScorer",
"HumanInTheLoopScorer",
"LikertScalePaths",
"MarkdownInjectionScorer",
"MetaScorerQuestionPaths",
"ObjectiveQuestionPaths",
"PromptShieldScorer",
"ScalePaths",
"Score",
"ScoreType",
"Scorer",
"SelfAskCategoryScorer",
"SelfAskLikertScorer",
"SelfAskScaleScorer",
"SelfAskTrueFalseScorer",
"SubStringScorer",
"TrueFalseInverterScorer",
"TrueFalseQuestionPaths",
]
Loading