Skip to content

Commit

Permalink
Update server to match sdk interface for modular checks
Browse files Browse the repository at this point in the history
  • Loading branch information
ristomcgehee committed Jan 22, 2024
1 parent 3c12155 commit 8f89671
Show file tree
Hide file tree
Showing 9 changed files with 337 additions and 226 deletions.
18 changes: 2 additions & 16 deletions javascript-sdk/src/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -44,25 +44,15 @@ export default class RebuffApi implements Rebuff {

async detectInjection({
userInput = "",
maxHeuristicScore = 0.75,
maxVectorScore = 0.9,
maxModelScore = 0.9,
runHeuristicCheck = true,
runVectorCheck = true,
runLanguageModelCheck = true,
tacticOverrides = [],
}: DetectRequest): Promise<DetectResponse> {
if (userInput === null) {
throw new RebuffError("userInput is required");
}
const requestData: DetectRequest = {
userInput: "",
userInputBase64: encodeString(userInput),
runHeuristicCheck: runHeuristicCheck,
runVectorCheck: runVectorCheck,
runLanguageModelCheck: runLanguageModelCheck,
maxVectorScore,
maxModelScore,
maxHeuristicScore,
tacticOverrides,
};

const response = await fetch(`${this.apiUrl}/api/detect`, {
Expand All @@ -76,10 +66,6 @@ export default class RebuffApi implements Rebuff {
if (!response.ok) {
throw new RebuffError((responseData as any)?.message);
}
responseData.injectionDetected =
responseData.heuristicScore > maxHeuristicScore ||
responseData.modelScore > maxModelScore ||
responseData.vectorScore.topScore > maxVectorScore;
return responseData;
}

Expand Down
7 changes: 4 additions & 3 deletions python-sdk/rebuff/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
from rebuff._version import __version__

from .rebuff import (
ApiFailureResponse,
DetectApiRequest,
DetectApiSuccessResponse,
DetectResponse,
Rebuff,
TacticName,
TacticOverride,
TacticResult,
)

from .sdk import RebuffSdk, RebuffDetectionResponse
221 changes: 155 additions & 66 deletions python-sdk/rebuff/rebuff.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,143 @@
from enum import Enum
import secrets
from typing import Any, Dict, Optional, Tuple, Union
from typing import List, Optional, Dict, Any, Union, Tuple

import requests
from pydantic import BaseModel


class DetectApiRequest(BaseModel):
userInput: str
userInputBase64: Optional[str] = None
runHeuristicCheck: bool
runVectorCheck: bool
runLanguageModelCheck: bool
maxHeuristicScore: float
maxModelScore: float
maxVectorScore: float
def to_camel(string: str) -> str:
string_split = string.split("_")
return string_split[0] + "".join(word.capitalize() for word in string_split[1:])


class DetectApiSuccessResponse(BaseModel):
heuristicScore: float
modelScore: float
vectorScore: Dict[str, float]
runHeuristicCheck: bool
runVectorCheck: bool
runLanguageModelCheck: bool
maxHeuristicScore: float
maxModelScore: float
maxVectorScore: float
injectionDetected: bool
class RebuffBaseModel(BaseModel):
class Config:
alias_generator = to_camel
populate_by_name = True


class ApiFailureResponse(BaseModel):
error: str
message: str
class TacticName(str, Enum):
HEURISTIC = "heuristic"
"""
A series of heuristics are used to determine whether the input is prompt injection.
"""

LANGUAGE_MODEL = "language_model"
"""
A language model is asked if the input appears to be prompt injection.
"""

VECTOR_DB = "vector_db"
"""
A vector database of known prompt injection attacks is queried for similarity.
"""


class TacticOverride(RebuffBaseModel):
"""
Override settings for a specific tactic.
"""

name: TacticName
"""
The name of the tactic to override.
"""

threshold: Optional[float] = None
"""
The threshold to use for this tactic. If the score is above this threshold, the tactic will be considered detected.
If not specified, the default threshold for the tactic will be used.
"""

run: Optional[bool] = True
"""
Whether to run this tactic. Defaults to true if not specified.
"""


class DetectRequest(RebuffBaseModel):
"""
Request to detect prompt injection.
"""

user_input: str
"""
The user input to check for prompt injection.
"""

user_input_base64: Optional[str] = None
"""
The base64-encoded user input. If this is specified, the user input will be ignored.
"""

tactic_overrides: Optional[List[TacticOverride]] = None
"""
Any tactics to change behavior for. If any tactic is not specified, the default threshold for that tactic will be used.
"""


class TacticResult(RebuffBaseModel):
"""
Result of a tactic execution.
"""

name: str
"""
The name of the tactic.
"""

score: float
"""
The score for the tactic. This is a number between 0 and 1. The closer to 1, the more likely that this is a prompt injection attempt.
"""

detected: bool
"""
Whether this tactic evaluated the input as a prompt injection attempt.
"""

threshold: float
"""
The threshold used for this tactic. If the score is above this threshold, the tactic will be considered detected.
"""

additional_fields: Dict[str, Any]
"""
Some tactics return additional fields:
* "vector_db":
- "countOverMaxVectorScore" (int): The number of different vectors whose similarity score is above the
threshold.
"""


class DetectResponse(RebuffBaseModel):
"""
Response from a prompt injection detection request.
"""

injection_detected: bool
"""
Whether prompt injection was detected.
"""

tactic_results: List[TacticResult]
"""
The result for each tactic that was executed.
"""


class ApiFailureResponse(Exception):
def __init__(self, error: str, message: str):
super().__init__(f"Error: {error}, Message: {message}")
self.error = error
self.message = message


class Rebuff:
def __init__(self, api_token: str, api_url: str = "https://playground.rebuff.ai"):
def __init__(
self, api_token: str, api_url: str = "https://www.rebuff.ai/playground"
):
self.api_token = api_token
self.api_url = api_url
self._headers = {
Expand All @@ -46,63 +148,50 @@ def __init__(self, api_token: str, api_url: str = "https://playground.rebuff.ai"
def detect_injection(
self,
user_input: str,
max_heuristic_score: float = 0.75,
max_vector_score: float = 0.90,
max_model_score: float = 0.9,
check_heuristic: bool = True,
check_vector: bool = True,
check_llm: bool = True,
) -> Union[DetectApiSuccessResponse, ApiFailureResponse]:
tactic_overrides: Optional[List[TacticOverride]] = None,
) -> DetectResponse:
"""
Detects if the given user input contains an injection attempt.
Args:
user_input (str): The user input to be checked for injection.
max_heuristic_score (float, optional): The maximum heuristic score allowed. Defaults to 0.75.
max_vector_score (float, optional): The maximum vector score allowed. Defaults to 0.90.
max_model_score (float, optional): The maximum model (LLM) score allowed. Defaults to 0.9.
check_heuristic (bool, optional): Whether to run the heuristic check. Defaults to True.
check_vector (bool, optional): Whether to run the vector check. Defaults to True.
check_llm (bool, optional): Whether to run the language model check. Defaults to True.
tactic_overrides (Optional[List[TacticOverride]], optional): A list of tactics to override.
If a tactic is not specified in this list, the default threshold for that tactic will be used.
Returns:
Tuple[Union[DetectApiSuccessResponse, ApiFailureResponse], bool]: A tuple containing the detection
metrics and a boolean indicating if an injection was detected.
DetectResponse: An object containing the detection metrics and a boolean indicating if an injection was
detected.
Example:
>>> from rebuff import Rebuff, TacticOverride, TacticName
>>> rb = Rebuff(api_token='your_api_token')
>>> user_input = "Your user input here"
>>> tactic_overrides = [
... TacticOverride(name=TacticName.HEURISTIC, threshold=0.6),
... TacticOverride(name=TacticName.LANGUAGE_MODEL, run=False),
... ]
>>> response = rb.detect_injection(user_input, tactic_overrides)
"""
request_data = DetectApiRequest(
userInput=user_input,
userInputBase64=encode_string(user_input),
runHeuristicCheck=check_heuristic,
runVectorCheck=check_vector,
runLanguageModelCheck=check_llm,
maxVectorScore=max_vector_score,
maxModelScore=max_model_score,
maxHeuristicScore=max_heuristic_score,
request_data = DetectRequest(
user_input=user_input,
user_input_base64=encode_string(user_input),
tactic_overrides=tactic_overrides,
)

response = requests.post(
f"{self.api_url}/api/detect",
json=request_data.dict(),
json=request_data.model_dump(mode="json", by_alias=True, exclude_none=True),
headers=self._headers,
)

response.raise_for_status()

response_json = response.json()
success_response = DetectApiSuccessResponse.parse_obj(response_json)

if (
success_response.heuristicScore > max_heuristic_score
or success_response.modelScore > max_model_score
or success_response.vectorScore["topScore"] > max_vector_score
):
# Injection detected
success_response.injectionDetected = True
return success_response
else:
# No injection detected
success_response.injectionDetected = False
return success_response
if "error" in response_json:
raise ApiFailureResponse(
response_json["error"],
response_json.get("message", "No message provided"),
)
response.raise_for_status()
return DetectResponse.model_validate(response_json)

@staticmethod
def generate_canary_word(length: int = 8) -> str:
Expand Down
Loading

0 comments on commit 8f89671

Please sign in to comment.