🛡️ Guardrail
Manager
MAIN
๐
Dashboard
๐ก๏ธ
Guards
๐ง
Custom Validators
๐ฆ
Hub Validators
๐งช
Test Guard
CONFIG
โ๏ธ
Settings
prompt_injection_validator.py
Edit Python validator file
← Back to Validators
Python source — Ctrl+S to save
💾 Save
🗑️ Delete
"""Custom prompt injection validator for Guardrails AI.

Pattern-based detection that doesn't require a Hub token or external LLM.
Covers the OWASP Top 10 LLM 2025 #1 vulnerability: indirect prompt injection.
"""

import re
from typing import Any, Callable, Dict, Optional

from guardrails.validator_base import (
    FailResult,
    PassResult,
    ValidationResult,
    Validator,
    register_validator,
)

# Canonical injection patterns - any match in the content is suspicious.
# Each entry pairs a compiled, case-insensitive regex with the label that is
# reported when the regex is found anywhere in the validated text.
_INJECTION_PATTERNS: list[tuple[re.Pattern, str]] = [
    # Classic override attempts
    (
        re.compile(r"ignore\s+(all\s+)?(previous|prior|above|earlier)\s+(instructions?|prompts?|context)", re.I),
        "override_ignore",
    ),
    (
        re.compile(r"disregard\s+(all\s+)?(previous|prior|above|earlier)", re.I),
        "override_disregard",
    ),
    (
        re.compile(r"forget\s+(everything|all|the above|prior)", re.I),
        "override_forget",
    ),
    (
        re.compile(r"(do not|don't)\s+(follow|obey|use)\s+(your|the|these)\s+(previous|prior|original|system)\s*(instructions?|prompt|rules?|guidelines?)?", re.I),
        "override_do_not_follow",
    ),
    # Role/identity hijacking
    (
        re.compile(r"\byou\s+are\s+now\b.{0,40}(an?\s+\w+\s+)?(assistant|ai|model|bot|agent|system)\b", re.I),
        "role_hijack",
    ),
    (
        re.compile(r"\bact\s+as\b.{0,20}(an?\s+\w+\s+)?(assistant|ai|model|bot|agent|expert|hacker|attacker)", re.I),
        "role_act_as",
    ),
    (
        re.compile(r"\bpretend\s+(you\s+are|to\s+be)\b", re.I),
        "role_pretend",
    ),
    (
        re.compile(r"\byour\s+(new\s+)?(role|persona|identity|instructions?|purpose|goal)\s+is\b", re.I),
        "role_new_role",
    ),
    (
        re.compile(r"\bswitch\s+(to\s+)?DAN\s+mode\b", re.I),
        "jailbreak_dan",
    ),
    (
        re.compile(r"\b(jailbreak|jail\s+break)\b", re.I),
        "jailbreak_keyword",
    ),
    # System prompt injection
    (
        re.compile(r"\[SYSTEM\]|\[INST\]|<\|system\|>|<<SYS>>|\[\/INST\]", re.I),
        "system_tag_injection",
    ),
    (
        re.compile(r"###\s*(System|New\s+Instruction|Override)", re.I),
        "system_heading_injection",
    ),
    (
        re.compile(r"<system_prompt>|</system_prompt>", re.I),
        "system_prompt_tag",
    ),
    # Exfiltration attempts
    (
        re.compile(r"(reveal|show|print|output|expose|leak|display)\s+(your\s+)?(system\s+)?(prompt|instructions?|context|training)", re.I),
        "exfil_system_prompt",
    ),
    (
        re.compile(r"what\s+(are|were|is)\s+your\s+(original\s+)?(instructions?|system\s+prompt|directives?)", re.I),
        "exfil_instructions",
    ),
    # Indirect injection via web/email content
    (
        re.compile(r"IMPORTANT\s*:\s*ARIA\s*(must|should|will)\s+", re.I),
        "targeted_aria",
    ),
    (
        re.compile(r"(send|forward|email|reply)\s+(all|everything|this|the)\s+(to|at)\s+\S+@\S+", re.I),
        "exfil_email",
    ),
    (
        re.compile(r"(execute|run|call)\s+(this\s+)?(command|script|code|function)\s*:", re.I),
        "exec_command",
    ),
    # Token smuggling / prompt boundary manipulation
    (
        re.compile(r"```(system|inst|override)", re.I),
        "code_block_injection",
    ),
    # Massive newline padding used to push earlier context away.
    (
        re.compile(r"\n{5,}", re.I),
        "whitespace_flood",
    ),
]

# Labels whose presence fails validation on their own, regardless of the
# configured threshold.
_HIGH_RISK_TYPES = {
    "targeted_aria",
    "exfil_email",
    "exec_command",
    "jailbreak_dan",
    "system_tag_injection",
    "system_prompt_tag",
}

# Replacement text offered as ``fix_value`` whenever content is rejected.
_BLOCKED_PLACEHOLDER = "[CONTENT BLOCKED: potential prompt injection detected]"


@register_validator(name="aria/prompt_injection_detector", data_type="string")
class PromptInjectionDetector(Validator):
    """Detect prompt injection attempts in text from untrusted sources.

    Flags patterns that attempt to override system instructions, hijack the
    AI's role, or exfiltrate system prompts. Designed for content fetched from
    the web, email bodies, or other external sources processed by ARIA.

    Args:
        threshold: Minimum number of distinct matched pattern labels required
            to fail when none of them is high-risk (default: 1, i.e. any
            single match fails). A high-risk match always fails, whatever the
            threshold is.
        on_fail: Action on failure - "exception", "fix", or "refrain".
    """

    def __init__(self, threshold: int = 1, on_fail: Optional[str] = "exception", **kwargs):
        # NOTE(review): threshold is forwarded to the base class following the
        # documented custom-validator pattern, so the base class receives the
        # validator's own arguments - confirm against the installed Guardrails
        # version.
        super().__init__(on_fail=on_fail, threshold=threshold, **kwargs)
        self._threshold = threshold

    @staticmethod
    def _blocked(headline: str, labels: list) -> FailResult:
        """Build the failure result for the given headline and matched labels."""
        return FailResult(
            error_message=(
                f"{headline}: {', '.join(labels)}. "
                "Content from untrusted source blocked."
            ),
            fix_value=_BLOCKED_PLACEHOLDER,
        )

    def validate(self, value: Any, metadata: Optional[Dict] = None) -> ValidationResult:
        """Scan ``value`` for injection patterns and pass or fail it.

        Args:
            value: Content to inspect. Non-string, empty and whitespace-only
                values pass without being scanned.
            metadata: Accepted for interface compatibility; not read here.

        Returns:
            ``PassResult`` when nothing suspicious is found or the number of
            ordinary matches is below the threshold; otherwise a
            ``FailResult`` listing the matched labels, with a placeholder
            string as ``fix_value``.
        """
        if not isinstance(value, str) or not value.strip():
            return PassResult()

        # One label per pattern that occurs at least once, in table order.
        matches = [
            label for pattern, label in _INJECTION_PATTERNS if pattern.search(value)
        ]
        if not matches:
            return PassResult()

        # Any high-risk match -> always fail, independent of the threshold.
        high_risk = [label for label in matches if label in _HIGH_RISK_TYPES]
        if high_risk:
            return self._blocked("Prompt injection detected (high-risk)", high_risk)

        # Enough ordinary matches -> fail.
        if len(matches) >= self._threshold:
            return self._blocked("Prompt injection detected", matches)

        return PassResult()
💾 Save File