# Edit prompt_injection_validator.py

# Python source — Ctrl+S to save
"""Custom prompt injection validator for Guardrails AI.

Pattern-based detection that doesn't require a Hub token or external LLM.
Targets the #1 risk in the OWASP Top 10 for LLM Applications 2025
(LLM01: Prompt Injection), including indirect prompt injection.
"""
import re
from typing import Any, Callable, Dict, Optional
from guardrails.validator_base import (
    FailResult,
    PassResult,
    ValidationResult,
    Validator,
    register_validator,
)


# Detection table: a list of (compiled regex, label) pairs.  Every regex is
# compiled case-insensitively; a single hit anywhere in the scanned text marks
# the content as suspicious and contributes its label to the match list.
_INJECTION_PATTERNS: list[tuple[re.Pattern, str]] = [
    (re.compile(regex, re.I), label)
    for label, regex in (
        # --- Attempts to override earlier instructions ---
        (
            "override_ignore",
            r"ignore\s+(all\s+)?(previous|prior|above|earlier)\s+(instructions?|prompts?|context)",
        ),
        (
            "override_disregard",
            r"disregard\s+(all\s+)?(previous|prior|above|earlier)",
        ),
        (
            "override_forget",
            r"forget\s+(everything|all|the above|prior)",
        ),
        (
            "override_do_not_follow",
            r"(do not|don't)\s+(follow|obey|use)\s+(your|the|these)\s+(previous|prior|original|system)\s*(instructions?|prompt|rules?|guidelines?)?",
        ),

        # --- Role / identity hijacking and jailbreak keywords ---
        (
            "role_hijack",
            r"\byou\s+are\s+now\b.{0,40}(an?\s+\w+\s+)?(assistant|ai|model|bot|agent|system)\b",
        ),
        (
            "role_act_as",
            r"\bact\s+as\b.{0,20}(an?\s+\w+\s+)?(assistant|ai|model|bot|agent|expert|hacker|attacker)",
        ),
        (
            "role_pretend",
            r"\bpretend\s+(you\s+are|to\s+be)\b",
        ),
        (
            "role_new_role",
            r"\byour\s+(new\s+)?(role|persona|identity|instructions?|purpose|goal)\s+is\b",
        ),
        (
            "jailbreak_dan",
            r"\bswitch\s+(to\s+)?DAN\s+mode\b",
        ),
        (
            "jailbreak_keyword",
            r"\b(jailbreak|jail\s+break)\b",
        ),

        # --- Fake system-prompt markers ---
        (
            "system_tag_injection",
            r"\[SYSTEM\]|\[INST\]|<\|system\|>|<<SYS>>|\[\/INST\]",
        ),
        (
            "system_heading_injection",
            r"###\s*(System|New\s+Instruction|Override)",
        ),
        (
            "system_prompt_tag",
            r"<system_prompt>|</system_prompt>",
        ),

        # --- Requests to disclose the prompt or instructions ---
        (
            "exfil_system_prompt",
            r"(reveal|show|print|output|expose|leak|display)\s+(your\s+)?(system\s+)?(prompt|instructions?|context|training)",
        ),
        (
            "exfil_instructions",
            r"what\s+(are|were|is)\s+your\s+(original\s+)?(instructions?|system\s+prompt|directives?)",
        ),

        # --- Indirect injection carried by web / email content ---
        (
            "targeted_aria",
            r"IMPORTANT\s*:\s*ARIA\s*(must|should|will)\s+",
        ),
        (
            "exfil_email",
            r"(send|forward|email|reply)\s+(all|everything|this|the)\s+(to|at)\s+\S+@\S+",
        ),
        (
            "exec_command",
            r"(execute|run|call)\s+(this\s+)?(command|script|code|function)\s*:",
        ),

        # --- Token smuggling / prompt-boundary manipulation ---
        (
            "code_block_injection",
            r"```(system|inst|override)",
        ),
        (
            # Five or more consecutive newlines: padding meant to push context away.
            "whitespace_flood",
            r"\n{5,}",
        ),
    )
]

# Labels from the pattern table that are treated as high-risk: the validator
# fails on any one of these without consulting its match-count threshold.
_HIGH_RISK_TYPES = {
    # indirect injection via external content
    "targeted_aria",
    "exfil_email",
    "exec_command",
    # jailbreak / forged system markers
    "jailbreak_dan",
    "system_tag_injection",
    "system_prompt_tag",
}


@register_validator(name="aria/prompt_injection_detector", data_type="string")
class PromptInjectionDetector(Validator):
    """Detect prompt injection attempts in text from untrusted sources.

    Flags patterns that attempt to override system instructions, hijack the
    AI's role, or exfiltrate system prompts. Designed for content fetched
    from the web, email bodies, or other external sources processed by ARIA.

    The value is scanned against every entry of ``_INJECTION_PATTERNS`` and
    the label of each matching pattern is collected (one entry per pattern,
    no matter how often it matches). The outcome is then decided in order:

    1. Non-string, empty, or whitespace-only values pass without scanning.
    2. No pattern matched: pass.
    3. At least one matched label is in ``_HIGH_RISK_TYPES``: fail,
       regardless of ``threshold``.
    4. Otherwise: fail when the number of matched patterns is at least
       ``threshold``, pass when it is lower.

    Args:
        threshold: Minimum number of matched patterns needed to fail when
            none of them is high-risk. The default of 1 means any single
            match fails; values below 1 behave the same as 1 because this
            check is only reached with at least one match.
        on_fail: Action on failure, forwarded to ``Validator`` — e.g.
            "exception", "fix", or "refrain".
        **kwargs: Forwarded unchanged to ``Validator.__init__``.
    """

    def __init__(self, threshold: int = 1, on_fail: Optional[str] = "exception", **kwargs):
        super().__init__(on_fail=on_fail, **kwargs)
        self._threshold = threshold

    def validate(self, value: Any, metadata: Optional[Dict] = None) -> ValidationResult:
        """Scan ``value`` for injection patterns and report pass/fail.

        Args:
            value: Content to inspect. Anything that is not a non-blank
                ``str`` is passed through as valid.
            metadata: Accepted for interface compatibility; not read here.
                Defaults to ``None`` rather than a shared mutable ``{}``.

        Returns:
            ``PassResult`` when the content is accepted, otherwise a
            ``FailResult`` whose ``fix_value`` is a fixed placeholder string
            replacing the blocked content.
        """
        if not isinstance(value, str) or not value.strip():
            return PassResult()

        matches = [
            pattern_type
            for pattern, pattern_type in _INJECTION_PATTERNS
            if pattern.search(value)
        ]
        if not matches:
            return PassResult()

        blocked_placeholder = "[CONTENT BLOCKED: potential prompt injection detected]"

        # Any high-risk match → always fail, independent of the threshold.
        high_risk = [m for m in matches if m in _HIGH_RISK_TYPES]
        if high_risk:
            return FailResult(
                error_message=(
                    f"Prompt injection detected (high-risk): {', '.join(high_risk)}. "
                    "Content from untrusted source blocked."
                ),
                fix_value=blocked_placeholder,
            )

        # Only non-high-risk matches remain: fail once their count reaches the
        # configured threshold (with the default of 1, a single match fails).
        if len(matches) >= self._threshold:
            return FailResult(
                error_message=(
                    f"Prompt injection detected: {', '.join(matches)}. "
                    "Content from untrusted source blocked."
                ),
                fix_value=blocked_placeholder,
            )

        # Reached only when threshold > number of (non-high-risk) matches.
        return PassResult()