# Edit validatorplus.py — Guardrail Manager
#
# Python source — Ctrl+S to save
"""ValidatorPlus — comprehensive multi-check content safety validator for RavRic Solutions.

Combines prompt injection detection, policy compliance, and content safety into a single
configurable validator. Designed for production AI pipelines needing multiple risk checks.
"""
import re
from typing import Any, Dict, Optional
from guardrails.validator_base import (
    FailResult,
    PassResult,
    ValidationResult,
    Validator,
    register_validator,
)

# Prompt-injection signatures as (compiled regex, violation name) pairs.
# Every pattern is case-insensitive and is meant to be applied with
# ``search`` so it can match anywhere in the input text.
_INJECTION_PATTERNS = [
    # Attempts to cancel earlier instructions.
    (re.compile(r"ignore\s+(all\s+)?(previous|prior|above|earlier)\s+(instructions?|prompts?|context)", re.I), "override_ignore"),
    (re.compile(r"disregard\s+(all\s+)?(previous|prior|above|earlier)", re.I), "override_disregard"),
    # ``the\s+above`` accepts any run of whitespace, like the other patterns,
    # so extra spaces or a line break cannot be used to slip past the check.
    (re.compile(r"forget\s+(everything|all|the\s+above|prior)", re.I), "override_forget"),
    # Role reassignment. The two-letter alternative "ai" is anchored with \b so
    # it only matches as a standalone word; unanchored it also matched inside
    # ordinary words ("maid", "captain", "Shanghai") and flagged benign text.
    # The longer alternatives are left unanchored on purpose so that forms
    # such as "chatbot" or "hackers" keep matching.
    (re.compile(r"\byou\s+are\s+now\b.{0,40}(assistant|\bai|model|bot|agent|system)\b", re.I), "role_hijack"),
    (re.compile(r"\bact\s+as\b.{0,20}(assistant|\bai\b|model|bot|agent|hacker)", re.I), "role_act_as"),
    (re.compile(r"\bpretend\s+(you\s+are|to\s+be)\b", re.I), "role_pretend"),
    # Chat-template control tokens embedded in user text.
    (re.compile(r"\[SYSTEM\]|\[INST\]|<\|system\|>|<<SYS>>", re.I), "system_tag_injection"),
    # Requests to disclose the system prompt or instructions.
    (re.compile(r"(reveal|show|print|expose|leak)\s+(your\s+)?(system\s+)?(prompt|instructions?)", re.I), "exfil_system_prompt"),
    (re.compile(r"\b(jailbreak|jail\s+break)\b", re.I), "jailbreak_keyword"),
]

# Harmful-content signatures as (compiled regex, violation name) pairs.
# Every pattern is case-insensitive.
_TOXIC_PATTERNS = [
    (re.compile(r"\b(kill|murder|assassinate)\s+(yourself|himself|herself|myself)\b", re.I), "self_harm"),
    # An optional article ("a", "an", "the", "some") and an optional plural "s"
    # are accepted around the noun; without them the most common phrasings
    # ("how to make a bomb", "synthesize drugs") were not detected.
    (re.compile(r"\b(how\s+to\s+make|synthesize|manufacture)\s+(?:(?:an?|the|some)\s+)?(bomb|explosive|poison|drug)s?\b", re.I), "dangerous_instructions"),
]

# PII signatures as (compiled regex, violation name) pairs.
#   ssn:         3 digits, 2 digits, 4 digits, each group separated by a
#                hyphen, dot, or whitespace character.
#   credit_card: 13 to 16 digits, each optionally followed by a space or hyphen.
_PII_PATTERNS = [
    (re.compile(source), label)
    for label, source in (
        ("ssn", r"\b\d{3}[-.\s]\d{2}[-.\s]\d{4}\b"),
        ("credit_card", r"\b(?:\d[ -]?){13,16}\b"),
    )
]

# Violation names that are reported as "high-risk"; any other name is
# reported as "medium-risk".
_HIGH_RISK = {
    "dangerous_instructions",
    "jailbreak_keyword",
    "self_harm",
    "system_tag_injection",
}


@register_validator(name="rrs/validator_plus", data_type="string")
class ValidatorPlus(Validator):
    """Multi-check: prompt injection + toxicity + optional PII.

    Each enabled check scans the input with its module-level table of
    (compiled regex, violation name) pairs and collects the name of every
    pattern that matches.

    Args:
        check_injection: Enable prompt injection detection (default: True)
        check_toxicity:  Enable harmful content detection (default: True)
        check_pii:       Enable PII detection (default: False)
        on_fail:         Action on failure
    """

    def __init__(
        self,
        check_injection: bool = True,
        check_toxicity: bool = True,
        check_pii: bool = False,
        on_fail=None,
        **kwargs,
    ):
        # NOTE(review): the check_* flags are stored locally only and are not
        # forwarded to the base class — confirm whether the framework needs
        # them passed to super().__init__ for serialization.
        super().__init__(on_fail=on_fail, **kwargs)
        self._check_injection = check_injection
        self._check_toxicity = check_toxicity
        self._check_pii = check_pii

    def validate(self, value: Any, metadata: Optional[Dict[str, Any]] = None) -> ValidationResult:
        """Scan ``value`` with every enabled check and report the matches.

        Args:
            value: Content to inspect. Anything that is not a non-blank string
                passes without being scanned.
            metadata: Accepted for interface compatibility; this validator does
                not read it. Defaults to ``None`` rather than a shared mutable
                dict.

        Returns:
            ``PassResult`` when the value is not scanned or no pattern matches.
            Otherwise a ``FailResult`` whose message lists the matched
            violation names and whose ``fix_value`` is a fixed placeholder. If
            any matched name is in ``_HIGH_RISK`` the message is labelled
            "high-risk" and lists only those names; otherwise it is labelled
            "medium-risk" and lists every match.
        """
        if not isinstance(value, str) or not value.strip():
            return PassResult()

        # (enabled flag, pattern table) in reporting order.
        checks = (
            (self._check_injection, _INJECTION_PATTERNS),
            (self._check_toxicity, _TOXIC_PATTERNS),
            (self._check_pii, _PII_PATTERNS),
        )

        violations = []
        for enabled, patterns in checks:
            if not enabled:
                continue
            violations.extend(name for pattern, name in patterns if pattern.search(value))

        if not violations:
            return PassResult()

        high = [name for name in violations if name in _HIGH_RISK]
        severity = "high-risk" if high else "medium-risk"
        # When anything high-risk matched, only those names are reported.
        flagged = high or violations

        return FailResult(
            error_message=f"ValidatorPlus blocked {severity} content: {', '.join(flagged)}",
            fix_value="[CONTENT BLOCKED by ValidatorPlus]",
        )