Skip to content

🛡️ Data Privacy & Redaction

Privacy protection and sensitive data redaction functions.

🎯 Overview

Data privacy features provide automatic PII detection and redaction for secure data handling.

📦 Functions

RedactionEngine

datason.RedactionEngine(redact_fields: Optional[List[str]] = None, redact_patterns: Optional[List[str]] = None, redact_large_objects: bool = False, large_object_threshold: int = 10 * 1024 * 1024, redaction_replacement: str = '<REDACTED>', include_redaction_summary: bool = False, audit_trail: bool = False)

Core redaction engine for sensitive data protection.

Initialize the redaction engine.

Parameters:

Name Type Description Default
redact_fields Optional[List[str]]

List of field patterns to redact (e.g., ["password", "*.secret"])

None
redact_patterns Optional[List[str]]

List of regex patterns to redact

None
redact_large_objects bool

Whether to redact objects larger than threshold

False
large_object_threshold int

Size threshold for large object redaction (bytes)

10 * 1024 * 1024
redaction_replacement str

Replacement text for redacted content

'<REDACTED>'
include_redaction_summary bool

Whether to include redaction summary

False
audit_trail bool

Whether to maintain audit trail for compliance

False
Source code in datason/redaction.py
def __init__(
    self,
    redact_fields: Optional[List[str]] = None,
    redact_patterns: Optional[List[str]] = None,
    redact_large_objects: bool = False,
    large_object_threshold: int = 10 * 1024 * 1024,  # 10MB
    redaction_replacement: str = "<REDACTED>",
    include_redaction_summary: bool = False,
    audit_trail: bool = False,
):
    """Configure a redaction engine and prepare its bookkeeping state.

    Args:
        redact_fields: Field patterns to redact (e.g., ["password", "*.secret"]).
            ``None`` or an empty list is stored as an empty list.
        redact_patterns: Regex pattern strings to redact. ``None`` or an empty
            list is stored as an empty list.
        redact_large_objects: Whether objects larger than the threshold are redacted.
        large_object_threshold: Size threshold for large object redaction (bytes).
        redaction_replacement: Replacement text for redacted content.
        include_redaction_summary: Whether a redaction summary is reported.
        audit_trail: Whether an audit trail is maintained.

    Each regex in ``redact_patterns`` is compiled once, case-insensitively.
    A pattern that fails to compile is reported via ``warnings.warn`` and
    skipped rather than raising.
    """
    # Plain configuration, stored as given (missing lists become empty lists).
    self.redact_fields = redact_fields if redact_fields else []
    self.redact_patterns = redact_patterns if redact_patterns else []
    self.redact_large_objects = redact_large_objects
    self.large_object_threshold = large_object_threshold
    self.redaction_replacement = redaction_replacement
    self.include_redaction_summary = include_redaction_summary
    self.audit_trail = audit_trail

    # Pre-compile the regexes so text redaction does not recompile per call.
    self._compiled_patterns: List[Pattern] = []
    for raw_pattern in self.redact_patterns:
        try:
            compiled = re.compile(raw_pattern, re.IGNORECASE)
        except re.error as exc:
            # Keep going: one bad pattern must not disable the remaining ones.
            warnings.warn(f"Invalid regex pattern '{raw_pattern}': {exc}", stacklevel=2)
        else:
            self._compiled_patterns.append(compiled)

    # Storage for audit entries recorded during redaction.
    self._audit_entries: List[RedactionAuditEntry] = []

    # Running counters/lists describing what has been redacted.
    self._summary = RedactionSummary()

get_audit_trail() -> Optional[List[Dict[str, Any]]]

Return the audit trail as a list of dictionaries (one per recorded redaction), or None when the audit trail is disabled.

Source code in datason/redaction.py
def get_audit_trail(self) -> Optional[List[Dict[str, Any]]]:
    """Export the recorded audit entries as plain dictionaries.

    Returns:
        ``None`` when ``self.audit_trail`` is falsy. Otherwise a new list with
        one dictionary per entry in ``self._audit_entries`` (in stored order),
        holding the entry's timestamp, redaction_type, target, original_type,
        replacement and context attributes.
    """
    if self.audit_trail:
        exported_keys = (
            "timestamp",
            "redaction_type",
            "target",
            "original_type",
            "replacement",
            "context",
        )
        trail = []
        for record in self._audit_entries:
            # Copy each attribute under a key of the same name.
            trail.append({key: getattr(record, key) for key in exported_keys})
        return trail

    return None

get_redaction_summary() -> Optional[Dict[str, Any]]

Return the redaction summary as a dictionary, or None when the redaction summary is disabled.

Source code in datason/redaction.py
def get_redaction_summary(self) -> Optional[Dict[str, Any]]:
    """Return a summary of the redactions recorded so far.

    Returns:
        ``None`` when ``self.include_redaction_summary`` is falsy. Otherwise a
        dictionary with a single ``"redaction_summary"`` key whose value holds:

        * ``fields_redacted``, ``patterns_matched``, ``large_objects_redacted``:
          de-duplicated lists, in first-seen order.
        * ``total_redactions``: the running redaction counter.
        * ``redaction_timestamp``: the timestamp stored on the summary object.
    """
    if not self.include_redaction_summary:
        return None

    summary = self._summary
    # dict.fromkeys de-duplicates while preserving insertion order. Going
    # through set() made the list order arbitrary (and, for strings, different
    # from run to run because of hash randomization), which made summaries
    # hard to compare or snapshot.
    return {
        "redaction_summary": {
            "fields_redacted": list(dict.fromkeys(summary.fields_redacted)),
            "patterns_matched": list(dict.fromkeys(summary.patterns_matched)),
            "large_objects_redacted": list(dict.fromkeys(summary.large_objects_redacted)),
            "total_redactions": summary.total_redactions,
            "redaction_timestamp": summary.redaction_timestamp,
        }
    }

process_object(obj: Any, field_path: str = '', _visited: Optional[Set[int]] = None) -> Any

Process an object for redaction.

Parameters:

Name Type Description Default
obj Any

Object to process

required
field_path str

Current field path

''
_visited Optional[Set[int]]

Set of visited object IDs (for circular reference detection)

None

Returns:

Type Description
Any

Processed object with redactions applied

Source code in datason/redaction.py
def process_object(self, obj: Any, field_path: str = "", _visited: Optional[Set[int]] = None) -> Any:
    """Apply redaction rules to ``obj`` and return the processed result.

    Args:
        obj: Object to process.
        field_path: Path of ``obj`` within the structure being processed.
        _visited: ``id()`` values of the containers currently being processed,
            used to detect circular references. Created on the outermost call.

    Returns:
        * ``"<CIRCULAR_REFERENCE>"`` if ``obj`` is already being processed.
        * The large-object placeholder if ``should_redact_large_object`` says so.
        * The result of ``_process_dict`` / ``_process_list`` for dicts and
          lists/tuples, the redacted text for strings, and ``obj`` itself for
          every other type (including sets).
    """
    active_ids = set() if _visited is None else _visited

    identity = id(obj)
    if identity in active_ids:
        return "<CIRCULAR_REFERENCE>"

    # The size check runs before any traversal of the object.
    if self.should_redact_large_object(obj):
        return self.redact_large_object(obj, field_path)

    # Only these mutable containers are tracked for cycle detection.
    is_tracked = isinstance(obj, (dict, list, set))
    if is_tracked:
        active_ids.add(identity)

    try:
        if isinstance(obj, dict):
            return self._process_dict(obj, field_path, active_ids)
        if isinstance(obj, (list, tuple)):
            return self._process_list(obj, field_path, active_ids)
        if isinstance(obj, str):
            # Strings only get pattern-based redaction.
            cleaned, _ = self.redact_text(obj, field_path)
            return cleaned
        # Anything else passes through untouched.
        return obj
    finally:
        # Untrack on the way out so the same object may appear again in a
        # sibling branch without being reported as circular.
        if is_tracked:
            active_ids.discard(identity)

redact_field_value(value: Any, field_path: str) -> Any

Redact a field value.

Parameters:

Name Type Description Default
value Any

Value to redact

required
field_path str

Field path for audit trail

required

Returns:

Type Description
Any

Redacted value

Source code in datason/redaction.py
def redact_field_value(self, value: Any, field_path: str) -> Any:
    """Replace a field's value with the configured redaction text.

    Args:
        value: Value being redacted; only its type name is recorded.
        field_path: Path of the field, used in the audit entry and summary.

    Returns:
        ``self.redaction_replacement``.

    Side effects:
        Appends ``field_path`` to the summary's ``fields_redacted`` and bumps
        ``total_redactions``. When ``self.audit_trail`` is truthy, also records
        a ``"field"`` audit entry.
    """
    placeholder = self.redaction_replacement

    if self.audit_trail:
        self._add_audit_entry(
            redaction_type="field",
            target=field_path,
            original_type=type(value).__name__,
            replacement=placeholder,
            context="Field pattern match",
        )

    # The summary is updated regardless of the audit/summary flags.
    stats = self._summary
    stats.fields_redacted.append(field_path)
    stats.total_redactions += 1

    return placeholder

redact_large_object(obj: Any, field_path: str = '') -> Any

Redact a large object.

Parameters:

Name Type Description Default
obj Any

Object to redact

required
field_path str

Field path for audit trail

''

Returns:

Type Description
Any

Redacted representation

Source code in datason/redaction.py
def redact_large_object(self, obj: Any, field_path: str = "") -> Any:
    """Replace an object with a placeholder describing its type and size.

    Args:
        obj: Object to redact.
        field_path: Path of the object, used in the audit entry and summary.
            When empty, a label derived from the type name is used instead.

    Returns:
        A string of the form
        ``<LARGE_OBJECT_REDACTED: <type name>, ~<size> bytes>`` where the size
        is ``sys.getsizeof(obj)`` (the shallow size of the object itself).

    Side effects:
        Appends to the summary's ``large_objects_redacted`` and bumps
        ``total_redactions``. When ``self.audit_trail`` is truthy, also records
        a ``"size"`` audit entry.
    """
    byte_count = sys.getsizeof(obj)
    type_name = type(obj).__name__
    placeholder = f"<LARGE_OBJECT_REDACTED: {type_name}, ~{byte_count:,} bytes>"

    if self.audit_trail:
        audit_target = field_path if field_path else f"large_{type_name}"
        self._add_audit_entry(
            redaction_type="size",
            target=audit_target,
            original_type=type_name,
            replacement=placeholder,
            context=f"Object size: {byte_count:,} bytes",
        )

    stats = self._summary
    summary_label = field_path if field_path else f"<{type_name}>"
    stats.large_objects_redacted.append(summary_label)
    stats.total_redactions += 1

    return placeholder

redact_text(text: str, context: str = '') -> Tuple[str, bool]

Redact sensitive patterns from text.

Parameters:

Name Type Description Default
text str

Text to redact

required
context str

Context for audit trail

''

Returns:

Type Description
Tuple[str, bool]

Tuple of (redacted_text, was_redacted)

Source code in datason/redaction.py
def redact_text(self, text: str, context: str = "") -> Tuple[str, bool]:
    """Redact every match of the compiled patterns in ``text``.

    Patterns are applied one after another, each on the output of the previous
    one, and every match is replaced with ``self.redaction_replacement``.

    Args:
        text: Text to redact. Non-string input is returned unchanged.
        context: Context string stored on audit entries.

    Returns:
        Tuple of ``(redacted_text, was_redacted)``. ``was_redacted`` is
        ``False`` when ``text`` is not a string, no patterns are configured,
        or nothing matched.

    Side effects:
        When something matched, the patterns that actually matched are added
        to the summary's ``patterns_matched`` and ``total_redactions`` is
        incremented once for this text. When ``self.audit_trail`` is truthy,
        one ``"pattern"`` audit entry is recorded per individual match.
    """
    if not isinstance(text, str) or not self._compiled_patterns:
        return text, False

    redacted_text = text
    matched_patterns: List[str] = []

    for pattern in self._compiled_patterns:
        matches = pattern.findall(redacted_text)
        if not matches:
            continue

        redacted_text = pattern.sub(self.redaction_replacement, redacted_text)
        matched_patterns.append(pattern.pattern)

        if self.audit_trail:
            # One audit entry per occurrence; the matched text itself is
            # deliberately not stored.
            for _ in matches:
                self._add_audit_entry(
                    redaction_type="pattern",
                    target=pattern.pattern,
                    original_type="string",
                    replacement=self.redaction_replacement,
                    context=context,
                )

    was_redacted = bool(matched_patterns)
    if was_redacted:
        # Record only the patterns that matched. Previously every configured
        # pattern was reported as matched as soon as any single one hit, which
        # made the summary's "patterns_matched" list wrong.
        self._summary.patterns_matched.extend(matched_patterns)
        self._summary.total_redactions += 1

    return redacted_text, was_redacted

should_redact_large_object(obj: Any) -> bool

Check if object should be redacted due to size.

Parameters:

Name Type Description Default
obj Any

Object to check

required

Returns:

Type Description
bool

True if object should be redacted due to size

Source code in datason/redaction.py
def should_redact_large_object(self, obj: Any) -> bool:
    """Decide whether ``obj`` exceeds the large-object size threshold.

    The size is an estimate: ``sys.getsizeof(obj)`` plus, for lists, tuples
    and dicts, the shallow size of the contained items. At most the first 10
    items are measured; the remainder is extrapolated from their average.
    Nested containers are not descended into.

    Args:
        obj: Object to check.

    Returns:
        ``True`` if large-object redaction is enabled and the estimated size
        is greater than ``self.large_object_threshold``; ``False`` otherwise,
        including when the size cannot be determined.
    """
    if not self.redact_large_objects:
        return False

    from itertools import islice  # local import keeps this block self-contained

    max_samples = 10

    try:
        size = sys.getsizeof(obj)

        # Measure only a bounded prefix so huge collections stay cheap to check
        # (islice avoids materializing every dict item just to look at ten).
        if isinstance(obj, (list, tuple)):
            sample_sizes = [sys.getsizeof(item) for item in islice(obj, max_samples)]
        elif isinstance(obj, dict):
            sample_sizes = [
                sys.getsizeof(key) + sys.getsizeof(value)
                for key, value in islice(obj.items(), max_samples)
            ]
        else:
            sample_sizes = []

        if sample_sizes:
            sampled_bytes = sum(sample_sizes)
            unsampled_count = len(obj) - len(sample_sizes)
            avg_item_size = sampled_bytes / len(sample_sizes)
            # Count the sampled items themselves AND the extrapolated rest.
            # Adding only the extrapolated part meant collections of up to 10
            # items were measured as an empty container.
            size += sampled_bytes + int(avg_item_size * unsampled_count)

        return size > self.large_object_threshold

    except (TypeError, AttributeError):
        # Can't determine size, assume it's not large
        return False

Pre-built Redaction Engines

datason.create_financial_redaction_engine() -> RedactionEngine

Create a redaction engine optimized for financial data.

Source code in datason/redaction.py
def create_financial_redaction_engine() -> RedactionEngine:
    """Build a RedactionEngine preconfigured for financial data.

    The engine redacts credential- and account-style fields, applies regexes
    for card numbers, US SSNs, 9-digit numbers and 10-12 digit numbers, redacts
    large objects above 5MB, and has both the redaction summary and the audit
    trail switched on.

    Note that the numeric patterns match any standalone digit run of the given
    length, not only genuine tax IDs or account numbers.
    """
    sensitive_fields = [
        "*.password",
        "*.secret",
        "*.key",
        "*.token",
        "*.ssn",
        "*.social_security",
        "*.tax_id",
        "*.account_number",
        "*.routing_number",
        "*.credit_card",
        "*.card_number",
        "*.cvv",
        "*.pin",
    ]
    sensitive_patterns = [
        r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b",  # Credit cards
        r"\b\d{3}-\d{2}-\d{4}\b",  # US SSN
        r"\b\d{9}\b",  # US Tax ID
        r"\b\d{10,12}\b",  # Account numbers
    ]
    five_megabytes = 5 * 1024 * 1024  # 5MB for financial data

    return RedactionEngine(
        redact_fields=sensitive_fields,
        redact_patterns=sensitive_patterns,
        redact_large_objects=True,
        large_object_threshold=five_megabytes,
        include_redaction_summary=True,
        audit_trail=True,
    )

datason.create_healthcare_redaction_engine() -> RedactionEngine

Create a redaction engine optimized for healthcare data.

Source code in datason/redaction.py
def create_healthcare_redaction_engine() -> RedactionEngine:
    """Build a RedactionEngine preconfigured for healthcare data.

    The engine redacts patient-identifying fields, applies regexes for email
    addresses, phone numbers and SSNs, redacts large objects (using the
    engine's default size threshold), and has both the redaction summary and
    the audit trail switched on.
    """
    return RedactionEngine(
        redact_fields=[
            "*.patient_id",
            "*.medical_record",
            "*.ssn",
            "*.phone",
            "*.email",
            "*.address",
            "*.name",
            "*.dob",
            "*.birth_date",
            "*.diagnosis",
        ],
        redact_patterns=[
            # The TLD class is [A-Za-z]: inside a character class "|" is a
            # literal pipe, not alternation, so the former [A-Z|a-z] also
            # accepted "|" as part of the top-level domain.
            r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",  # Email
            r"\b(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b",  # Phone
            r"\b\d{3}-\d{2}-\d{4}\b",  # SSN
        ],
        redact_large_objects=True,
        include_redaction_summary=True,
        audit_trail=True,
    )

datason.create_minimal_redaction_engine() -> RedactionEngine

Create a minimal redaction engine for basic privacy protection.

Source code in datason/redaction.py
def create_minimal_redaction_engine() -> RedactionEngine:
    """Build a RedactionEngine with a minimal set of privacy rules.

    The engine redacts password/secret/key/token fields and email addresses
    found in text. Large-object redaction, the redaction summary and the audit
    trail are all switched off.
    """
    return RedactionEngine(
        redact_fields=["*.password", "*.secret", "*.key", "*.token"],
        redact_patterns=[
            # The TLD class is [A-Za-z]: inside a character class "|" is a
            # literal pipe, not alternation, so the former [A-Z|a-z] also
            # accepted "|" as part of the top-level domain.
            r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",  # Email addresses
        ],
        redact_large_objects=False,
        include_redaction_summary=False,
        audit_trail=False,
    )