📚 Complete API Reference

Auto-generated documentation for all datason functions, classes, and constants.

🚀 Modern API Functions

Serialization Functions

datason.dump(obj: Any, fp: Any, **kwargs: Any) -> None

Enhanced file serialization (DataSON's smart default).

This saves enhanced DataSON serialized data to a file using save_ml(). For stdlib json.dump() compatibility, use datason.json.dump() or dump_json().

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| obj | Any | Object to serialize | required |
| fp | Any | File-like object or file path to write to | required |
| **kwargs | Any | DataSON configuration options | {} |

Returns:

| Type | Description |
| --- | --- |
| None | None (writes to file) |

Example

>>> with open('data.json', 'w') as f:
...     dump(data, f)  # Enhanced serialization with smart features

>>> # For JSON compatibility:
>>> import datason.json as json
>>> with open('data.json', 'w') as f:
...     json.dump(data, f)  # Exact json.dump() behavior

Source code in datason/api.py
def dump(obj: Any, fp: Any, **kwargs: Any) -> None:
    """Enhanced file serialization (DataSON's smart default).

    This saves enhanced DataSON serialized data to a file using save_ml().
    For stdlib json.dump() compatibility, use datason.json.dump() or dump_json().

    Args:
        obj: Object to serialize
        fp: File-like object or file path to write to
        **kwargs: DataSON configuration options

    Returns:
        None (writes to file)

    Example:
        >>> with open('data.json', 'w') as f:
        ...     dump(data, f)  # Enhanced serialization with smart features

        >>> # For JSON compatibility:
        >>> import datason.json as json
        >>> with open('data.json', 'w') as f:
        ...     json.dump(data, f)  # Exact json.dump() behavior
    """
    # Use enhanced file saving (supports both file objects and paths)
    if hasattr(fp, "write"):
        # File-like object: serialize to enhanced format and write
        import json

        serialized = _serialize_core(obj, **kwargs)
        json.dump(serialized, fp)
    else:
        # File path: use save_ml for enhanced features
        save_ml(obj, fp, **kwargs)
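
Usage sketch (not part of the library source): it exercises both call styles documented above, with a purely illustrative payload and file names.

>>> import datason
>>> from datetime import datetime
>>> data = {"created": datetime.now(), "values": [1, 2, 3]}
>>> with open("data.json", "w") as f:
...     datason.dump(data, f)           # file-like object: enhanced dict written with json.dump
>>> datason.dump(data, "data_ml.json")  # file path: delegated to save_ml() per the source above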

datason.dump_ml(obj: Any, **kwargs: Any) -> Any

ML-optimized serialization for models, tensors, and ML objects.

Automatically configures optimal settings for machine learning objects including NumPy arrays, PyTorch tensors, scikit-learn models, etc.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| obj | Any | ML object to serialize | required |
| **kwargs | Any | Additional configuration options | {} |

Returns:

| Type | Description |
| --- | --- |
| Any | Serialized ML object optimized for reconstruction |

Example

>>> model = sklearn.ensemble.RandomForestClassifier()
>>> serialized = dump_ml(model)
>>> # Optimized for ML round-trip fidelity

Source code in datason/api.py
def dump_ml(obj: Any, **kwargs: Any) -> Any:
    """ML-optimized serialization for models, tensors, and ML objects.

    Automatically configures optimal settings for machine learning objects
    including NumPy arrays, PyTorch tensors, scikit-learn models, etc.

    Args:
        obj: ML object to serialize
        **kwargs: Additional configuration options

    Returns:
        Serialized ML object optimized for reconstruction

    Example:
        >>> model = sklearn.ensemble.RandomForestClassifier()
        >>> serialized = dump_ml(model)
        >>> # Optimized for ML round-trip fidelity
    """
    # Create a copy of ML-optimized config to avoid modifying shared instances
    base_config = get_ml_config()
    from dataclasses import replace

    config = replace(base_config, **kwargs)

    # Directly call serialize - serializer handles circular references properly
    from .core_new import serialize

    return serialize(obj, config=config)

datason.dump_api(obj: Any, **kwargs: Any) -> Any

API-safe serialization for web responses and APIs.

Produces clean, predictable JSON suitable for API responses. Handles edge cases gracefully and ensures consistent output format.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| obj | Any | Object to serialize for API response | required |
| **kwargs | Any | Additional configuration options | {} |

Returns:

| Type | Description |
| --- | --- |
| Any | API-safe serialized object |

Example

>>> @app.route('/api/data')
>>> def get_data():
>>>     return dump_api(complex_data_structure)

Source code in datason/api.py
def dump_api(obj: Any, **kwargs: Any) -> Any:
    """API-safe serialization for web responses and APIs.

    Produces clean, predictable JSON suitable for API responses.
    Handles edge cases gracefully and ensures consistent output format.

    Args:
        obj: Object to serialize for API response
        **kwargs: Additional configuration options

    Returns:
        API-safe serialized object

    Example:
        >>> @app.route('/api/data')
        >>> def get_data():
        >>>     return dump_api(complex_data_structure)
    """
    # Create a copy of API-optimized config to avoid modifying shared instances
    base_config = get_api_config()
    from dataclasses import replace

    config = replace(base_config, **kwargs)

    # Directly call serialize - serializer handles circular references properly
    from .core_new import serialize

    return serialize(obj, config=config)

datason.dump_secure(obj: Any, *, redact_pii: bool = True, redact_fields: Optional[List[str]] = None, redact_patterns: Optional[List[str]] = None, **kwargs: Any) -> Any

Security-focused serialization with PII redaction.

Automatically redacts sensitive information like credit cards, SSNs, emails, and common secret fields.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| obj | Any | Object to serialize securely | required |
| redact_pii | bool | Enable automatic PII pattern detection | True |
| redact_fields | Optional[List[str]] | Additional field names to redact | None |
| redact_patterns | Optional[List[str]] | Additional regex patterns to redact | None |
| **kwargs | Any | Additional configuration options | {} |

Returns:

| Type | Description |
| --- | --- |
| Any | Serialized object with sensitive data redacted |

Example

>>> user_data = {"name": "John", "ssn": "123-45-6789"}
>>> safe_data = dump_secure(user_data)
>>> # SSN will be redacted: {"name": "John", "ssn": "[REDACTED]"}

Source code in datason/api.py
def dump_secure(
    obj: Any,
    *,
    redact_pii: bool = True,
    redact_fields: Optional[List[str]] = None,
    redact_patterns: Optional[List[str]] = None,
    **kwargs: Any,
) -> Any:
    """Security-focused serialization with PII redaction.

    Automatically redacts sensitive information like credit cards,
    SSNs, emails, and common secret fields.

    Args:
        obj: Object to serialize securely
        redact_pii: Enable automatic PII pattern detection
        redact_fields: Additional field names to redact
        redact_patterns: Additional regex patterns to redact
        **kwargs: Additional configuration options

    Returns:
        Serialized object with sensitive data redacted

    Example:
        >>> user_data = {"name": "John", "ssn": "123-45-6789"}
        >>> safe_data = dump_secure(user_data)
        >>> # SSN will be redacted: {"name": "John", "ssn": "[REDACTED]"}
    """
    # Create secure config with redaction settings
    patterns = []
    fields = []

    if redact_pii:
        patterns.extend(
            [
                r"\b\d{4}-\d{4}-\d{4}-\d{4}\b",  # Credit cards with dashes
                r"\b\d{16}\b",  # Credit cards without dashes
                r"\b\d{3}-\d{2}-\d{4}\b",  # SSN
                r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b",  # Email
            ]
        )
        fields.extend(["password", "api_key", "secret", "token", "ssn", "credit_card"])

    if redact_patterns:
        patterns.extend(redact_patterns)
    if redact_fields:
        fields.extend(redact_fields)

    # Remove include_redaction_summary from kwargs if present to avoid duplicate
    kwargs_clean = {k: v for k, v in kwargs.items() if k != "include_redaction_summary"}

    config = SerializationConfig(
        redact_patterns=patterns,
        redact_fields=fields,
        include_redaction_summary=True,
        # Keep normal max_depth to maintain security
        **kwargs_clean,
    )

    # Directly call serialize - serializer handles circular references properly
    from .core_new import serialize

    return serialize(obj, config=config)
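
A hedged sketch of combining the built-in PII rules with the custom redaction options documented above; the record contents, extra field name, and regex are illustrative only.

>>> import datason
>>> record = {"name": "John", "ssn": "123-45-6789", "phone": "555-0100"}
>>> safe = datason.dump_secure(
...     record,
...     redact_fields=["phone"],             # extra field name to redact
...     redact_patterns=[r"\b555-\d{4}\b"],  # extra regex pattern to redact
... )
>>> # "ssn" should be caught by the built-in PII patterns; "phone" by the extra rules above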

datason.dump_fast(obj: Any, **kwargs: Any) -> Any

Performance-optimized serialization.

Optimized for speed with minimal type checking and validation. Use when you need maximum performance and can accept some trade-offs in type fidelity.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| obj | Any | Object to serialize quickly | required |
| **kwargs | Any | Additional configuration options | {} |

Returns:

| Type | Description |
| --- | --- |
| Any | Serialized object optimized for speed |

Example

>>> # For high-throughput scenarios
>>> result = dump_fast(large_dataset)

Source code in datason/api.py
def dump_fast(obj: Any, **kwargs: Any) -> Any:
    """Performance-optimized serialization.

    Optimized for speed with minimal type checking and validation.
    Use when you need maximum performance and can accept some trade-offs
    in type fidelity.

    Args:
        obj: Object to serialize quickly
        **kwargs: Additional configuration options

    Returns:
        Serialized object optimized for speed

    Example:
        >>> # For high-throughput scenarios
        >>> result = dump_fast(large_dataset)
    """
    config = get_performance_config()
    return serialize(obj, config=config)

datason.dump_chunked(obj: Any, *, chunk_size: int = 1000, **kwargs: Any) -> Any

Chunked serialization for large objects.

Breaks large objects into manageable chunks for memory efficiency and streaming processing.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| obj | Any | Large object to serialize in chunks | required |
| chunk_size | int | Size of each chunk | 1000 |
| **kwargs | Any | Additional configuration options | {} |

Returns:

| Type | Description |
| --- | --- |
| Any | ChunkedSerializationResult with metadata and chunks |

Example

>>> big_list = list(range(10000))
>>> result = dump_chunked(big_list, chunk_size=1000)
>>> # Returns ChunkedSerializationResult with 10 chunks

Source code in datason/api.py
def dump_chunked(obj: Any, *, chunk_size: int = 1000, **kwargs: Any) -> Any:
    """Chunked serialization for large objects.

    Breaks large objects into manageable chunks for memory efficiency
    and streaming processing.

    Args:
        obj: Large object to serialize in chunks
        chunk_size: Size of each chunk
        **kwargs: Additional configuration options

    Returns:
        ChunkedSerializationResult with metadata and chunks

    Example:
        >>> big_list = list(range(10000))
        >>> result = dump_chunked(big_list, chunk_size=1000)
        >>> # Returns ChunkedSerializationResult with 10 chunks
    """
    return serialize_chunked(obj, chunk_size=chunk_size, **kwargs)
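
Illustrative follow-up, assuming the ChunkedSerializationResult helpers shown for serialize_chunked() further down (to_list(), save_to_file()) apply to the result returned here.

>>> import datason
>>> big_list = list(range(10_000))
>>> chunks = datason.dump_chunked(big_list, chunk_size=1000).to_list()  # materialize all 10 chunks
>>> result = datason.dump_chunked(big_list, chunk_size=1000)            # fresh result: chunks are an iterator
>>> result.save_to_file("big_list.jsonl", format="jsonl")               # stream chunks to disk instead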

datason.stream_dump(file_path: str, **kwargs: Any) -> Any

Streaming serialization to file.

Efficiently serialize large datasets directly to file without loading everything into memory.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| file_path | str | Path to output file | required |
| **kwargs | Any | Additional configuration options | {} |

Returns:

| Type | Description |
| --- | --- |
| Any | StreamingSerializer instance for continued operations |

Example

>>> with stream_dump("output.jsonl") as streamer:
>>>     for item in large_dataset:
>>>         streamer.write(item)

Source code in datason/api.py
def stream_dump(file_path: str, **kwargs: Any) -> Any:
    """Streaming serialization to file.

    Efficiently serialize large datasets directly to file without
    loading everything into memory.

    Args:
        file_path: Path to output file
        **kwargs: Additional configuration options

    Returns:
        StreamingSerializer instance for continued operations

    Example:
        >>> with stream_dump("output.jsonl") as streamer:
        >>>     for item in large_dataset:
        >>>         streamer.write(item)
    """
    return stream_serialize(file_path, **kwargs)
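
A minimal streaming sketch based on the context-manager usage documented above; the generator of records is illustrative.

>>> import datason
>>> records = ({"id": i, "score": i * 0.5} for i in range(100_000))
>>> with datason.stream_dump("scores.jsonl") as stream:
...     for record in records:
...         stream.write(record)  # each record is serialized and written incrementally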

Deserialization Functions

datason.load_basic(data: Any, **kwargs: Any) -> Any

Basic deserialization using heuristics only.

Uses simple heuristics to reconstruct Python objects from serialized data. Fast but with limited type fidelity - suitable for exploration and non-critical applications.

Success rate: ~60-70% for complex objects
Speed: Fastest
Use case: Data exploration, simple objects

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| data | Any | Serialized data to deserialize | required |
| **kwargs | Any | Additional options (parse_dates, parse_uuids, etc.) | {} |

Returns:

| Type | Description |
| --- | --- |
| Any | Deserialized Python object |

Example

>>> serialized = {"numbers": [1, 2, 3], "text": "hello"}
>>> result = load_basic(serialized)
>>> # Works well for simple structures

Source code in datason/api.py
def load_basic(data: Any, **kwargs: Any) -> Any:
    """Basic deserialization using heuristics only.

    Uses simple heuristics to reconstruct Python objects from serialized data.
    Fast but with limited type fidelity - suitable for exploration and
    non-critical applications.

    Success rate: ~60-70% for complex objects
    Speed: Fastest
    Use case: Data exploration, simple objects

    Args:
        data: Serialized data to deserialize
        **kwargs: Additional options (parse_dates, parse_uuids, etc.)

    Returns:
        Deserialized Python object

    Example:
        >>> serialized = {"numbers": [1, 2, 3], "text": "hello"}
        >>> result = load_basic(serialized)
        >>> # Works well for simple structures
    """
    return deserialize(data, **kwargs)

datason.load_smart(data: Any, config: Optional[SerializationConfig] = None, **kwargs: Any) -> Any

Smart deserialization with auto-detection and heuristics.

Combines automatic type detection with heuristic fallbacks. Good balance of accuracy and performance for most use cases.

Success rate: ~80-90% for complex objects
Speed: Moderate
Use case: General purpose, production data processing

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| data | Any | Serialized data to deserialize | required |
| config | Optional[SerializationConfig] | Configuration for deserialization behavior | None |
| **kwargs | Any | Additional options | {} |

Returns:

| Type | Description |
| --- | --- |
| Any | Deserialized Python object with improved type fidelity |

Example

>>> serialized = dump_api(complex_object)
>>> result = load_smart(serialized)
>>> # Better type reconstruction than load_basic

Source code in datason/api.py
def load_smart(data: Any, config: Optional[SerializationConfig] = None, **kwargs: Any) -> Any:
    """Smart deserialization with auto-detection and heuristics.

    Combines automatic type detection with heuristic fallbacks.
    Good balance of accuracy and performance for most use cases.

    Success rate: ~80-90% for complex objects
    Speed: Moderate
    Use case: General purpose, production data processing

    Args:
        data: Serialized data to deserialize
        config: Configuration for deserialization behavior
        **kwargs: Additional options

    Returns:
        Deserialized Python object with improved type fidelity

    Example:
        >>> serialized = dump_api(complex_object)
        >>> result = load_smart(serialized)
        >>> # Better type reconstruction than load_basic
    """
    if config is None:
        config = SerializationConfig(auto_detect_types=True)
    return deserialize_fast(data, config=config, **kwargs)
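
A small round-trip sketch pairing dump_api() with load_smart(); the payload is illustrative, and exact reconstruction depends on the heuristics described above.

>>> import datason
>>> from datetime import datetime, timezone
>>> payload = {"created": datetime(2024, 1, 1, tzinfo=timezone.utc), "tags": ["a", "b"]}
>>> serialized = datason.dump_api(payload)     # clean, JSON-safe structure with ISO dates
>>> restored = datason.load_smart(serialized)  # auto-detection should restore the datetime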

datason.load_perfect(data: Any, template: Any, **kwargs: Any) -> Any

Perfect deserialization using template matching.

Uses a template object to achieve 100% accurate reconstruction. Requires you to provide the structure/type information but guarantees perfect fidelity.

Success rate: 100% when template matches data
Speed: Fast (direct template matching)
Use case: Critical applications, ML model loading, exact reconstruction

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| data | Any | Serialized data to deserialize | required |
| template | Any | Template object showing expected structure/types | required |
| **kwargs | Any | Additional options | {} |

Returns:

| Type | Description |
| --- | --- |
| Any | Perfectly reconstructed Python object matching template |

Example

>>> original = MyComplexClass(...)
>>> serialized = dump_ml(original)
>>> template = MyComplexClass.get_template()  # or original itself
>>> result = load_perfect(serialized, template)
>>> # Guaranteed perfect reconstruction

Source code in datason/api.py
def load_perfect(data: Any, template: Any, **kwargs: Any) -> Any:
    """Perfect deserialization using template matching.

    Uses a template object to achieve 100% accurate reconstruction.
    Requires you to provide the structure/type information but
    guarantees perfect fidelity.

    Success rate: 100% when template matches data
    Speed: Fast (direct template matching)
    Use case: Critical applications, ML model loading, exact reconstruction

    Args:
        data: Serialized data to deserialize
        template: Template object showing expected structure/types
        **kwargs: Additional options

    Returns:
        Perfectly reconstructed Python object matching template

    Example:
        >>> original = MyComplexClass(...)
        >>> serialized = dump_ml(original)
        >>> template = MyComplexClass.get_template()  # or original itself
        >>> result = load_perfect(serialized, template)
        >>> # Guaranteed perfect reconstruction
    """
    return deserialize_with_template(data, template, **kwargs)
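
A hedged round-trip sketch using the original object itself as the template, as the docstring above suggests; the payload is illustrative.

>>> import datason
>>> from datetime import datetime
>>> original = {"weights": [0.1, 0.2, 0.7], "trained_at": datetime.now()}
>>> serialized = datason.dump_ml(original)
>>> restored = datason.load_perfect(serialized, original)  # template guides exact reconstruction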

datason.load_typed(data: Any, config: Optional[SerializationConfig] = None, **kwargs: Any) -> Any

Metadata-based type reconstruction.

Uses embedded type metadata from serialization to reconstruct objects. Requires data was serialized with type information preserved.

Success rate: ~95% when metadata available
Speed: Fast (direct metadata lookup)
Use case: When you control both serialization and deserialization

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| data | Any | Serialized data with embedded type metadata | required |
| config | Optional[SerializationConfig] | Configuration for type reconstruction | None |
| **kwargs | Any | Additional options | {} |

Returns:

| Type | Description |
| --- | --- |
| Any | Type-accurate deserialized Python object |

Example

>>> # Works best with datason-serialized data
>>> serialized = dump(original_object)  # Preserves type info
>>> result = load_typed(serialized)
>>> # High fidelity reconstruction using embedded metadata

Source code in datason/api.py
def load_typed(data: Any, config: Optional[SerializationConfig] = None, **kwargs: Any) -> Any:
    """Metadata-based type reconstruction.

    Uses embedded type metadata from serialization to reconstruct objects.
    Requires data was serialized with type information preserved.

    Success rate: ~95% when metadata available
    Speed: Fast (direct metadata lookup)
    Use case: When you control both serialization and deserialization

    Args:
        data: Serialized data with embedded type metadata
        config: Configuration for type reconstruction
        **kwargs: Additional options

    Returns:
        Type-accurate deserialized Python object

    Example:
        >>> # Works best with datason-serialized data
        >>> serialized = dump(original_object)  # Preserves type info
        >>> result = load_typed(serialized)
        >>> # High fidelity reconstruction using embedded metadata
    """
    if config is None:
        config = get_strict_config()  # Use strict config for best type preservation
    return deserialize_fast(data, config=config, **kwargs)
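
An illustrative pairing with dump_ml(), whose ML preset enables include_type_hints (see get_ml_config() below), so type metadata should be available for load_typed() to consume.

>>> import datason
>>> import uuid
>>> from datetime import datetime
>>> original = {"run_id": uuid.uuid4(), "started": datetime.now()}
>>> serialized = datason.dump_ml(original)     # embeds type metadata
>>> restored = datason.load_typed(serialized)  # values should come back as UUID/datetime, not strings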

Utility Functions

datason.dumps(obj: Any, **kwargs: Any) -> Any

Enhanced serialization returning dict (DataSON's smart default).

This is DataSON's enhanced API that returns a dict with smart type handling, datetime parsing, ML support, and other advanced features.

For JSON string output or stdlib compatibility, use datason.json.dumps() or dumps_json().

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| obj | Any | Object to serialize | required |
| **kwargs | Any | DataSON configuration options | {} |

Returns:

| Type | Description |
| --- | --- |
| Any | Serialized dict with enhanced type handling |

Examples:

>>> obj = {"timestamp": datetime.now(), "data": [1, 2, 3]}
>>> result = dumps(obj)  # Returns dict with smart datetime handling
>>> # For JSON string compatibility:
>>> import datason.json as json
>>> json_str = json.dumps(obj)  # Returns JSON string
Source code in datason/api.py
def dumps(obj: Any, **kwargs: Any) -> Any:
    """Enhanced serialization returning dict (DataSON's smart default).

    This is DataSON's enhanced API that returns a dict with smart type handling,
    datetime parsing, ML support, and other advanced features.

    For JSON string output or stdlib compatibility, use datason.json.dumps() or dumps_json().

    Args:
        obj: Object to serialize
        **kwargs: DataSON configuration options

    Returns:
        Serialized dict with enhanced type handling

    Examples:
        >>> obj = {"timestamp": datetime.now(), "data": [1, 2, 3]}
        >>> result = dumps(obj)  # Returns dict with smart datetime handling

        >>> # For JSON string compatibility:
        >>> import datason.json as json
        >>> json_str = json.dumps(obj)  # Returns JSON string
    """
    # Use enhanced serialization with smart defaults
    return serialize(obj, **kwargs)

datason.loads(s: str, **kwargs: Any) -> Any

Enhanced JSON string deserialization (DataSON's smart default).

This provides smart deserialization with datetime parsing, type reconstruction, and other DataSON enhancements. For stdlib json.loads() compatibility, use datason.json.loads() or loads_json().

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| s | str | JSON string to deserialize | required |
| **kwargs | Any | DataSON configuration options | {} |

Returns:

| Type | Description |
| --- | --- |
| Any | Deserialized Python object with enhanced type handling |

Example

>>> json_str = '{"timestamp": "2024-01-01T00:00:00Z", "data": [1, 2, 3]}'
>>> result = loads(json_str)  # Smart parsing with datetime handling

>>> # For JSON compatibility:
>>> import datason.json as json
>>> result = json.loads(json_str)  # Exact json.loads() behavior

Source code in datason/api.py
def loads(s: str, **kwargs: Any) -> Any:
    """Enhanced JSON string deserialization (DataSON's smart default).

    This provides smart deserialization with datetime parsing, type reconstruction,
    and other DataSON enhancements. For stdlib json.loads() compatibility,
    use datason.json.loads() or loads_json().

    Args:
        s: JSON string to deserialize
        **kwargs: DataSON configuration options

    Returns:
        Deserialized Python object with enhanced type handling

    Example:
        >>> json_str = '{"timestamp": "2024-01-01T00:00:00Z", "data": [1, 2, 3]}'
        >>> result = loads(json_str)  # Smart parsing with datetime handling

        >>> # For JSON compatibility:
        >>> import datason.json as json
        >>> result = json.loads(json_str)  # Exact json.loads() behavior
    """
    import json

    # Parse with standard json, then enhance with smart processing
    data = json.loads(s)
    return load_smart(data, **kwargs)
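
A quick sketch of the enhanced parsing path, reusing the docstring's example string; the exact types restored depend on load_smart()'s heuristics.

>>> import datason
>>> json_str = '{"timestamp": "2024-01-01T00:00:00Z", "data": [1, 2, 3]}'
>>> result = datason.loads(json_str)
>>> # result["timestamp"] should be a datetime object; result["data"] stays [1, 2, 3]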

datason.help_api() -> Dict[str, Any]

Get help on choosing the right API function.

Returns:

| Type | Description |
| --- | --- |
| Dict[str, Any] | Dictionary with API guidance and function recommendations |

Example

>>> help_info = help_api()
>>> print(help_info['recommendations'])

Source code in datason/api.py
def help_api() -> Dict[str, Any]:
    """Get help on choosing the right API function.

    Returns:
        Dictionary with API guidance and function recommendations

    Example:
        >>> help_info = help_api()
        >>> print(help_info['recommendations'])
    """
    return {
        "serialization": {
            "basic": {"function": "dump()", "use_case": "General purpose serialization", "example": "dump(data)"},
            "ml_optimized": {
                "function": "dump_ml()",
                "use_case": "ML models, tensors, NumPy arrays",
                "example": "dump_ml(sklearn_model)",
            },
            "api_safe": {
                "function": "dump_api()",
                "use_case": "Web APIs, clean JSON output",
                "example": "dump_api(response_data)",
            },
            "secure": {
                "function": "dump_secure()",
                "use_case": "Sensitive data with PII redaction",
                "example": "dump_secure(user_data, redact_pii=True)",
            },
            "performance": {
                "function": "dump_fast()",
                "use_case": "High-throughput scenarios",
                "example": "dump_fast(large_dataset)",
            },
            "chunked": {
                "function": "dump_chunked()",
                "use_case": "Very large objects, memory efficiency",
                "example": "dump_chunked(huge_list, chunk_size=1000)",
            },
        },
        "deserialization": {
            "basic": {
                "function": "load_basic()",
                "success_rate": "60-70%",
                "speed": "Fastest",
                "use_case": "Simple objects, data exploration",
            },
            "smart": {
                "function": "load_smart()",
                "success_rate": "80-90%",
                "speed": "Moderate",
                "use_case": "General purpose, production data",
            },
            "perfect": {
                "function": "load_perfect()",
                "success_rate": "100%",
                "speed": "Fast",
                "use_case": "Critical applications, requires template",
                "example": "load_perfect(data, template)",
            },
            "typed": {
                "function": "load_typed()",
                "success_rate": "95%",
                "speed": "Fast",
                "use_case": "When metadata available",
            },
        },
        "file_operations": {
            "save_ml": {
                "function": "save_ml()",
                "use_case": "ML models/data to JSON/JSONL files",
                "examples": [
                    "save_ml(model, 'model.json')    # Single JSON object",
                    "save_ml(model, 'model.jsonl')   # Multiple JSONL objects",
                    "save_ml(model, 'model.txt', format='json')  # Explicit format",
                ],
            },
            "save_secure": {
                "function": "save_secure()",
                "use_case": "Secure JSON/JSONL with redaction",
                "examples": [
                    "save_secure(data, 'secure.json', redact_pii=True)",
                    "save_secure(data, 'secure.jsonl', redact_pii=True)",
                ],
            },
            "load_file": {
                "function": "load_smart_file()",
                "use_case": "Smart loading from JSON/JSONL files",
                "examples": [
                    "list(load_smart_file('data.json'))",
                    "list(load_smart_file('data.jsonl'))",
                    "list(load_smart_file('data.txt', format='json'))",
                ],
            },
        },
        "recommendations": [
            "For ML workflows: save_ml() + load_perfect_file() with template",
            "For APIs: save_api() + load_smart_file()",
            "For sensitive data: save_secure() + load_smart_file()",
            "For exploration: dump() + load_basic()",
            "For production: save_ml() + load_smart_file()",
        ],
    }

datason.get_api_info() -> Dict[str, Any]

Get information about the modern API.

Returns:

| Type | Description |
| --- | --- |
| Dict[str, Any] | Dictionary with API version and feature information |

Source code in datason/api.py
def get_api_info() -> Dict[str, Any]:
    """Get information about the modern API.

    Returns:
        Dictionary with API version and feature information
    """
    return {
        "api_version": "modern",
        "phase": "3",
        "features": {
            "intention_revealing_names": True,
            "compositional_utilities": True,
            "domain_specific_convenience": True,
            "progressive_complexity": True,
            "backward_compatibility": True,
            "file_operations": True,
        },
        "dump_functions": ["dump", "dump_ml", "dump_api", "dump_secure", "dump_fast", "dump_chunked", "stream_dump"],
        "load_functions": ["load_basic", "load_smart", "load_perfect", "load_typed"],
        "file_functions": ["save_ml", "save_secure", "save_api", "load_smart_file", "load_perfect_file"],
        "convenience": ["loads", "dumps"],
        "help": ["help_api", "get_api_info"],
    }

📋 Traditional API Functions

Core Functions

datason.serialize(obj: Any, config: Any = None, **kwargs: Any) -> Any

Serialize an object (DEPRECATED - use dump/dumps instead).

DEPRECATION WARNING: Direct use of serialize() is discouraged. Use the clearer API functions instead:

- dump(obj, file) - write to file (like json.dump)
- dumps(obj) - convert to string (like json.dumps)
- serialize_enhanced(obj, **options) - enhanced serialization with clear options

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| obj | Any | Object to serialize | required |
| config | Any | Optional configuration | None |
| **kwargs | Any | Additional options | {} |

Returns:

| Type | Description |
| --- | --- |
| Any | Serialized object |

Source code in datason/__init__.py
def serialize(obj: Any, config: Any = None, **kwargs: Any) -> Any:
    """Serialize an object (DEPRECATED - use dump/dumps instead).

    DEPRECATION WARNING: Direct use of serialize() is discouraged.
    Use the clearer API functions instead:
    - dump(obj, file) - write to file (like json.dump)
    - dumps(obj) - convert to string (like json.dumps)
    - serialize_enhanced(obj, **options) - enhanced serialization with clear options

    Args:
        obj: Object to serialize
        config: Optional configuration
        **kwargs: Additional options

    Returns:
        Serialized object
    """
    import warnings

    warnings.warn(
        "serialize() is deprecated. Use dump/dumps for JSON compatibility or "
        "serialize_enhanced() for advanced features. Direct serialize() will be "
        "removed in a future version.",
        DeprecationWarning,
        stacklevel=2,
    )
    return _serialize_core(obj, config, **kwargs)

datason.deserialize(obj: Any, parse_dates: bool = True, parse_uuids: bool = True) -> Any

Recursively deserialize JSON-compatible data back to Python objects.

Attempts to intelligently restore datetime objects, UUIDs, and other types that were serialized to strings by the serialize function.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| obj | Any | The JSON-compatible object to deserialize | required |
| parse_dates | bool | Whether to attempt parsing ISO datetime strings back to datetime objects | True |
| parse_uuids | bool | Whether to attempt parsing UUID strings back to UUID objects | True |

Returns:

| Type | Description |
| --- | --- |
| Any | Python object with restored types where possible |

Examples:

>>> data = {"date": "2023-01-01T12:00:00", "id": "12345678-1234-5678-9012-123456789abc"}
>>> deserialize(data)
{"date": datetime(2023, 1, 1, 12, 0), "id": UUID('12345678-1234-5678-9012-123456789abc')}
Source code in datason/deserializers_new.py
def deserialize(obj: Any, parse_dates: bool = True, parse_uuids: bool = True) -> Any:
    """Recursively deserialize JSON-compatible data back to Python objects.

    Attempts to intelligently restore datetime objects, UUIDs, and other types
    that were serialized to strings by the serialize function.

    Args:
        obj: The JSON-compatible object to deserialize
        parse_dates: Whether to attempt parsing ISO datetime strings back to datetime objects
        parse_uuids: Whether to attempt parsing UUID strings back to UUID objects

    Returns:
        Python object with restored types where possible

    Examples:
        >>> data = {"date": "2023-01-01T12:00:00", "id": "12345678-1234-5678-9012-123456789abc"}
        >>> deserialize(data)
        {"date": datetime(2023, 1, 1, 12, 0), "id": UUID('12345678-1234-5678-9012-123456789abc')}
    """
    # ==================================================================================
    # IDEMPOTENCY CHECKS: Prevent double deserialization
    # ==================================================================================

    # IDEMPOTENCY CHECK 1: Check if object is already in final deserialized form
    if _is_already_deserialized(obj):
        return obj

    if obj is None:
        return None

    # NEW: Handle type metadata for round-trip serialization
    if isinstance(obj, dict) and TYPE_METADATA_KEY in obj:
        return _deserialize_with_type_metadata(obj)

    # Handle basic types (already in correct format)
    if isinstance(obj, (int, float, bool)):
        return obj

    # Handle strings - attempt intelligent parsing
    if isinstance(obj, str):
        # Try to parse as UUID first (more specific pattern)
        if parse_uuids and _looks_like_uuid(obj):
            try:
                import uuid as uuid_module  # Fresh import to avoid state issues

                return uuid_module.UUID(obj)
            except (ValueError, ImportError):
                # Log parsing failure but continue with string
                warnings.warn(f"Failed to parse UUID string: {obj}", stacklevel=2)

        # Try to parse as datetime if enabled
        if parse_dates and _looks_like_datetime(obj):
            try:
                import sys
                from datetime import datetime as datetime_class  # Fresh import

                # Handle 'Z' timezone suffix for Python < 3.11
                date_str = obj.replace("Z", "+00:00") if obj.endswith("Z") and sys.version_info < (3, 11) else obj
                return datetime_class.fromisoformat(date_str)
            except (ValueError, ImportError):
                # Log parsing failure but continue with string
                warnings.warn(
                    f"Failed to parse datetime string: {obj[:50]}{'...' if len(obj) > 50 else ''}",
                    stacklevel=2,
                )

        # Return as string if no parsing succeeded
        return obj

    # Handle lists
    if isinstance(obj, list):
        return [deserialize(item, parse_dates, parse_uuids) for item in obj]

    # Handle dictionaries
    if isinstance(obj, dict):
        return {k: deserialize(v, parse_dates, parse_uuids) for k, v in obj.items()}

    # For any other type, return as-is
    return obj
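
A short sketch of the parse flags; the input dict mirrors the docstring example, and turning the flags off simply leaves the strings untouched.

>>> from datason import deserialize
>>> raw = {"date": "2023-01-01T12:00:00", "id": "12345678-1234-5678-9012-123456789abc"}
>>> deserialize(raw)                                        # datetime and UUID strings restored
>>> deserialize(raw, parse_dates=False, parse_uuids=False)  # returned as plain strings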

datason.auto_deserialize(obj: Any, aggressive: bool = False, config: Optional[SerializationConfig] = None) -> Any

NEW: Intelligent auto-detection deserialization with heuristics.

Uses pattern recognition and heuristics to automatically detect and restore complex data types without explicit configuration.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| obj | Any | JSON-compatible object to deserialize | required |
| aggressive | bool | Whether to use aggressive type detection (may have false positives) | False |
| config | Optional[SerializationConfig] | Configuration object to control deserialization behavior | None |

Returns:

| Type | Description |
| --- | --- |
| Any | Python object with auto-detected types restored |

Examples:

>>> data = {"records": [{"a": 1, "b": 2}, {"a": 3, "b": 4}]}
>>> auto_deserialize(data, aggressive=True)
{"records": DataFrame(...)}  # May detect as DataFrame
>>> # API-compatible UUID handling
>>> from datason.config import get_api_config
>>> auto_deserialize("12345678-1234-5678-9012-123456789abc", config=get_api_config())
"12345678-1234-5678-9012-123456789abc"  # Stays as string
Source code in datason/deserializers_new.py
def auto_deserialize(obj: Any, aggressive: bool = False, config: Optional["SerializationConfig"] = None) -> Any:
    """NEW: Intelligent auto-detection deserialization with heuristics.

    Uses pattern recognition and heuristics to automatically detect and restore
    complex data types without explicit configuration.

    Args:
        obj: JSON-compatible object to deserialize
        aggressive: Whether to use aggressive type detection (may have false positives)
        config: Configuration object to control deserialization behavior

    Returns:
        Python object with auto-detected types restored

    Examples:
        >>> data = {"records": [{"a": 1, "b": 2}, {"a": 3, "b": 4}]}
        >>> auto_deserialize(data, aggressive=True)
        {"records": DataFrame(...)}  # May detect as DataFrame

        >>> # API-compatible UUID handling
        >>> from datason.config import get_api_config
        >>> auto_deserialize("12345678-1234-5678-9012-123456789abc", config=get_api_config())
        "12345678-1234-5678-9012-123456789abc"  # Stays as string
    """
    # ==================================================================================
    # IDEMPOTENCY CHECKS: Prevent double deserialization
    # ==================================================================================

    # IDEMPOTENCY CHECK 1: Check if object is already in final deserialized form
    if _is_already_deserialized(obj):
        return obj

    if obj is None:
        return None

    # Get default config if none provided
    if config is None and _config_available:
        config = get_default_config()

    # Handle type metadata first
    if isinstance(obj, dict) and TYPE_METADATA_KEY in obj:
        return _deserialize_with_type_metadata(obj)

    # Handle basic types
    if isinstance(obj, (int, float, bool)):
        return obj

    # Handle strings with auto-detection
    if isinstance(obj, str):
        return _auto_detect_string_type(obj, aggressive, config)

    # Handle lists with auto-detection
    if isinstance(obj, list):
        deserialized_list = [auto_deserialize(item, aggressive, config) for item in obj]

        if aggressive and pd is not None and _looks_like_series_data(deserialized_list):
            # Try to detect if this should be a pandas Series or DataFrame
            try:
                return pd.Series(deserialized_list)
            except Exception:  # nosec B110
                pass

        return deserialized_list

    # Handle dictionaries with auto-detection
    if isinstance(obj, dict):
        # Check for pandas DataFrame patterns first
        if aggressive and pd is not None and _looks_like_dataframe_dict(obj):
            try:
                return _reconstruct_dataframe(obj)
            except Exception:  # nosec B110
                pass

        # Check for pandas split format
        if pd is not None and _looks_like_split_format(obj):
            try:
                return _reconstruct_from_split(obj)
            except Exception:  # nosec B110
                pass

        # Standard dictionary deserialization
        return {k: auto_deserialize(v, aggressive, config) for k, v in obj.items()}

    return obj

datason.safe_deserialize(json_str: str, allow_pickle: bool = False, **kwargs: Any) -> Any

Safely deserialize a JSON string, handling parse errors gracefully.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| json_str | str | JSON string to parse and deserialize | required |
| allow_pickle | bool | Whether to allow deserialization of pickle-serialized objects | False |
| **kwargs | Any | Arguments passed to deserialize() | {} |

Returns:

| Type | Description |
| --- | --- |
| Any | Deserialized Python object, or the original string if parsing fails |

Raises:

| Type | Description |
| --- | --- |
| DeserializationSecurityError | If pickle data is detected and allow_pickle=False |

Source code in datason/deserializers_new.py
def safe_deserialize(json_str: str, allow_pickle: bool = False, **kwargs: Any) -> Any:
    """Safely deserialize a JSON string, handling parse errors gracefully.

    Args:
        json_str: JSON string to parse and deserialize
        allow_pickle: Whether to allow deserialization of pickle-serialized objects
        **kwargs: Arguments passed to deserialize()

    Returns:
        Deserialized Python object, or the original string if parsing fails

    Raises:
        DeserializationSecurityError: If pickle data is detected and allow_pickle=False
    """
    import json

    try:
        parsed = json.loads(json_str)

        # Security check for pickle data
        if not allow_pickle and _contains_pickle_data(parsed):
            raise DeserializationSecurityError(
                "Detected pickle-serialized objects which are unsafe to deserialize. "
                "Set allow_pickle=True to override this security check."
            )

        return deserialize(parsed, **kwargs)
    except (json.JSONDecodeError, TypeError, ValueError):
        return json_str
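
A minimal sketch of the graceful-failure behavior shown in the source above: malformed JSON is returned unchanged rather than raising.

>>> from datason import safe_deserialize
>>> safe_deserialize('{"date": "2023-01-01T12:00:00"}')  # valid JSON: parsed, then deserialized
>>> safe_deserialize('not valid json {')                 # parse failure: the original string comes back
'not valid json {'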

Chunked & Streaming

datason.serialize_chunked(obj: Any, chunk_size: int = 1000, config: Optional[SerializationConfig] = None, memory_limit_mb: Optional[int] = None) -> ChunkedSerializationResult

Serialize large objects in memory-bounded chunks.

This function breaks large objects (lists, DataFrames, arrays) into smaller chunks to enable processing of datasets larger than available memory.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| obj | Any | Object to serialize (typically list, DataFrame, or array) | required |
| chunk_size | int | Number of items per chunk | 1000 |
| config | Optional[SerializationConfig] | Serialization configuration | None |
| memory_limit_mb | Optional[int] | Optional memory limit in MB (not enforced yet, for future use) | None |

Returns:

| Type | Description |
| --- | --- |
| ChunkedSerializationResult | ChunkedSerializationResult with iterator of serialized chunks |

Examples:

>>> large_list = list(range(10000))
>>> result = serialize_chunked(large_list, chunk_size=100)
>>> chunks = result.to_list()  # Get all chunks
>>> len(chunks)  # 100 chunks of 100 items each
100
>>> # Save directly to file without loading all chunks
>>> result = serialize_chunked(large_data, chunk_size=1000)
>>> result.save_to_file("large_data.jsonl", format="jsonl")
Source code in datason/core_new.py
def serialize_chunked(
    obj: Any,
    chunk_size: int = 1000,
    config: Optional["SerializationConfig"] = None,
    memory_limit_mb: Optional[int] = None,
) -> ChunkedSerializationResult:
    """Serialize large objects in memory-bounded chunks.

    This function breaks large objects (lists, DataFrames, arrays) into smaller chunks
    to enable processing of datasets larger than available memory.

    Args:
        obj: Object to serialize (typically list, DataFrame, or array)
        chunk_size: Number of items per chunk
        config: Serialization configuration
        memory_limit_mb: Optional memory limit in MB (not enforced yet, for future use)

    Returns:
        ChunkedSerializationResult with iterator of serialized chunks

    Examples:
        >>> large_list = list(range(10000))
        >>> result = serialize_chunked(large_list, chunk_size=100)
        >>> chunks = result.to_list()  # Get all chunks
        >>> len(chunks)  # 100 chunks of 100 items each
        100

        >>> # Save directly to file without loading all chunks
        >>> result = serialize_chunked(large_data, chunk_size=1000)
        >>> result.save_to_file("large_data.jsonl", format="jsonl")
    """
    if config is None and _config_available:
        config = get_default_config()

    # Determine chunking strategy based on object type
    if isinstance(obj, (list, tuple)):
        return _chunk_sequence(obj, chunk_size, config)
    elif pd is not None and isinstance(obj, pd.DataFrame):
        return _chunk_dataframe(obj, chunk_size, config)
    elif np is not None and isinstance(obj, np.ndarray):
        return _chunk_numpy_array(obj, chunk_size, config)
    elif isinstance(obj, dict):
        return _chunk_dict(obj, chunk_size, config)
    else:
        # For non-chunkable objects, return single chunk
        single_chunk = serialize(obj, config)
        metadata = {
            "total_chunks": 1,
            "chunk_size": chunk_size,
            "object_type": type(obj).__name__,
            "chunking_strategy": "single_object",
        }
        return ChunkedSerializationResult(iter([single_chunk]), metadata)
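
An illustrative look at the chunking-strategy dispatch in the source above; the dict input and the expected chunk split are assumptions based on _chunk_dict dividing the items by chunk_size.

>>> import datason
>>> result = datason.serialize_chunked({"a": 1, "b": 2, "c": 3}, chunk_size=2)
>>> chunks = result.to_list()  # expected: two chunks, roughly {"a": ..., "b": ...} and {"c": ...}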

datason.stream_serialize(file_path: Union[str, Path], config: Optional[SerializationConfig] = None, format: str = 'jsonl', buffer_size: int = 8192) -> StreamingSerializer

Create a streaming serializer context manager.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| file_path | Union[str, Path] | Path to output file | required |
| config | Optional[SerializationConfig] | Serialization configuration | None |
| format | str | Output format ('jsonl' or 'json') | 'jsonl' |
| buffer_size | int | Write buffer size in bytes | 8192 |

Returns:

| Type | Description |
| --- | --- |
| StreamingSerializer | StreamingSerializer context manager |

Examples:

>>> with stream_serialize("large_data.jsonl") as stream:
...     for item in large_dataset:
...         stream.write(item)
>>> # Or write chunked data
>>> with stream_serialize("massive_data.jsonl") as stream:
...     stream.write_chunked(massive_dataframe, chunk_size=1000)
Source code in datason/core_new.py
def stream_serialize(
    file_path: Union[str, Path],
    config: Optional["SerializationConfig"] = None,
    format: str = "jsonl",
    buffer_size: int = 8192,
) -> StreamingSerializer:
    """Create a streaming serializer context manager.

    Args:
        file_path: Path to output file
        config: Serialization configuration
        format: Output format ('jsonl' or 'json')
        buffer_size: Write buffer size in bytes

    Returns:
        StreamingSerializer context manager

    Examples:
        >>> with stream_serialize("large_data.jsonl") as stream:
        ...     for item in large_dataset:
        ...         stream.write(item)

        >>> # Or write chunked data
        >>> with stream_serialize("massive_data.jsonl") as stream:
        ...     stream.write_chunked(massive_dataframe, chunk_size=1000)
    """
    return StreamingSerializer(file_path, config, format, buffer_size)

datason.deserialize_chunked_file(file_path: Union[str, Path], format: str = 'jsonl', chunk_processor: Optional[Callable[[Any], Any]] = None) -> Generator[Any, None, None]

Deserialize a chunked file created with streaming serialization.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| file_path | Union[str, Path] | Path to the chunked file | required |
| format | str | File format ('jsonl' or 'json') | 'jsonl' |
| chunk_processor | Optional[Callable[[Any], Any]] | Optional function to process each chunk | None |

Yields:

| Type | Description |
| --- | --- |
| Any | Deserialized chunks from the file |

Examples:

>>> # Process chunks one at a time (memory efficient)
>>> for chunk in deserialize_chunked_file("large_data.jsonl"):
...     process_chunk(chunk)
>>> # Apply custom processing to each chunk
>>> def process_chunk(chunk):
...     return [item * 2 for item in chunk]
>>>
>>> processed_chunks = list(deserialize_chunked_file(
...     "data.jsonl",
...     chunk_processor=process_chunk
... ))
Source code in datason/core_new.py
def deserialize_chunked_file(
    file_path: Union[str, Path], format: str = "jsonl", chunk_processor: Optional[Callable[[Any], Any]] = None
) -> Generator[Any, None, None]:
    """Deserialize a chunked file created with streaming serialization.

    Args:
        file_path: Path to the chunked file
        format: File format ('jsonl' or 'json')
        chunk_processor: Optional function to process each chunk

    Yields:
        Deserialized chunks from the file

    Examples:
        >>> # Process chunks one at a time (memory efficient)
        >>> for chunk in deserialize_chunked_file("large_data.jsonl"):
        ...     process_chunk(chunk)

        >>> # Apply custom processing to each chunk
        >>> def process_chunk(chunk):
        ...     return [item * 2 for item in chunk]
        >>>
        >>> processed_chunks = list(deserialize_chunked_file(
        ...     "data.jsonl",
        ...     chunk_processor=process_chunk
        ... ))
    """
    import gzip

    file_path = Path(file_path)

    # Auto-detect gzip compression by checking magic number
    is_gzipped = False
    try:
        with file_path.open("rb") as f:
            magic = f.read(2)
            is_gzipped = magic == b"\x1f\x8b"
    except OSError:
        is_gzipped = False

    if format.lower() == "jsonl":
        # JSON Lines format - one object per line
        if is_gzipped:
            with gzip.open(file_path, "rt", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if line:
                        try:
                            chunk = json.loads(line)
                            if chunk_processor:
                                chunk = chunk_processor(chunk)
                            yield chunk
                        except json.JSONDecodeError as e:
                            warnings.warn(f"Invalid JSON line: {line[:100]}... Error: {e}", stacklevel=2)
                            continue
        else:
            with file_path.open("r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if line:
                        try:
                            chunk = json.loads(line)
                            if chunk_processor:
                                chunk = chunk_processor(chunk)
                            yield chunk
                        except json.JSONDecodeError as e:
                            warnings.warn(f"Invalid JSON line: {line[:100]}... Error: {e}", stacklevel=2)
                            continue

    elif format.lower() == "json":
        # JSON format with array
        if is_gzipped:
            with gzip.open(file_path, "rt", encoding="utf-8") as f:
                data = json.load(f)
        else:
            with file_path.open("r", encoding="utf-8") as f:
                data = json.load(f)

        # Handle different data structures
        if isinstance(data, list):
            # Direct list of items
            for chunk in data:
                if chunk_processor:
                    chunk = chunk_processor(chunk)
                yield chunk
        elif isinstance(data, dict):
            # Support both 'chunks' (from ChunkedSerializationResult) and 'data' (from StreamingSerializer)
            chunks = data.get("chunks", data.get("data", None))
            if chunks is not None:
                # This is a chunked data structure
                for chunk in chunks:
                    if chunk_processor:
                        chunk = chunk_processor(chunk)
                    yield chunk
            else:
                # This is a regular dict - treat as single item
                if chunk_processor:
                    data = chunk_processor(data)
                yield data
        else:
            # Single item
            if chunk_processor:
                data = chunk_processor(data)
            yield data

    else:
        raise ValueError(f"Unsupported format: {format}. Use 'jsonl' or 'json'")
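
A hedged round-trip sketch pairing stream_serialize() above with this reader; the record contents are illustrative.

>>> import datason
>>> with datason.stream_serialize("events.jsonl") as stream:
...     for i in range(5):
...         stream.write({"event": i})
>>> events = list(datason.deserialize_chunked_file("events.jsonl"))
>>> # one deserialized object per JSONL line written above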

datason.estimate_memory_usage(obj: Any, config: Optional[SerializationConfig] = None) -> Dict[str, Any]

Estimate memory usage for serializing an object.

This is a rough estimation to help users decide on chunking strategies.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| obj | Any | Object to analyze | required |
| config | Optional[SerializationConfig] | Serialization configuration | None |

Returns:

| Type | Description |
| --- | --- |
| Dict[str, Any] | Dictionary with memory usage estimates |

Examples:

>>> import pandas as pd
>>> df = pd.DataFrame({'a': range(10000), 'b': range(10000)})
>>> stats = estimate_memory_usage(df)
>>> print(f"Estimated serialized size: {stats['estimated_serialized_mb']:.1f} MB")
>>> print(f"Recommended chunk size: {stats['recommended_chunk_size']}")
Source code in datason/core_new.py
def estimate_memory_usage(obj: Any, config: Optional["SerializationConfig"] = None) -> Dict[str, Any]:
    """Estimate memory usage for serializing an object.

    This is a rough estimation to help users decide on chunking strategies.

    Args:
        obj: Object to analyze
        config: Serialization configuration

    Returns:
        Dictionary with memory usage estimates

    Examples:
        >>> import pandas as pd
        >>> df = pd.DataFrame({'a': range(10000), 'b': range(10000)})
        >>> stats = estimate_memory_usage(df)
        >>> print(f"Estimated serialized size: {stats['estimated_serialized_mb']:.1f} MB")
        >>> print(f"Recommended chunk size: {stats['recommended_chunk_size']}")
    """
    import sys

    # Get basic object size
    object_size_bytes = sys.getsizeof(obj)

    # Estimate based on object type
    if isinstance(obj, (list, tuple)) or pd is not None and isinstance(obj, pd.DataFrame):
        item_count = len(obj)
        estimated_item_size = object_size_bytes / max(item_count, 1)
    elif np is not None and isinstance(obj, np.ndarray):
        item_count = obj.shape[0] if obj.ndim > 0 else 1
        estimated_item_size = object_size_bytes / max(item_count, 1)
    elif isinstance(obj, dict):
        item_count = len(obj)
        estimated_item_size = object_size_bytes / max(item_count, 1)
    else:
        item_count = 1
        estimated_item_size = object_size_bytes

    # Serialization typically increases size by 1.5-3x for complex objects
    serialization_overhead = 2.0
    estimated_serialized_bytes = object_size_bytes * serialization_overhead

    # Recommend chunk size to keep chunks under 50MB
    target_chunk_size_mb = 50
    target_chunk_size_bytes = target_chunk_size_mb * 1024 * 1024

    if estimated_item_size > 0:
        recommended_chunk_size = max(1, int(target_chunk_size_bytes / (estimated_item_size * serialization_overhead)))
    else:
        recommended_chunk_size = 1000  # Default fallback

    return {
        "object_type": type(obj).__name__,
        "object_size_mb": object_size_bytes / (1024 * 1024),
        "estimated_serialized_mb": estimated_serialized_bytes / (1024 * 1024),
        "item_count": item_count,
        "estimated_item_size_bytes": estimated_item_size,
        "recommended_chunk_size": recommended_chunk_size,
        "recommended_chunks": max(1, item_count // recommended_chunk_size),
    }
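
A small sketch that feeds the estimate back into chunking; the sample data is illustrative and the recommendation is only a heuristic, as noted above.

>>> import datason
>>> big_list = [{"id": i, "payload": "x" * 100} for i in range(50_000)]
>>> stats = datason.estimate_memory_usage(big_list)
>>> result = datason.serialize_chunked(big_list, chunk_size=stats["recommended_chunk_size"])
>>> result.save_to_file("big_list.jsonl", format="jsonl")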

Configuration Functions

datason.get_ml_config() -> SerializationConfig

Get configuration optimized for ML workflows.

Returns:

| Type | Description |
| --- | --- |
| SerializationConfig | Configuration with aggressive type coercion and tensor-friendly settings |

Source code in datason/config.py
def get_ml_config() -> SerializationConfig:
    """Get configuration optimized for ML workflows.

    Returns:
        Configuration with aggressive type coercion and tensor-friendly settings
    """
    return SerializationConfig(
        date_format=DateFormat.UNIX_MS,
        dataframe_orient=DataFrameOrient.RECORDS,
        nan_handling=NanHandling.NULL,
        type_coercion=TypeCoercion.AGGRESSIVE,
        preserve_decimals=False,  # ML often doesn't need exact decimal precision
        preserve_complex=False,  # ML typically converts complex to real
        sort_keys=True,  # Consistent output for ML pipelines
        include_type_hints=True,  # Enable type metadata for ML objects
    )

datason.get_api_config() -> SerializationConfig

Get configuration optimized for API responses.

Returns:

| Type | Description |
| --- | --- |
| SerializationConfig | Configuration with clean, consistent output for web APIs |

Source code in datason/config.py
def get_api_config() -> SerializationConfig:
    """Get configuration optimized for API responses.

    Returns:
        Configuration with clean, consistent output for web APIs
    """
    return SerializationConfig(
        date_format=DateFormat.ISO,
        dataframe_orient=DataFrameOrient.RECORDS,
        nan_handling=NanHandling.NULL,
        type_coercion=TypeCoercion.SAFE,
        preserve_decimals=True,
        preserve_complex=True,
        sort_keys=True,
        ensure_ascii=True,  # Safe for all HTTP clients
        # NEW: Keep UUIDs as strings for API compatibility (Pydantic/FastAPI)
        uuid_format="string",
        parse_uuids=False,
    )

datason.get_strict_config() -> SerializationConfig

Get configuration with strict type checking.

Returns:

| Type | Description |
| --- | --- |
| SerializationConfig | Configuration that raises errors on unknown types |

Source code in datason/config.py
def get_strict_config() -> SerializationConfig:
    """Get configuration with strict type checking.

    Returns:
        Configuration that raises errors on unknown types
    """
    return SerializationConfig(
        date_format=DateFormat.ISO,
        dataframe_orient=DataFrameOrient.RECORDS,
        nan_handling=NanHandling.NULL,
        type_coercion=TypeCoercion.STRICT,
        preserve_decimals=True,
        preserve_complex=True,
    )

datason.get_performance_config() -> SerializationConfig

Get configuration optimized for performance.

Returns:

| Type | Description |
| --- | --- |
| SerializationConfig | Configuration with minimal processing for maximum speed |

Source code in datason/config.py
def get_performance_config() -> SerializationConfig:
    """Get configuration optimized for performance.

    Returns:
        Configuration with minimal processing for maximum speed
    """
    return SerializationConfig(
        date_format=DateFormat.UNIX,  # Fastest date format
        dataframe_orient=DataFrameOrient.VALUES,  # Fastest DataFrame format
        nan_handling=NanHandling.NULL,
        type_coercion=TypeCoercion.SAFE,
        preserve_decimals=False,  # Skip decimal preservation for speed
        preserve_complex=False,  # Skip complex preservation for speed
        sort_keys=False,  # Don't sort for speed
    )
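
The presets above are plain SerializationConfig instances, so the only remaining step is passing one to a serialization call. A minimal sketch follows; the payload is illustrative, and the config keyword on serialize() is assumed from its use elsewhere in this reference.

import datetime

import datason
from datason import get_ml_config, get_performance_config

payload = {"run_id": 7, "created": datetime.datetime.now(), "scores": [0.1, 0.2]}

# Each preset is a regular SerializationConfig; pick one per workload.
ml_config = get_ml_config()              # aggressive coercion, type hints enabled
fast_config = get_performance_config()   # minimal processing, unsorted keys

# Pass the chosen preset to a serialization call (config keyword assumed).
ml_payload = datason.serialize(payload, config=ml_config)
fast_payload = datason.serialize(payload, config=fast_config)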

Template Functions

datason.deserialize_with_template(obj: Any, template: Any, **kwargs: Any) -> Any

Convenience function for template-based deserialization.

Parameters:

Name Type Description Default
obj Any

Serialized object to deserialize

required
template Any

Template object to guide deserialization

required
**kwargs Any

Additional arguments for TemplateDeserializer

{}

Returns:

Type Description
Any

Deserialized object matching template structure

Examples:

>>> import pandas as pd
>>> template_df = pd.DataFrame({'a': [1], 'b': ['text']})
>>> serialized_data = [{'a': 2, 'b': 'hello'}, {'a': 3, 'b': 'world'}]
>>> result = deserialize_with_template(serialized_data, template_df)
>>> isinstance(result, pd.DataFrame)
True
>>> result.dtypes['a']  # Should match template
dtype('int64')
Source code in datason/deserializers_new.py
def deserialize_with_template(obj: Any, template: Any, **kwargs: Any) -> Any:
    """Convenience function for template-based deserialization.

    Args:
        obj: Serialized object to deserialize
        template: Template object to guide deserialization
        **kwargs: Additional arguments for TemplateDeserializer

    Returns:
        Deserialized object matching template structure

    Examples:
        >>> import pandas as pd
        >>> template_df = pd.DataFrame({'a': [1], 'b': ['text']})
        >>> serialized_data = [{'a': 2, 'b': 'hello'}, {'a': 3, 'b': 'world'}]
        >>> result = deserialize_with_template(serialized_data, template_df)
        >>> isinstance(result, pd.DataFrame)
        True
        >>> result.dtypes['a']  # Should match template
        dtype('int64')
    """
    deserializer = TemplateDeserializer(template, **kwargs)
    return deserializer.deserialize(obj)

datason.infer_template_from_data(data: Any, max_samples: int = 100) -> Any

Infer a template from sample data.

This function analyzes sample data to create a template that can be used for subsequent template-based deserialization.

Parameters:

Name Type Description Default
data Any

Sample data to analyze (list of records, DataFrame, etc.)

required
max_samples int

Maximum number of samples to analyze

100

Returns:

Type Description
Any

Inferred template object

Examples:

>>> sample_data = [
...     {'name': 'Alice', 'age': 30, 'date': '2023-01-01T10:00:00'},
...     {'name': 'Bob', 'age': 25, 'date': '2023-01-02T11:00:00'}
... ]
>>> template = infer_template_from_data(sample_data)
>>> # template will be a dict with expected types
Source code in datason/deserializers_new.py
def infer_template_from_data(data: Any, max_samples: int = 100) -> Any:
    """Infer a template from sample data.

    This function analyzes sample data to create a template that can be used
    for subsequent template-based deserialization.

    Args:
        data: Sample data to analyze (list of records, DataFrame, etc.)
        max_samples: Maximum number of samples to analyze

    Returns:
        Inferred template object

    Examples:
        >>> sample_data = [
        ...     {'name': 'Alice', 'age': 30, 'date': '2023-01-01T10:00:00'},
        ...     {'name': 'Bob', 'age': 25, 'date': '2023-01-02T11:00:00'}
        ... ]
        >>> template = infer_template_from_data(sample_data)
        >>> # template will be a dict with expected types
    """
    if isinstance(data, list) and data:
        # Analyze list of records
        return _infer_template_from_records(data[:max_samples])
    elif pd is not None and isinstance(data, pd.DataFrame):
        # Use DataFrame structure directly as template
        return data.iloc[: min(1, len(data))].copy()
    elif pd is not None and isinstance(data, pd.Series):
        # Use Series structure directly as template
        return data.iloc[: min(1, len(data))].copy()
    elif isinstance(data, dict):
        # Use single dict as template
        return data
    else:
        # Cannot infer meaningful template
        return data

datason.create_ml_round_trip_template(ml_object: Any) -> Dict[str, Any]

Create a template optimized for ML object round-trip serialization.

This function creates templates specifically designed for machine learning workflows where perfect round-trip fidelity is crucial.

Parameters:

Name Type Description Default
ml_object Any

ML object (model, dataset, etc.) to create template for

required

Returns:

Type Description
Dict[str, Any]

Template dictionary with ML-specific metadata

Examples:

>>> import sklearn.linear_model
>>> model = sklearn.linear_model.LogisticRegression()
>>> template = create_ml_round_trip_template(model)
>>> # template will include model structure, parameters, etc.
Source code in datason/deserializers_new.py
def create_ml_round_trip_template(ml_object: Any) -> Dict[str, Any]:
    """Create a template optimized for ML object round-trip serialization.

    This function creates templates specifically designed for machine learning
    workflows where perfect round-trip fidelity is crucial.

    Args:
        ml_object: ML object (model, dataset, etc.) to create template for

    Returns:
        Template dictionary with ML-specific metadata

    Examples:
        >>> import sklearn.linear_model
        >>> model = sklearn.linear_model.LogisticRegression()
        >>> template = create_ml_round_trip_template(model)
        >>> # template will include model structure, parameters, etc.
    """
    template = {
        "__ml_template__": True,
        "object_type": type(ml_object).__name__,
        "module": getattr(ml_object, "__module__", None),
    }

    # Handle pandas objects
    if pd is not None and isinstance(ml_object, pd.DataFrame):
        template.update(
            {
                "structure_type": "dataframe",
                "columns": list(ml_object.columns),
                "dtypes": {col: str(dtype) for col, dtype in ml_object.dtypes.items()},
                "index_name": ml_object.index.name,
                "shape": ml_object.shape,
            }
        )
    elif pd is not None and isinstance(ml_object, pd.Series):
        template.update(
            {
                "structure_type": "series",
                "dtype": str(ml_object.dtype),
                "name": ml_object.name,
                "index_name": ml_object.index.name,
                "length": len(ml_object),
            }
        )

    # Handle numpy arrays
    elif np is not None and isinstance(ml_object, np.ndarray):
        template.update(
            {
                "structure_type": "numpy_array",
                "shape": ml_object.shape,
                "dtype": str(ml_object.dtype),
                "fortran_order": np.isfortran(ml_object),
            }
        )

    # Handle sklearn models
    elif hasattr(ml_object, "get_params"):
        try:
            template.update(
                {
                    "structure_type": "sklearn_model",
                    "parameters": ml_object.get_params(),
                    "fitted": hasattr(ml_object, "classes_") or hasattr(ml_object, "coef_"),
                }
            )
        except Exception:
            pass  # nosec B110

    return template

🏗️ Classes & Data Types

Configuration Classes

datason.SerializationConfig(date_format: DateFormat = DateFormat.ISO, custom_date_format: Optional[str] = None, uuid_format: str = 'object', parse_uuids: bool = True, dataframe_orient: DataFrameOrient = DataFrameOrient.RECORDS, datetime_output: OutputType = OutputType.JSON_SAFE, series_output: OutputType = OutputType.JSON_SAFE, dataframe_output: OutputType = OutputType.JSON_SAFE, numpy_output: OutputType = OutputType.JSON_SAFE, nan_handling: NanHandling = NanHandling.NULL, type_coercion: TypeCoercion = TypeCoercion.SAFE, preserve_decimals: bool = True, preserve_complex: bool = True, max_depth: int = 50, max_size: int = 100000, max_string_length: int = 1000000, custom_serializers: Optional[Dict[type, Callable[[Any], Any]]] = None, sort_keys: bool = False, ensure_ascii: bool = False, check_if_serialized: bool = False, include_type_hints: bool = False, auto_detect_types: bool = False, redact_fields: Optional[List[str]] = None, redact_patterns: Optional[List[str]] = None, redact_large_objects: bool = False, redaction_replacement: str = '<REDACTED>', include_redaction_summary: bool = False, audit_trail: bool = False, cache_scope: CacheScope = CacheScope.OPERATION, cache_size_limit: int = 1000, cache_warn_on_limit: bool = True, cache_metrics_enabled: bool = False) dataclass

Configuration for datason serialization behavior.

Attributes:

Name Type Description
date_format DateFormat

How to format datetime objects

custom_date_format Optional[str]

Custom strftime format when date_format is CUSTOM

dataframe_orient DataFrameOrient

Pandas DataFrame orientation

datetime_output OutputType

How to output datetime objects

series_output OutputType

How to output pandas Series

dataframe_output OutputType

How to output pandas DataFrames (overrides orient for object output)

numpy_output OutputType

How to output numpy arrays

nan_handling NanHandling

How to handle NaN/null values

type_coercion TypeCoercion

Type coercion behavior

preserve_decimals bool

Whether to preserve decimal.Decimal precision

preserve_complex bool

Whether to preserve complex numbers as dict

max_depth int

Maximum recursion depth (security)

max_size int

Maximum collection size (security)

max_string_length int

Maximum string length (security)

custom_serializers Optional[Dict[type, Callable[[Any], Any]]]

Dict of type -> serializer function

sort_keys bool

Whether to sort dictionary keys in output

ensure_ascii bool

Whether to ensure ASCII output only

check_if_serialized bool

Skip processing if object is already JSON-safe

include_type_hints bool

Include type metadata for perfect round-trip deserialization

redact_fields Optional[List[str]]

Field patterns to redact (e.g., ["password", "api_key", "*.secret"])

redact_patterns Optional[List[str]]

Regex patterns to redact (e.g., credit card numbers)

redact_large_objects bool

Auto-redact objects >10MB

redaction_replacement str

Replacement text for redacted content

include_redaction_summary bool

Include summary of what was redacted

audit_trail bool

Track all redaction operations for compliance
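
A sketch of constructing a custom configuration directly from the signature above instead of starting from a preset; the chosen field values are illustrative.

from datason import DateFormat, NanHandling, SerializationConfig, TypeCoercion

# Custom config: ISO dates, strict coercion, sorted keys, basic field redaction.
config = SerializationConfig(
    date_format=DateFormat.ISO,
    nan_handling=NanHandling.NULL,
    type_coercion=TypeCoercion.STRICT,
    sort_keys=True,
    max_depth=20,                               # tighter than the default of 50
    redact_fields=["*.password", "*.api_key"],  # wildcard field patterns
    redaction_replacement="<REDACTED>",
)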

datason.ChunkedSerializationResult(chunks: Iterator[Any], metadata: Dict[str, Any])

Result container for chunked serialization operations.

Initialize chunked result.

Parameters:

Name Type Description Default
chunks Iterator[Any]

Iterator of serialized chunks

required
metadata Dict[str, Any]

Metadata about the chunking operation

required
Source code in datason/core_new.py
def __init__(self, chunks: Iterator[Any], metadata: Dict[str, Any]):
    """Initialize chunked result.

    Args:
        chunks: Iterator of serialized chunks
        metadata: Metadata about the chunking operation
    """
    self.chunks = chunks
    self.metadata = metadata

save_to_file(file_path: Union[str, Path], format: str = 'jsonl') -> None

Save chunks to a file.

Parameters:

Name Type Description Default
file_path Union[str, Path]

Path to save the chunks

required
format str

Format to save ('jsonl' for JSON lines, 'json' for array)

'jsonl'
Source code in datason/core_new.py
def save_to_file(self, file_path: Union[str, Path], format: str = "jsonl") -> None:
    """Save chunks to a file.

    Args:
        file_path: Path to save the chunks
        format: Format to save ('jsonl' for JSON lines, 'json' for array)
    """
    file_path = Path(file_path)

    with file_path.open("w") as f:
        if format == "jsonl":
            # JSON Lines format - one JSON object per line
            for chunk in self.chunks:
                json.dump(chunk, f, ensure_ascii=False)
                f.write("\n")
        elif format == "json":
            # JSON array format
            chunk_list = list(self.chunks)
            json.dump({"chunks": chunk_list, "metadata": self.metadata}, f, ensure_ascii=False, indent=2)
        else:
            raise ValueError(f"Unsupported format: {format}. Use 'jsonl' or 'json'")

to_list() -> list

Convert all chunks to a list (loads everything into memory).

Source code in datason/core_new.py
def to_list(self) -> list:
    """Convert all chunks to a list (loads everything into memory)."""
    return list(self.chunks)
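
A consumption sketch for this class, assuming serialize_chunked() (used by StreamingSerializer.write_chunked below) is exported at package level and returns a ChunkedSerializationResult; the data and file name are illustrative.

import datason

big_list = [{"i": i, "value": i * 0.5} for i in range(10_000)]

# Assumed entry point: split the list into chunks of 1000 items each.
result = datason.serialize_chunked(big_list, chunk_size=1000)

print(result.metadata)               # metadata about the chunking operation
result.save_to_file("chunks.jsonl")  # one JSON object per line (jsonl format)

# Alternatively, materialize every chunk in memory:
# chunks = result.to_list()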

datason.StreamingSerializer(file_path: Union[str, Path], config: Optional[SerializationConfig] = None, format: str = 'jsonl', buffer_size: int = 8192)

Context manager for streaming serialization to files.

Enables processing of datasets larger than available memory by writing serialized data directly to files without keeping everything in memory.

Initialize streaming serializer.

Parameters:

Name Type Description Default
file_path Union[str, Path]

Path to output file

required
config Optional[SerializationConfig]

Serialization configuration

None
format str

Output format ('jsonl' or 'json')

'jsonl'
buffer_size int

Write buffer size in bytes

8192
Source code in datason/core_new.py
def __init__(
    self,
    file_path: Union[str, Path],
    config: Optional["SerializationConfig"] = None,
    format: str = "jsonl",
    buffer_size: int = 8192,
):
    """Initialize streaming serializer.

    Args:
        file_path: Path to output file
        config: Serialization configuration
        format: Output format ('jsonl' or 'json')
        buffer_size: Write buffer size in bytes
    """
    self.file_path = Path(file_path)
    self.config = config or (get_default_config() if _config_available else None)
    self.format = format
    self.buffer_size = buffer_size
    self._file: Optional[Any] = None
    self._items_written = 0
    self._json_array_started = False

__enter__() -> StreamingSerializer

Enter context manager.

Source code in datason/core_new.py
def __enter__(self) -> "StreamingSerializer":
    """Enter context manager."""
    # Check if compression is needed based on file extension
    if self.file_path.suffix == ".gz" or (
        len(self.file_path.suffixes) > 1 and self.file_path.suffixes[-1] == ".gz"
    ):
        import gzip

        self._file = gzip.open(self.file_path, "wt", encoding="utf-8")
    else:
        self._file = self.file_path.open("w", buffering=self.buffer_size)

    if self.format == "json":
        # Start JSON array
        self._file.write('{"data": [')
        self._json_array_started = True

    return self

__exit__(exc_type: Any, exc_val: Any, exc_tb: Any) -> None

Exit context manager.

Source code in datason/core_new.py
def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
    """Exit context manager."""
    if self._file:
        if self.format == "json" and self._json_array_started:
            # Close JSON array and add metadata
            self._file.write(f'], "metadata": {{"items_written": {self._items_written}}}}}')

        self._file.close()
        self._file = None

write(obj: Any) -> None

Write a single object to the stream.

Parameters:

Name Type Description Default
obj Any

Object to serialize and write

required
Source code in datason/core_new.py
def write(self, obj: Any) -> None:
    """Write a single object to the stream.

    Args:
        obj: Object to serialize and write
    """
    if not self._file:
        raise RuntimeError("StreamingSerializer not in context manager")

    serialized = serialize(obj, self.config)

    if self.format == "jsonl":
        # JSON Lines: one object per line
        json.dump(serialized, self._file, ensure_ascii=False)
        self._file.write("\n")
    elif self.format == "json":
        # JSON array format
        if self._items_written > 0:
            self._file.write(", ")
        json.dump(serialized, self._file, ensure_ascii=False)
    else:
        raise ValueError(f"Unsupported format: {self.format}")

    self._items_written += 1

write_chunked(obj: Any, chunk_size: int = 1000) -> None

Write a large object using chunked serialization.

Parameters:

Name Type Description Default
obj Any

Large object to chunk and write

required
chunk_size int

Size of each chunk

1000
Source code in datason/core_new.py
def write_chunked(self, obj: Any, chunk_size: int = 1000) -> None:
    """Write a large object using chunked serialization.

    Args:
        obj: Large object to chunk and write
        chunk_size: Size of each chunk
    """
    chunked_result = serialize_chunked(obj, chunk_size, self.config)

    for chunk in chunked_result.chunks:
        self.write(chunk)
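
A usage sketch assembled from the methods above; the output path and generated records are illustrative.

from datason import StreamingSerializer, get_performance_config

records = ({"row": i, "value": i ** 2} for i in range(1_000_000))

# Stream records to disk one at a time instead of holding them all in memory.
with StreamingSerializer("big_run.jsonl", config=get_performance_config()) as stream:
    for record in records:
        stream.write(record)

# Per __enter__ above, a ".gz" path (e.g. "big_run.jsonl.gz") switches to gzip output.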

datason.TemplateDeserializer(template: Any, strict: bool = True, fallback_auto_detect: bool = True)

Template-based deserializer for enhanced type fidelity and round-trip scenarios.

This class allows users to provide a template object that guides the deserialization process, ensuring that the output matches the expected structure and types.

Initialize template deserializer.

Parameters:

Name Type Description Default
template Any

Template object to guide deserialization

required
strict bool

If True, raise errors when structure doesn't match

True
fallback_auto_detect bool

If True, use auto-detection when template doesn't match

True
Source code in datason/deserializers_new.py
def __init__(self, template: Any, strict: bool = True, fallback_auto_detect: bool = True):
    """Initialize template deserializer.

    Args:
        template: Template object to guide deserialization
        strict: If True, raise errors when structure doesn't match
        fallback_auto_detect: If True, use auto-detection when template doesn't match
    """
    self.template = template
    self.strict = strict
    self.fallback_auto_detect = fallback_auto_detect
    self._template_info = self._analyze_template()

deserialize(obj: Any) -> Any

Deserialize object using template guidance.

Parameters:

Name Type Description Default
obj Any

Serialized object to deserialize

required

Returns:

Type Description
Any

Deserialized object matching template structure

Source code in datason/deserializers_new.py
def deserialize(self, obj: Any) -> Any:
    """Deserialize object using template guidance.

    Args:
        obj: Serialized object to deserialize

    Returns:
        Deserialized object matching template structure
    """
    try:
        return self._deserialize_with_template(obj, self.template)
    except Exception as e:
        if self.strict:
            raise TemplateDeserializationError(
                f"Failed to deserialize with template {type(self.template).__name__}: {e}"
            ) from e
        elif self.fallback_auto_detect:
            warnings.warn(f"Template deserialization failed, falling back to auto-detection: {e}", stacklevel=2)
            return auto_deserialize(obj, aggressive=True)
        else:
            return obj
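
A non-strict usage sketch of this class, mirroring the template and payload from the deserialize_with_template example earlier; the final print is only for inspection.

import pandas as pd

from datason import TemplateDeserializer

template_df = pd.DataFrame({"a": [1], "b": ["text"]})
deserializer = TemplateDeserializer(template_df, strict=False, fallback_auto_detect=True)

# Matching records come back as a DataFrame coerced to the template's dtypes;
# a mismatch falls back to auto-detection instead of raising.
result = deserializer.deserialize([{"a": 2, "b": "hello"}, {"a": 3, "b": "world"}])
print(type(result).__name__, result.dtypes.to_dict())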

Exception Classes

datason.SecurityError

Bases: Exception

Raised when security limits are exceeded during serialization.

datason.TemplateDeserializationError

Bases: Exception

Raised when template-based deserialization fails.
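
A sketch of how strict-mode failures surface as TemplateDeserializationError; whether a given payload actually triggers the error depends on the template-matching logic, so the mismatched input below is purely illustrative.

import pandas as pd

from datason import TemplateDeserializationError, TemplateDeserializer

strict = TemplateDeserializer(pd.DataFrame({"a": [1]}), strict=True)

try:
    strict.deserialize({"not": "a list of records"})  # illustrative mismatch
except TemplateDeserializationError as exc:
    # Strict mode wraps the underlying failure instead of silently falling back.
    print(f"template mismatch: {exc}")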

Enums & Constants

datason.DateFormat

Bases: Enum

Supported date/time output formats.

datason.DataFrameOrient

Bases: Enum

Supported pandas DataFrame orientations.

Based on pandas.DataFrame.to_dict() valid orientations.

datason.NanHandling

Bases: Enum

How to handle NaN/null values.

datason.TypeCoercion

Bases: Enum

Type coercion behavior.

🔧 Utility Functions

datason.safe_float(value: Any, default: float = 0.0) -> float

Convert value to float, handling NaN, None, and Inf values safely.

This function is particularly useful when working with pandas DataFrames that may contain NaN values or when processing data from external sources that may have None values.

Parameters:

Name Type Description Default
value Any

Value to convert to float

required
default float

Default value to return if conversion fails or value is NaN/None/Inf

0.0

Returns:

Type Description
float

Float value or default if conversion fails

Examples:

>>> safe_float(42.5)
42.5
>>> safe_float(None)
0.0
>>> safe_float(float('nan'))
0.0
>>> safe_float(float('inf'))
0.0
>>> safe_float("invalid", 10.0)
10.0
Source code in datason/converters.py
def safe_float(value: Any, default: float = 0.0) -> float:
    """Convert value to float, handling NaN, None, and Inf values safely.

    This function is particularly useful when working with pandas DataFrames
    that may contain NaN values or when processing data from external sources
    that may have None values.

    Args:
        value: Value to convert to float
        default: Default value to return if conversion fails or value is NaN/None/Inf

    Returns:
        Float value or default if conversion fails

    Examples:
        >>> safe_float(42.5)
        42.5
        >>> safe_float(None)
        0.0
        >>> safe_float(float('nan'))
        0.0
        >>> safe_float(float('inf'))
        0.0
        >>> safe_float("invalid", 10.0)
        10.0
    """
    if value is None:
        return default
    try:
        float_val = float(value)
        return default if (math.isnan(float_val) or math.isinf(float_val)) else float_val
    except (ValueError, TypeError):
        return default

datason.safe_int(value: Any, default: int = 0) -> int

Convert value to int, handling NaN and None values safely.

This function is particularly useful when working with pandas DataFrames that may contain NaN values or when processing data from external sources that may have None values.

Parameters:

Name Type Description Default
value Any

Value to convert to int

required
default int

Default value to return if conversion fails or value is NaN/None

0

Returns:

Type Description
int

Integer value or default if conversion fails

Examples:

>>> safe_int(42)
42
>>> safe_int(42.7)
42
>>> safe_int(None)
0
>>> safe_int(float('nan'))
0
>>> safe_int("invalid", 10)
10
Source code in datason/converters.py
def safe_int(value: Any, default: int = 0) -> int:
    """Convert value to int, handling NaN and None values safely.

    This function is particularly useful when working with pandas DataFrames
    that may contain NaN values or when processing data from external sources
    that may have None values.

    Args:
        value: Value to convert to int
        default: Default value to return if conversion fails or value is NaN/None

    Returns:
        Integer value or default if conversion fails

    Examples:
        >>> safe_int(42)
        42
        >>> safe_int(42.7)
        42
        >>> safe_int(None)
        0
        >>> safe_int(float('nan'))
        0
        >>> safe_int("invalid", 10)
        10
    """
    if value is None:
        return default
    try:
        if isinstance(value, float) and (math.isnan(value) or math.isinf(value)):
            return default
        # Handle string representations of floats
        if isinstance(value, str):
            try:
                float_val = float(value)
                if math.isnan(float_val) or math.isinf(float_val):
                    return default
                return int(float_val)
            except (ValueError, TypeError):
                return default
        return int(value)
    except (ValueError, TypeError, OverflowError):
        return default

datason.ensure_timestamp(val: Any) -> Any

Ensure a scalar date value is a pandas Timestamp. Use this for group-level date fields.

Parameters:

Name Type Description Default
val Any

A date value (can be pd.Timestamp, datetime, or string)

required

Returns:

Type Description
Any

pd.Timestamp or pd.NaT

Raises:

TypeError: If input is a list, dict, or other non-date-like object

Source code in datason/datetime_utils.py
def ensure_timestamp(val: Any) -> Any:
    """Ensure a scalar date value is a pandas Timestamp. Use this for group-level date fields.

    Args:
        val: A date value (can be pd.Timestamp, datetime, or string)

    Returns:
        pd.Timestamp or pd.NaT
    Raises:
        TypeError: If input is a list, dict, or other non-date-like object
    """
    if pd is None:
        raise ImportError("pandas is required for ensure_timestamp function")

    if val is None or (isinstance(val, float) and pd.isna(val)):
        return pd.NaT
    if isinstance(val, pd.Timestamp):
        return val
    if isinstance(val, (list, dict, set)):
        logger.error(f"ensure_timestamp: Invalid type {type(val)} for value {val}")
        raise TypeError(f"Cannot convert type {type(val)} to Timestamp")
    try:
        return pd.to_datetime(val)
    except Exception as e:
        logger.warning(f"ensure_timestamp: Could not convert {val!r} to Timestamp: {e}")
        return pd.NaT
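
A small sketch of the conversion behaviour documented above; the values are illustrative.

import pandas as pd

from datason import ensure_timestamp

print(ensure_timestamp("2023-01-01"))                # Timestamp('2023-01-01 00:00:00')
print(ensure_timestamp(pd.Timestamp("2023-01-01")))  # already a Timestamp, returned as-is
print(ensure_timestamp(None))                        # NaT

try:
    ensure_timestamp([1, 2, 3])                      # collections are rejected
except TypeError as exc:
    print(exc)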

🛡️ Privacy & Security Functions

datason.create_financial_redaction_engine() -> RedactionEngine

Create a redaction engine optimized for financial data.

Source code in datason/redaction.py
def create_financial_redaction_engine() -> RedactionEngine:
    """Create a redaction engine optimized for financial data."""
    return RedactionEngine(
        redact_fields=[
            "*.password",
            "*.secret",
            "*.key",
            "*.token",
            "*.ssn",
            "*.social_security",
            "*.tax_id",
            "*.account_number",
            "*.routing_number",
            "*.credit_card",
            "*.card_number",
            "*.cvv",
            "*.pin",
        ],
        redact_patterns=[
            r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b",  # Credit cards
            r"\b\d{3}-\d{2}-\d{4}\b",  # US SSN
            r"\b\d{9}\b",  # US Tax ID
            r"\b\d{10,12}\b",  # Account numbers
        ],
        redact_large_objects=True,
        large_object_threshold=5 * 1024 * 1024,  # 5MB for financial data
        include_redaction_summary=True,
        audit_trail=True,
    )

datason.create_healthcare_redaction_engine() -> RedactionEngine

Create a redaction engine optimized for healthcare data.

Source code in datason/redaction.py
def create_healthcare_redaction_engine() -> RedactionEngine:
    """Create a redaction engine optimized for healthcare data."""
    return RedactionEngine(
        redact_fields=[
            "*.patient_id",
            "*.medical_record",
            "*.ssn",
            "*.phone",
            "*.email",
            "*.address",
            "*.name",
            "*.dob",
            "*.birth_date",
            "*.diagnosis",
        ],
        redact_patterns=[
            r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b",  # Email
            r"\b(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b",  # Phone
            r"\b\d{3}-\d{2}-\d{4}\b",  # SSN
        ],
        redact_large_objects=True,
        include_redaction_summary=True,
        audit_trail=True,
    )

datason.create_minimal_redaction_engine() -> RedactionEngine

Create a minimal redaction engine for basic privacy protection.

Source code in datason/redaction.py
def create_minimal_redaction_engine() -> RedactionEngine:
    """Create a minimal redaction engine for basic privacy protection."""
    return RedactionEngine(
        redact_fields=["*.password", "*.secret", "*.key", "*.token"],
        redact_patterns=[
            r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b",  # Email addresses
        ],
        redact_large_objects=False,
        include_redaction_summary=False,
        audit_trail=False,
    )
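
The factory functions in this section return preconfigured RedactionEngine instances; applying an engine directly is not covered in this reference, so the sketch below only constructs one and then expresses the same intent through the redaction fields on SerializationConfig, with the serialize() config keyword assumed.

import datason
from datason import SerializationConfig, create_financial_redaction_engine

# Prebuilt engine: field patterns, regexes, and audit trail already configured.
engine = create_financial_redaction_engine()

# The same intent expressed via SerializationConfig for ordinary serialization calls.
config = SerializationConfig(
    redact_fields=["*.password", "*.ssn", "*.credit_card"],
    redact_patterns=[r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b"],  # credit cards
    redaction_replacement="<REDACTED>",
    include_redaction_summary=True,
)

record = {"user": "alice", "password": "hunter2", "credit_card": "4111 1111 1111 1111"}
safe = datason.serialize(record, config=config)  # config keyword assumed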

🧠 ML Integration Functions (Optional)

These functions are available when ML libraries are installed:

datason.detect_and_serialize_ml_object(obj: Any) -> Optional[Dict[str, Any]]

Detect and serialize ML/AI objects automatically.

Parameters:

Name Type Description Default
obj Any

Object that might be from an ML/AI library

required

Returns:

Type Description
Optional[Dict[str, Any]]

Serialized object or None if not an ML/AI object

Source code in datason/ml_serializers.py
def detect_and_serialize_ml_object(obj: Any) -> Optional[Dict[str, Any]]:
    """Detect and serialize ML/AI objects automatically.

    Args:
        obj: Object that might be from an ML/AI library

    Returns:
        Serialized object or None if not an ML/AI object
    """

    # Helper function to safely check attributes
    def safe_hasattr(obj: Any, attr: str) -> bool:
        try:
            return hasattr(obj, attr)
        except Exception:
            return False

    # PyTorch tensors
    torch = _lazy_import_torch()
    if torch is not None and isinstance(obj, torch.Tensor):
        return serialize_pytorch_tensor(obj)

    # TensorFlow tensors
    tf = _lazy_import_tensorflow()
    if (
        tf is not None
        and safe_hasattr(obj, "numpy")
        and safe_hasattr(obj, "shape")
        and safe_hasattr(obj, "dtype")
        and "tensorflow" in str(type(obj))
    ):
        return serialize_tensorflow_tensor(obj)

    # JAX arrays
    jax, jnp = _lazy_import_jax()
    if jax is not None and safe_hasattr(obj, "shape") and safe_hasattr(obj, "dtype") and "jax" in str(type(obj)):
        return serialize_jax_array(obj)

    # Scikit-learn models
    sklearn, BaseEstimator = _lazy_import_sklearn()
    if sklearn is not None and isinstance(BaseEstimator, type):
        try:
            if isinstance(obj, BaseEstimator):
                return serialize_sklearn_model(obj)
        except (TypeError, AttributeError):
            # Handle case where BaseEstimator is a Mock or invalid type
            pass

    # Scipy sparse matrices
    scipy = _lazy_import_scipy()
    if scipy is not None and safe_hasattr(obj, "tocoo") and "scipy.sparse" in str(type(obj)):
        return serialize_scipy_sparse(obj)

    # PIL Images
    Image = _lazy_import_pil()
    if Image is not None and isinstance(obj, Image.Image):
        return serialize_pil_image(obj)

    # HuggingFace tokenizers
    transformers = _lazy_import_transformers()
    if transformers is not None and safe_hasattr(obj, "encode") and "transformers" in str(type(obj)):
        return serialize_huggingface_tokenizer(obj)

    # CatBoost models - use proper isinstance check like other frameworks
    catboost = _lazy_import_catboost()
    if catboost is not None:
        try:
            if isinstance(obj, (catboost.CatBoostClassifier, catboost.CatBoostRegressor)):
                return serialize_catboost_model(obj)
        except (TypeError, AttributeError):
            pass

    # Keras models - use proper isinstance check like other frameworks
    keras = _lazy_import_keras()
    if keras is not None:
        try:
            # Check for common Keras model types
            keras_model_types = []
            if hasattr(keras, "Model"):
                keras_model_types.append(keras.Model)
            if hasattr(keras, "Sequential"):
                keras_model_types.append(keras.Sequential)
            if hasattr(keras, "models"):
                if hasattr(keras.models, "Model"):
                    keras_model_types.append(keras.models.Model)
                if hasattr(keras.models, "Sequential"):
                    keras_model_types.append(keras.models.Sequential)

            if keras_model_types and isinstance(obj, tuple(keras_model_types)):
                return serialize_keras_model(obj)
        except (TypeError, AttributeError):
            pass

    # Optuna studies - use proper isinstance check like other frameworks
    optuna = _lazy_import_optuna()
    if optuna is not None:
        try:
            if hasattr(optuna, "Study") and isinstance(obj, optuna.Study):
                return serialize_optuna_study(obj)
        except (TypeError, AttributeError):
            pass

    # Plotly figures - use proper isinstance check like other frameworks
    plotly = _lazy_import_plotly()
    if plotly is not None:
        try:
            import plotly.graph_objects as go

            if isinstance(obj, go.Figure):
                return serialize_plotly_figure(obj)
        except (TypeError, AttributeError, ImportError):
            pass

    # Polars DataFrames - use proper isinstance check like other frameworks
    polars = _lazy_import_polars()
    if polars is not None:
        try:
            if hasattr(polars, "DataFrame") and isinstance(obj, polars.DataFrame):
                return serialize_polars_dataframe(obj)
        except (TypeError, AttributeError):
            pass

    return None
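
A sketch of the detection contract, assuming scikit-learn is installed; the contents of the returned dict come from serialize_sklearn_model and are not documented here, so only its presence and keys are inspected.

from sklearn.linear_model import LogisticRegression

from datason import detect_and_serialize_ml_object

model = LogisticRegression()

serialized = detect_and_serialize_ml_object(model)
if serialized is not None:
    print("detected ML object with keys:", sorted(serialized))

# Objects that are not from a supported ML library return None.
assert detect_and_serialize_ml_object({"plain": "dict"}) is None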

📊 Cache & Performance Functions

datason.clear_all_caches() -> None

Clear all caches across all scopes (for testing/debugging).

Source code in datason/cache_manager.py
def clear_all_caches() -> None:
    """Clear all caches across all scopes (for testing/debugging)."""
    # Clear process-level caches
    _process_string_pattern_cache.clear()
    _process_parsed_object_cache.clear()
    _process_type_cache.clear()
    _process_dict_pool.clear()
    _process_list_pool.clear()

    # Clear ML serializers lazy import cache
    try:
        from . import ml_serializers

        for key in ml_serializers._LAZY_IMPORTS:
            ml_serializers._LAZY_IMPORTS[key] = None
    except ImportError:
        pass

    # Clear request-level caches if they exist
    try:
        if _request_string_pattern_cache.get() is not None:
            _request_string_pattern_cache.get().clear()
    except LookupError:
        pass

    try:
        if _request_parsed_object_cache.get() is not None:
            _request_parsed_object_cache.get().clear()
    except LookupError:
        pass

    try:
        if _request_type_cache.get() is not None:
            _request_type_cache.get().clear()
    except LookupError:
        pass

    try:
        if _request_dict_pool.get() is not None:
            _request_dict_pool.get().clear()
    except LookupError:
        pass

    try:
        if _request_list_pool.get() is not None:
            _request_list_pool.get().clear()
    except LookupError:
        pass

datason.get_cache_metrics(scope: Optional[CacheScope] = None) -> Dict[CacheScope, CacheMetrics]

Get cache metrics for a specific scope or all scopes.

Source code in datason/cache_manager.py
def get_cache_metrics(scope: Optional[CacheScope] = None) -> Dict[CacheScope, CacheMetrics]:
    """Get cache metrics for a specific scope or all scopes."""
    if scope is not None:
        return {scope: _cache_metrics[scope]}
    return dict(_cache_metrics)
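
A sketch of inspecting and resetting the caches, e.g. between test cases; the contents of the CacheMetrics objects are not documented here, so they are simply printed.

from datason import clear_all_caches, get_cache_metrics

# Inspect metrics for every scope (a CacheScope -> CacheMetrics mapping).
for scope, metrics in get_cache_metrics().items():
    print(scope, metrics)

# Drop all process- and request-level caches (intended for testing/debugging).
clear_all_caches()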

📦 Package Information

datason.get_version() -> str

Get the current version of datason.

Source code in datason/__init__.py
def get_version() -> str:
    """Get the current version of datason."""
    return __version__

datason.get_info() -> dict

Get information about the datason package.

Source code in datason/__init__.py
def get_info() -> dict:
    """Get information about the datason package."""
    return {
        "version": __version__,
        "author": __author__,
        "email": __author__,
        "description": __description__,
        "config_available": _config_available,
        "cache_system": "configurable" if _config_available else "legacy",
    }