# Source code for skfeaturellm.schemas

"""
Pydantic models for data validation and serialization.
"""

from typing import Any, Dict, List, Literal, Optional

from pydantic import BaseModel, ConfigDict, Field, model_validator

from skfeaturellm.transformations import (
    get_all_operation_types,
    get_binary_operation_types,
    get_unary_operation_types,
)



[docs]
class FeatureDescription(BaseModel):
    """
    Schema for describing a single feature in the dataset.

    Attributes:
        name: The name of the feature
        type: The data type of the feature (e.g., 'int', 'float', 'str', 'datetime')
        description: A description of what the feature represents
    """

    # NOTE: the class docstring is kept verbatim on purpose; pydantic surfaces
    # a model's docstring as the description in its generated JSON schema.

    # All three fields are required (no default is supplied).
    name: str = Field(..., description="Name of the feature")
    type: str = Field(..., description="Data type of the feature")
    description: str = Field(
        ..., description="Description of what the feature represents"
    )

    def format(self) -> str:
        """
        Render this feature as a single bullet line.

        Returns
        -------
        str
            One line of the form "- name (type): description"
        """
        label = f"{self.name} ({self.type})"
        return f"- {label}: {self.description}"





[docs]
class FeatureDescriptions(BaseModel):
    """
    Schema for a collection of feature descriptions.

    Attributes:
        features: List of feature descriptions
    """

    # NOTE: the class docstring is kept verbatim on purpose; pydantic surfaces
    # a model's docstring as the description in its generated JSON schema.

    # Required field; the order of entries is the order used by format().
    features: List[FeatureDescription] = Field(
        ..., description="List of feature descriptions"
    )

    def format(self) -> str:
        """
        Render every contained feature, one per line.

        Each entry is formatted by its own ``format()`` method and the
        results are joined with newline characters (no trailing newline).

        Returns
        -------
        str
            The formatted features separated by "\\n"; an empty string when
            ``features`` is empty
        """
        rendered = [entry.format() for entry in self.features]
        return "\n".join(rendered)




# =============================================================================
# Feature Engineering Ideas (LLM Output Schema)
# =============================================================================



[docs]
class TransformationParameters(BaseModel):
    """
    Parameters for transformations.

    Used for structured output compatibility.
    Explicitly defines allowed fields: constant for binary ops, power for pow op,
    n_bins for bin op.
    """

    # NOTE: besides the three fields named in the docstring, this model also
    # declares `bin_edges` (see below). Every field is optional and defaults
    # to None, so an instance with no values set is valid.

    # extra="forbid": pydantic rejects any key that is not declared below
    # instead of silently ignoring it.
    model_config = ConfigDict(extra="forbid")

    # Scalar operand used when a binary operation is applied to one column.
    constant: Optional[float] = Field(
        default=None,
        description="Constant value for binary operations (add, sub, mul, div with 1 column)",
    )
    # Exponent for the "pow" transformation.
    power: Optional[float] = Field(
        default=None,
        description="Power exponent for pow transformation",
    )
    # NOTE(review): the ">= 2" requirement is stated only in the description
    # text; nothing in this class enforces it.
    n_bins: Optional[int] = Field(
        default=None,
        description="Number of equal-width bins for bin transformation (must be >= 2)",
    )
    # NOTE(review): the "at least 2 values" requirement, and the exclusivity
    # with n_bins implied by "use instead of", are likewise not enforced here.
    bin_edges: Optional[List[float]] = Field(
        default=None,
        description=(
            "Custom bin edges for bin transformation (e.g. [0, 50000, 100000, 200000]). "
            "Use instead of n_bins when meaningful domain thresholds are known. "
            "Must have at least 2 values."
        ),
    )




[docs]
class FeatureEngineeringIdea(BaseModel):
    """
    Schema for a feature engineering idea generated by the LLM.

    This schema is designed to map directly to the transformation executor,
    ensuring that LLM output can be reliably executed.

    Supports both binary operations (add, sub, mul, div) and unary operations
    (log, sqrt, abs, etc.).

    Attributes:
        type: The transformation type
        feature_name: Name for the resulting feature
        description: Explanation of what the feature represents and why it's useful
        columns: List of column names required for the transformation
        parameters: Optional dictionary of additional parameters (e.g., constants)

    Examples:
        Unary operation (log):
        >>> FeatureEngineeringIdea(
        ...     type="log",
        ...     feature_name="log_income",
        ...     description="Log of income to reduce skewness",
        ...     columns=["income"]
        ... )

        Binary operation (division of two columns):
        >>> FeatureEngineeringIdea(
        ...     type="div",
        ...     feature_name="income_per_person",
        ...     description="Average income per household member",
        ...     columns=["total_income", "household_size"]
        ... )

        Binary operation (multiply column by constant):
        >>> FeatureEngineeringIdea(
        ...     type="mul",
        ...     feature_name="income_doubled",
        ...     description="Income multiplied by 2 for scaling",
        ...     columns=["income"],
        ...     parameters={"constant": 2.0}
        ... )
    """

    # NOTE: the class docstring is kept verbatim on purpose; pydantic surfaces
    # a model's docstring as the description in its generated JSON schema.
    # `parameters` is typed as a TransformationParameters model (a plain dict
    # such as the one in the last example is validated into that model).

    # Checked against the transformation registry in validate_operands().
    type: str = Field(
        ...,
        description="Transformation type (must be a registered transformation)",
    )
    # Must be a non-empty string.
    feature_name: str = Field(
        ...,
        description="Name for the resulting feature",
        min_length=1,
    )
    description: str = Field(
        ...,
        description="Explanation of what the feature represents and why it's useful",
    )
    # At least one column is required; the exact count allowed depends on the
    # operation type and is checked in validate_operands().
    columns: List[str] = Field(
        ...,
        description="List of column names required for the transformation",
        min_length=1,
    )
    parameters: Optional[TransformationParameters] = Field(
        default=None,
        description="Optional parameters for the transformation",
    )

    @model_validator(mode="after")
    def validate_operands(self) -> "FeatureEngineeringIdea":
        """
        Check that the column count and ``constant`` parameter fit ``type``.

        Rules applied:

        * ``type`` must be one of the registered operation types.
        * A unary operation takes exactly one column.
        * A binary operation takes either one column plus a ``constant``
          parameter, or two columns and no ``constant``.
        * Registered types that are neither unary nor binary get no further
          checks here.

        Returns
        -------
        FeatureEngineeringIdea
            The validated instance itself

        Raises
        ------
        ValueError
            If any of the rules above is violated
        """
        op = self.type

        # Fetch all three registries up front, in this order.
        registered = get_all_operation_types()
        unary_ops = get_unary_operation_types()
        binary_ops = get_binary_operation_types()

        if op not in registered:
            raise ValueError(
                f"Unknown transformation type '{op}'. "
                f"Registered types: {sorted(registered)}"
            )

        n_columns = len(self.columns)

        # Unary membership is tested first, so it wins if a type were ever
        # registered as both unary and binary.
        if op in unary_ops:
            if n_columns != 1:
                raise ValueError(
                    f"Unary operation '{op}' requires exactly 1 column, got {n_columns}"
                )
            return self

        if op not in binary_ops:
            return self

        params = self.parameters
        has_constant = params is not None and params.constant is not None

        if n_columns == 1:
            # Column-with-constant form: the constant is mandatory.
            if not has_constant:
                raise ValueError(
                    f"Binary operation '{op}' with 1 column requires 'constant' in parameters"
                )
        elif n_columns == 2:
            # Column-with-column form: a constant would be ambiguous.
            if has_constant:
                raise ValueError(
                    f"Binary operation '{op}' with 2 columns should not have 'constant' in parameters"
                )
        else:
            raise ValueError(
                f"Binary operation '{op}' requires 1 or 2 columns, got {n_columns}"
            )

        return self

    def to_executor_dict(self) -> dict:
        """
        Build the plain-dict form of this idea for TransformationPipeline.

        The human-readable ``description`` is not included. A ``parameters``
        entry is added only when parameters were supplied, and it contains
        only the parameter fields that are not None.

        Returns
        -------
        dict
            Dictionary with type, feature_name, columns, and optional parameters
        """
        payload = {
            "type": self.type,
            "feature_name": self.feature_name,
            "columns": self.columns,
        }

        params = self.parameters
        if params is None:
            return payload

        payload["parameters"] = params.model_dump(exclude_none=True)
        return payload





[docs]
class FeatureEngineeringIdeas(BaseModel):
    """
    Schema for a list of feature engineering ideas generated by the LLM.

    This is the top-level schema used with LangChain's with_structured_output().
    """

    # NOTE: the class docstring is kept verbatim on purpose; pydantic surfaces
    # a model's docstring as the description in its generated JSON schema.

    # Required field; an empty list is accepted.
    ideas: List[FeatureEngineeringIdea] = Field(
        ...,
        description="List of feature engineering ideas",
    )

    def to_executor_config(self) -> dict:
        """
        Build the configuration dict accepted by TransformationPipeline.from_dict().

        Each idea is converted with its ``to_executor_dict()`` method, keeping
        the original order.

        Returns
        -------
        dict
            Dictionary with a single 'transformations' key holding the list of
            per-idea transformation configs
        """
        configs = [idea.to_executor_dict() for idea in self.ideas]
        return {"transformations": configs}