# Source code for skfeaturellm.schemas

"""
Pydantic models for data validation and serialization.
"""

from typing import Any, Dict, List, Literal, Optional

from pydantic import BaseModel, ConfigDict, Field, model_validator

from skfeaturellm.transformations import (
    get_all_operation_types,
    get_binary_operation_types,
    get_unary_operation_types,
)


class FeatureDescription(BaseModel):
    """
    Schema for describing a single feature in the dataset.

    Attributes:
        name: The name of the feature
        type: The data type of the feature (e.g., 'int', 'float', 'str', 'datetime')
        description: A description of what the feature represents
    """

    # NOTE: pydantic exposes the class docstring as the JSON-schema
    # description of the model, so its wording is kept unchanged.

    # All three fields are required (no default is supplied).
    name: str = Field(..., description="Name of the feature")
    type: str = Field(..., description="Data type of the feature")
    description: str = Field(
        ...,
        description="Description of what the feature represents",
    )

    def format(self) -> str:
        """
        Render this feature as a single bullet line.

        Returns
        -------
        str
            Text of the form "- name (type): description"
        """
        label = f"{self.name} ({self.type})"
        return f"- {label}: {self.description}"
class FeatureDescriptions(BaseModel):
    """
    Schema for a collection of feature descriptions.

    Attributes:
        features: List of feature descriptions
    """

    # NOTE: pydantic exposes the class docstring as the JSON-schema
    # description of the model, so its wording is kept unchanged.

    # Required; an empty list is accepted (no length constraint is declared).
    features: List[FeatureDescription] = Field(
        ...,
        description="List of feature descriptions",
    )

    def format(self) -> str:
        """
        Render every feature description, one per line.

        Each entry is produced by ``FeatureDescription.format`` and the
        entries are joined with newlines, in list order.

        Returns
        -------
        str
            Newline-separated feature lines; an empty string when there are
            no features
        """
        rendered_lines = [feature.format() for feature in self.features]
        return "\n".join(rendered_lines)
# =============================================================================
# Feature Engineering Ideas (LLM Output Schema)
# =============================================================================
class TransformationParameters(BaseModel):
    """
    Parameters for transformations. Used for structured output compatibility.

    Explicitly defines allowed fields: constant for binary ops, power for pow op,
    n_bins for bin op.
    """

    # NOTE: pydantic exposes the class docstring as the JSON-schema
    # description of the model, so its wording is kept unchanged.

    # Unknown keys are rejected rather than silently ignored.
    model_config = ConfigDict(extra="forbid")

    # Every field is optional and defaults to None; the "must be ..." limits
    # mentioned in the descriptions are documentation only and are not
    # enforced by this model.
    constant: Optional[float] = Field(
        None,
        description="Constant value for binary operations (add, sub, mul, div with 1 column)",
    )
    power: Optional[float] = Field(
        None,
        description="Power exponent for pow transformation",
    )
    n_bins: Optional[int] = Field(
        None,
        description="Number of equal-width bins for bin transformation (must be >= 2)",
    )
    bin_edges: Optional[List[float]] = Field(
        None,
        description=(
            "Custom bin edges for bin transformation (e.g. [0, 50000, 100000, 200000]). "
            "Use instead of n_bins when meaningful domain thresholds are known. "
            "Must have at least 2 values."
        ),
    )
class FeatureEngineeringIdea(BaseModel):
    """
    Schema for a feature engineering idea generated by the LLM.

    This schema is designed to map directly to the transformation executor,
    ensuring that LLM output can be reliably executed.

    Supports both binary operations (add, sub, mul, div) and unary operations
    (log, sqrt, abs, etc.).

    Attributes:
        type: The transformation type
        feature_name: Name for the resulting feature
        description: Explanation of what the feature represents and why it's useful
        columns: List of column names required for the transformation
        parameters: Optional dictionary of additional parameters (e.g., constants)

    Examples:
        Unary operation (log):
        >>> FeatureEngineeringIdea(
        ...     type="log",
        ...     feature_name="log_income",
        ...     description="Log of income to reduce skewness",
        ...     columns=["income"]
        ... )

        Binary operation (division of two columns):
        >>> FeatureEngineeringIdea(
        ...     type="div",
        ...     feature_name="income_per_person",
        ...     description="Average income per household member",
        ...     columns=["total_income", "household_size"]
        ... )

        Binary operation (multiply column by constant):
        >>> FeatureEngineeringIdea(
        ...     type="mul",
        ...     feature_name="income_doubled",
        ...     description="Income multiplied by 2 for scaling",
        ...     columns=["income"],
        ...     parameters={"constant": 2.0}
        ... )
    """

    # NOTE: pydantic exposes the class docstring as the JSON-schema
    # description of the model, so its wording is kept unchanged.

    # Checked against the transformation registry in validate_operands.
    type: str = Field(
        ...,
        description="Transformation type (must be a registered transformation)",
    )
    feature_name: str = Field(
        ...,
        description="Name for the resulting feature",
        min_length=1,
    )
    description: str = Field(
        ...,
        description="Explanation of what the feature represents and why it's useful",
    )
    # At least one column is required; the exact count allowed depends on the
    # transformation type (see validate_operands).
    columns: List[str] = Field(
        ...,
        description="List of column names required for the transformation",
        min_length=1,
    )
    # Validated as a TransformationParameters model when provided.
    parameters: Optional[TransformationParameters] = Field(
        default=None,
        description="Optional parameters for the transformation",
    )

    @model_validator(mode="after")
    def validate_operands(self) -> "FeatureEngineeringIdea":
        """
        Check that the operands fit the chosen transformation type.

        Rules enforced:

        * ``type`` must be one of the registered operation types.
        * Unary operations take exactly 1 column.
        * Binary operations take either 1 column plus a ``constant``
          parameter, or 2 columns and no ``constant``.

        A registered type that is neither unary nor binary gets no operand
        checks here.

        Returns
        -------
        FeatureEngineeringIdea
            The validated instance itself.

        Raises
        ------
        ValueError
            If any rule above is violated (raised inside pydantic validation).
        """
        all_ops = get_all_operation_types()
        unary_ops = get_unary_operation_types()
        binary_ops = get_binary_operation_types()

        # Validate that type is registered
        if self.type not in all_ops:
            raise ValueError(
                f"Unknown transformation type '{self.type}'. "
                f"Registered types: {sorted(all_ops)}"
            )

        n_columns = len(self.columns)
        has_constant = (
            self.parameters is not None and self.parameters.constant is not None
        )

        if self.type in unary_ops:
            # Unary operations require exactly 1 column
            if n_columns != 1:
                raise ValueError(
                    f"Unary operation '{self.type}' requires exactly 1 column, got {n_columns}"
                )
        elif self.type in binary_ops:
            if n_columns == 1:
                # Column-constant operation: must have constant in parameters
                if not has_constant:
                    raise ValueError(
                        f"Binary operation '{self.type}' with 1 column requires 'constant' in parameters"
                    )
            elif n_columns == 2:
                # Column-column operation: should not have constant in parameters
                if has_constant:
                    raise ValueError(
                        f"Binary operation '{self.type}' with 2 columns should not have 'constant' in parameters"
                    )
            else:
                raise ValueError(
                    f"Binary operation '{self.type}' requires 1 or 2 columns, got {n_columns}"
                )

        return self

    def to_executor_dict(self) -> Dict[str, Any]:
        """
        Convert to a dictionary format compatible with TransformationPipeline.

        The returned dictionary shares no mutable state with this model:
        ``columns`` is copied and ``parameters`` comes from ``model_dump``.

        Returns
        -------
        dict
            Dictionary with ``type``, ``feature_name``, ``columns`` and, when
            parameters were provided, ``parameters`` (unset/None parameter
            fields are omitted).
        """
        config: Dict[str, Any] = {
            "type": self.type,
            "feature_name": self.feature_name,
            # Copy so that a consumer mutating the config cannot alter the
            # already-validated idea.
            "columns": list(self.columns),
        }
        if self.parameters is not None:
            config["parameters"] = self.parameters.model_dump(exclude_none=True)
        return config
class FeatureEngineeringIdeas(BaseModel):
    """
    Schema for a list of feature engineering ideas generated by the LLM.

    This is the top-level schema used with LangChain's with_structured_output().
    """

    # NOTE: pydantic exposes the class docstring as the JSON-schema
    # description of the model, so its wording is kept unchanged.

    # Required; each entry is validated as a FeatureEngineeringIdea.
    ideas: List[FeatureEngineeringIdea] = Field(
        ...,
        description="List of feature engineering ideas",
    )

    def to_executor_config(self) -> dict:
        """
        Build a configuration dict compatible with TransformationPipeline.from_dict().

        Returns
        -------
        dict
            Dictionary whose 'transformations' key holds one executor dict per
            idea, in the same order as ``ideas``
        """
        transformations = [idea.to_executor_dict() for idea in self.ideas]
        return {"transformations": transformations}