"""
Pydantic models for data validation and serialization.
"""
from typing import Any, Dict, List, Literal, Optional
from pydantic import BaseModel, ConfigDict, Field, model_validator
from skfeaturellm.transformations import (
get_all_operation_types,
get_binary_operation_types,
get_unary_operation_types,
)
[docs]
class FeatureDescription(BaseModel):
    """
    Describe one feature of the dataset.

    Attributes:
        name: Name of the feature.
        type: Data type of the feature (e.g., 'int', 'float', 'str', 'datetime').
        description: What the feature represents.
    """

    # All three fields are required (no default is supplied).
    name: str = Field(
        ...,
        description="Name of the feature",
    )
    type: str = Field(
        ...,
        description="Data type of the feature",
    )
    description: str = Field(
        ...,
        description="Description of what the feature represents",
    )
[docs]
class FeatureDescriptions(BaseModel):
    """
    Wrap a collection of :class:`FeatureDescription` entries.

    Attributes:
        features: The feature descriptions, one entry per feature (required).
    """

    features: List[FeatureDescription] = Field(
        ...,
        description="List of feature descriptions",
    )
# =============================================================================
# Feature Engineering Ideas (LLM Output Schema)
# =============================================================================
[docs]
class FeatureEngineeringIdea(BaseModel):
    """
    Schema for a feature engineering idea generated by the LLM.

    This schema is designed to map directly to the transformation executor,
    ensuring that LLM output can be reliably executed.

    Supports both binary operations (add, sub, mul, div) and unary operations
    (log, sqrt, abs, etc.).

    Attributes:
        type: The transformation type; must be one of the registered
            operation types.
        feature_name: Name for the resulting feature (at least 1 character).
        description: Explanation of what the feature represents and why it's useful.
        columns: List of column names required for the transformation
            (at least 1 entry).
        parameters: Optional additional parameters (e.g., a constant operand).

    Examples:
        Unary operation (log):

        >>> FeatureEngineeringIdea(
        ...     type="log",
        ...     feature_name="log_income",
        ...     description="Log of income to reduce skewness",
        ...     columns=["income"]
        ... )

        Binary operation (division of two columns):

        >>> FeatureEngineeringIdea(
        ...     type="div",
        ...     feature_name="income_per_person",
        ...     description="Average income per household member",
        ...     columns=["total_income", "household_size"]
        ... )

        Binary operation (multiply column by constant):

        >>> FeatureEngineeringIdea(
        ...     type="mul",
        ...     feature_name="income_doubled",
        ...     description="Income multiplied by 2 for scaling",
        ...     columns=["income"],
        ...     parameters={"constant": 2.0}
        ... )
    """

    type: str = Field(
        ...,
        description="Transformation type (must be a registered transformation)",
    )
    feature_name: str = Field(
        ...,
        description="Name for the resulting feature",
        min_length=1,
    )
    description: str = Field(
        ...,
        description="Explanation of what the feature represents and why it's useful",
    )
    columns: List[str] = Field(
        ...,
        description="List of column names required for the transformation",
        min_length=1,
    )
    parameters: Optional[TransformationParameters] = Field(
        default=None,
        description="Optional parameters for the transformation",
    )

    @model_validator(mode="after")
    def validate_operands(self) -> "FeatureEngineeringIdea":
        """
        Validate operands based on transformation type.

        Rules enforced:

        * ``type`` must be a registered operation type.
        * A unary operation takes exactly 1 column.
        * A binary operation takes either 1 column plus a ``constant``
          parameter, or 2 columns and no ``constant`` parameter.

        A registered type that is neither unary nor binary is accepted
        without further operand checks.

        Returns
        -------
        FeatureEngineeringIdea
            The validated instance itself.

        Raises
        ------
        ValueError
            If any of the rules above is violated.
        """
        all_ops = get_all_operation_types()
        unary_ops = get_unary_operation_types()
        binary_ops = get_binary_operation_types()

        # Validate that type is registered
        if self.type not in all_ops:
            raise ValueError(
                f"Unknown transformation type '{self.type}'. "
                f"Registered types: {sorted(all_ops)}"
            )

        n_columns = len(self.columns)

        if self.type in unary_ops:
            # Unary operations require exactly 1 column
            if n_columns != 1:
                raise ValueError(
                    f"Unary operation '{self.type}' requires exactly 1 column, got {n_columns}"
                )
            return self

        if self.type not in binary_ops:
            # Registered, but neither unary nor binary: nothing more to check.
            return self

        # Binary operations require 1 or 2 columns
        if n_columns not in (1, 2):
            raise ValueError(
                f"Binary operation '{self.type}' requires 1 or 2 columns, got {n_columns}"
            )

        has_constant = (
            self.parameters is not None and self.parameters.constant is not None
        )
        if n_columns == 1 and not has_constant:
            # Column-constant operation: must have constant in parameters
            raise ValueError(
                f"Binary operation '{self.type}' with 1 column requires 'constant' in parameters"
            )
        if n_columns == 2 and has_constant:
            # Column-column operation: should not have constant in parameters
            raise ValueError(
                f"Binary operation '{self.type}' with 2 columns should not have 'constant' in parameters"
            )
        return self

    def to_executor_dict(self) -> Dict[str, Any]:
        """
        Convert to a dictionary format compatible with TransformationPipeline.

        Returns
        -------
        dict
            Dictionary with ``type``, ``feature_name``, ``columns`` and, when
            ``parameters`` is set, a ``parameters`` entry holding the dumped
            parameters with ``None`` values excluded. ``columns`` is a copy of
            the model's list.
        """
        result: Dict[str, Any] = {
            "type": self.type,
            "feature_name": self.feature_name,
            # Copy the list so that callers mutating the returned config
            # cannot alter this already-validated model behind its back.
            "columns": list(self.columns),
        }
        if self.parameters is not None:
            result["parameters"] = self.parameters.model_dump(exclude_none=True)
        return result
[docs]
class FeatureEngineeringIdeas(BaseModel):
    """
    Container for the feature engineering ideas generated by the LLM.

    This is the top-level schema used with LangChain's with_structured_output().

    Attributes:
        ideas: The individual feature engineering ideas (required).
    """

    ideas: List[FeatureEngineeringIdea] = Field(
        ...,
        description="List of feature engineering ideas",
    )

    def to_executor_config(self) -> dict:
        """
        Convert to a configuration dict compatible with TransformationPipeline.from_dict().

        Returns
        -------
        dict
            Dictionary whose 'transformations' key holds one executor dict
            per idea, in the same order as ``ideas``.
        """
        executor_dicts = [entry.to_executor_dict() for entry in self.ideas]
        return {"transformations": executor_dicts}