Source code for skfeaturellm.feature_engineering_transformer

"""
FeatureEngineeringTransformer: a scikit-learn-compatible transformer backed by a fixed set
of LLM-generated (or manually configured) transformations.

Designed for the production phase after exploration with LLMFeatureEngineer:

    ideas = engineer.fit(X_train, y_train).generated_features
    transformer = engineer.to_transformer()
    pipeline = Pipeline([("features", transformer), ("model", XGBClassifier())])
    pipeline.fit(X_train, y_train)
"""

import json
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted

from skfeaturellm.transformations import TransformationPipeline


[docs] class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin): """ Scikit-learn-compatible transformer that applies a fixed set of transformations. Unlike LLMFeatureEngineer (which calls an LLM during fit), FeatureEngineeringTransformer is fully deterministic — it receives transformation configs at construction time and simply fits/applies them. This makes it safe to use inside Pipeline, GridSearchCV, cross_val_score, and joblib. Parameters ---------- transformations : list of dict List of transformation config dicts, each with at minimum a "type" key. Same format accepted by TransformationPipeline.from_dict(). feature_prefix : str, default "llm_feat_" Prefix applied to generated feature names. raise_on_error : bool, default False If True, raise on transformation errors. If False, skip with a warning. Attributes ---------- executor_ : TransformationPipeline Fitted executor (available after fit()). feature_names_in_ : list of str Column names seen during fit(). Examples -------- >>> transformer = FeatureEngineeringTransformer( ... transformations=[{"type": "log", "feature_name": "log_income", "columns": ["income"]}] ... ) >>> transformer.fit(X_train).transform(X_test) """ def __init__( self, transformations: Optional[List[Dict[str, Any]]] = None, feature_prefix: str = "llm_feat_", raise_on_error: bool = False, ): self.transformations = transformations or [] self.feature_prefix = feature_prefix self.raise_on_error = raise_on_error
[docs] def fit( self, X: pd.DataFrame, y: Optional[pd.Series] = None ) -> "FeatureEngineeringTransformer": """ Build and fit the transformation executor. Parameters ---------- X : pd.DataFrame Training data. y : pd.Series, optional Ignored; present for sklearn API compatibility. Returns ------- self """ self.feature_names_in_ = list(X.columns) pipeline = TransformationPipeline.from_dict( {"transformations": self.transformations}, raise_on_error=self.raise_on_error, ) pipeline.fit(X) self.pipeline_ = pipeline return self
[docs] def transform(self, X: pd.DataFrame) -> pd.DataFrame: """ Apply all fitted transformations. Parameters ---------- X : pd.DataFrame Data to transform. Returns ------- pd.DataFrame Copy of X with new feature columns appended. """ check_is_fitted(self) return self.pipeline_.transform(X)
[docs] def get_feature_names_out( self, input_features: Optional[List[str]] = None ) -> np.ndarray: """ Return feature names for the output of transform(). Parameters ---------- input_features : list of str, optional Ignored; original feature names come from feature_names_in_. Returns ------- np.ndarray of str """ check_is_fitted(self) generated = [t.feature_name for t in self.pipeline_.transformations] return np.array(self.feature_names_in_ + generated, dtype=object)
[docs] def save(self, path: Union[str, Path]) -> None: """ Save transformer configuration to a JSON file. Only the constructor parameters are saved (not the fitted state). Call fit() again after loading to restore the fitted executor. Parameters ---------- path : str or Path Destination file path. """ payload = { "transformations": self.transformations, "feature_prefix": self.feature_prefix, "raise_on_error": self.raise_on_error, } Path(path).write_text(json.dumps(payload, indent=2))
[docs] @classmethod def load(cls, path: Union[str, Path]) -> "FeatureEngineeringTransformer": """ Load a FeatureEngineeringTransformer from a JSON file produced by save(). Parameters ---------- path : str or Path Source file path. Returns ------- FeatureEngineeringTransformer An unfitted transformer; call fit() before transforming. """ payload = json.loads(Path(path).read_text()) return cls( transformations=payload["transformations"], feature_prefix=payload["feature_prefix"], raise_on_error=payload["raise_on_error"], )