Source code for skfeaturellm.llm_interface

"""
Module for handling interactions with Language Models.
"""

from typing import Dict, List, Optional, Tuple

from langchain.chat_models import init_chat_model
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate

from skfeaturellm.prompts import FEATURE_ENGINEERING_PROMPT, SELECTION_FEEDBACK_PROMPT
from skfeaturellm.prompts import utils as prompt_utils
from skfeaturellm.schemas import (
    FeatureDescription,
    FeatureDescriptions,
    FeatureEngineeringIdeas,
)
from skfeaturellm.transformations import (
    get_binary_operation_types,
    get_transformation_types_for_prompt,
    get_unary_operation_types,
)
from skfeaturellm.types import ProblemType


class LLMInterface:
    """
    Interface for interacting with Language Models for feature engineering.

    Parameters
    ----------
    model_name : str, default="gpt-4o"
        Name of the model to use
    **kwargs
        Additional keyword arguments passed to init_chat_model
        (e.g., temperature, max_tokens, api_key, etc.)
    """

    def __init__(self, model_name: str = "gpt-4o", **kwargs):
        # Initialize the base model
        base_llm = init_chat_model(model=model_name, **kwargs)

        # Use with_structured_output for reliable structured responses
        # This uses the provider's native structured output capabilities
        self.llm = base_llm.with_structured_output(FeatureEngineeringIdeas)

        # Create prompt template with system and human messages
        self.prompt_template = ChatPromptTemplate.from_messages(
            [
                ("system", FEATURE_ENGINEERING_PROMPT),
                (
                    "human",
                    "Generate feature engineering ideas based on the dataset information provided.",
                ),
            ]
        )

        # Chain composition - no output parser needed with structured output
        self.chain = self.prompt_template | self.llm

    def generate_engineered_features(
        self,
        feature_descriptions: List[FeatureDescription],
        target_description: Optional[str] = None,
        max_features: Optional[int] = None,
        problem_type: Optional[ProblemType] = None,
        dataset_statistics: Optional[str] = None,
    ) -> FeatureEngineeringIdeas:
        """
        Generate feature engineering ideas.

        The prompt context is built by ``prompt_utils.generate_prompt_context``
        and then passed through the prompt/LLM chain in a single call.

        Parameters
        ----------
        feature_descriptions : List[FeatureDescription]
            Descriptions for input features
        target_description : Optional[str]
            Description of the target variable and task
        max_features : Optional[int]
            Maximum number of features to generate
        problem_type : Optional[ProblemType]
            Type of the supervised problem, if known
        dataset_statistics : Optional[str]
            Pre-formatted dataset statistics string

        Returns
        -------
        FeatureEngineeringIdeas
            Generated feature engineering ideas
        """
        prompt_context = prompt_utils.generate_prompt_context(
            feature_descriptions=feature_descriptions,
            target_description=target_description,
            problem_type=problem_type,
            max_features=max_features,
            dataset_statistics=dataset_statistics,
        )
        return self.chain.invoke(prompt_context)

    def generate_prompt_context(
        self,
        feature_descriptions: List[Dict[str, str]],
        target_description: Optional[str] = None,
        max_features: Optional[int] = None,
        problem_type: Optional[ProblemType] = None,
        dataset_statistics: Optional[str] = None,
    ) -> Dict[str, str]:
        """
        Generate the prompt context (template variables) for the LLM.

        Parameters
        ----------
        feature_descriptions : List[Dict[str, str]]
            List of dictionaries containing feature descriptions; each
            dictionary is unpacked into a ``FeatureDescription``
        target_description : Optional[str]
            Description of the target variable and task
        max_features : Optional[int]
            Maximum number of features to generate. A falsy value
            (``None`` or ``0``) adds no limit instruction to the prompt.
        problem_type : Optional[ProblemType]
            Type of the supervised problem, if known
        dataset_statistics : Optional[str]
            Pre-formatted dataset statistics string

        Returns
        -------
        Dict[str, str]
            Mapping from prompt template variable name to the text that
            should be substituted for it
        """
        # Convert dictionaries to FeatureDescription objects
        feature_descriptions_list = [
            FeatureDescription(**feature) for feature in feature_descriptions
        ]
        feature_descriptions_schema = FeatureDescriptions(
            features=feature_descriptions_list
        )

        # Every optional input falls back to an explicit placeholder so the
        # prompt never contains a literal "None".
        problem_type_message = (
            f"This is a supervised {problem_type} problem."
            if problem_type is not None
            else "Not specified."
        )
        target_description_message = (
            target_description if target_description is not None else "Not specified."
        )
        dataset_statistics_message = (
            dataset_statistics if dataset_statistics is not None else "Not provided."
        )
        additional_context = (
            f"Generate up to {max_features} features." if max_features else ""
        )

        transformation_types = get_transformation_types_for_prompt()
        # Sorted so the prompt text is stable from call to call.
        unary_types = ", ".join(sorted(get_unary_operation_types()))
        binary_types = ", ".join(sorted(get_binary_operation_types()))

        return {
            "feature_descriptions": feature_descriptions_schema.format(),
            "problem_type": problem_type_message,
            "target_description": target_description_message,
            "dataset_statistics": dataset_statistics_message,
            "additional_context": additional_context,
            "transformation_types": transformation_types,
            "unary_types": unary_types,
            "binary_types": binary_types,
        }

    def generate_engineered_features_iterative(
        self,
        prompt_context: Dict,
        conversation_history: List[BaseMessage],
        feedback_context: Optional[Dict] = None,
    ) -> Tuple[FeatureEngineeringIdeas, List[BaseMessage]]:
        """
        Generate feature engineering ideas in an iterative conversation.

        Parameters
        ----------
        prompt_context : Dict
            Prompt context dict. Only used on the first round, when
            ``conversation_history`` is empty.
        conversation_history : List[BaseMessage]
            Accumulated conversation messages. Empty on the first round.
            The list passed in is not modified.
        feedback_context : Optional[Dict]
            Feedback dict with keys ``selected_features_table``,
            ``rejected_features_table``, and ``max_features``.
            Required for rounds after the first.

        Returns
        -------
        Tuple[FeatureEngineeringIdeas, List[BaseMessage]]
            The generated ideas and the updated conversation history
            (input messages + AI response appended).

        Raises
        ------
        ValueError
            If ``conversation_history`` is non-empty but
            ``feedback_context`` is ``None``.
        """
        if not conversation_history:
            # First round: render the system + human prompt from the template.
            messages = self.prompt_template.format_messages(**prompt_context)
        else:
            # Fail early with a clear message instead of the opaque TypeError
            # that ``format(**None)`` would otherwise raise.
            if feedback_context is None:
                raise ValueError(
                    "feedback_context is required when conversation_history "
                    "is not empty"
                )
            feedback_message = SELECTION_FEEDBACK_PROMPT.format(**feedback_context)
            # Concatenation builds a new list, leaving the caller's history intact.
            messages = conversation_history + [HumanMessage(content=feedback_message)]

        ideas = self.llm.invoke(messages)
        # Store the structured response as JSON so later rounds can see it.
        updated_history = messages + [AIMessage(content=ideas.model_dump_json())]
        return ideas, updated_history