# Source code for src.autofeatinsights.autofeat_class

import logging
import glob
import tempfile
import os.path
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
import src.autofeatinsights.functions.relationship_functions as relationship_functions
import src.autofeatinsights.functions.tree_functions as tree_functions
import src.autofeatinsights.functions.feature_functions as feature_functions
import src.autofeatinsights.functions.evaluation_functions as evaluation_functions
from src.autofeatinsights.functions.helper_functions import RelevanceRedundancy, get_df_with_prefix
from typing import List, Set
from src.autofeatinsights.functions.classes import Weight, Tree, Result
import pandas as pd
from sklearn.model_selection import train_test_split
logging.getLogger().setLevel(logging.WARNING)



[docs]
class FeatureDiscovery:
    """
    State holder and facade for the feature-discovery workflow.

    The methods of this class mostly forward to the ``relationship_functions``,
    ``tree_functions``, ``feature_functions`` and ``evaluation_functions``
    modules, passing the instance along so those modules can read and update
    the attributes declared here.
    """

    targetColumn: str
    threshold: float
    paths: List[Tree]
    weights: List[Weight]
    results: List[Result]
    base_dataset: str
    partial_join: pd.DataFrame
    # add_table() appends plain table names to this list.
    extra_tables: List[str]
    # Historical misspelling; kept because it is the name __init__ has always set.
    exlude_tables: List[str]
    # Correctly spelled alias of ``exlude_tables`` (same list object).
    exclude_tables: List[str]
    # Per-instance dicts, created in __init__ (previously shared class-level dicts).
    partial_join_selected_features: dict
    join_keys: dict
    explore: bool
    non_null_ratio_threshold: float

    def __init__(self):
        """
        Create an empty discovery session.

        All containers are created per instance so that two ``FeatureDiscovery``
        objects never share selected features, join keys or table lists.
        """
        self.datasets = []
        self.weights = []
        self.results = []
        self.discovered: Set[str] = set()
        self.extra_tables = []
        self.exlude_tables = []
        # remove_table() historically used the correctly spelled name; expose
        # both names on the same list so either spelling works.
        self.exclude_tables = self.exlude_tables
        self.definite_features = []
        self.exclude_features = []
        # Instance-level state: a class-level ``{}`` would be mutated by every
        # instance through set_base_table().
        self.partial_join_selected_features = {}
        self.join_keys = {}
        # Filled in by find_relationships(); initialised here so that
        # add_table()/remove_table() can test them before any matching has run.
        self.matcher = None
        self.relation_threshold = None
        self.relationship_threshold = None
        self.temp_dir = tempfile.TemporaryDirectory()
        self.trees = []
        self.join_name_mapping = {}


[docs]
    def set_base_table(self, base_table: str, target_column: str):
        """
        Register the base table and the column to predict.

        The table is loaded through ``get_df_with_prefix`` and a copy of it
        becomes the initial ``partial_join``. Every loaded column except the
        target is recorded as a selected feature of the base table, its join
        keys are reset to an empty list, ``tree_hash`` is cleared and a fresh
        ``RelevanceRedundancy`` helper is created for the target column.

        Args:
            base_table (str): The name of the base table.
            target_column (str): The name of the target column.

        Returns:
            None
        """
        self.base_table = base_table
        self.targetColumn = target_column

        base_frame = get_df_with_prefix(self.base_table, self.targetColumn)
        self.partial_join = base_frame.copy()

        table_key = str(base_table)
        candidate_features = list(base_frame.columns)
        # The target must be among the loaded columns; list.remove raises otherwise.
        candidate_features.remove(target_column)
        self.partial_join_selected_features[table_key] = candidate_features
        self.join_keys[table_key] = []

        self.tree_hash = {}
        self.rel_red = RelevanceRedundancy(target_column)



[docs]
    def set_dataset_repository(self, dataset_repository: List[str] = [], all_tables: bool = False):
        """
        Sets the dataset repository for the AutofeatClass object.

        Parameters:
        - dataset_repository (List[str]): A list of dataset paths.
        - all_tables (bool): Flag indicating whether to use all tables in the repository.

        Raises:
        - Exception: If both dataset_repository and all_tables are specified.
        - Exception: If neither dataset_repository nor all_tables are specified.

        """
        if len(dataset_repository) > 0 and all_tables:
            raise Exception("You can't set all_tables to True and specify a dataset repository.")
        if len(dataset_repository) == 0 and not all_tables:
            raise Exception("You need to specify a dataset repository or set all_tables to True.")
        if all_tables:
            datasets = [i.split("/")[-1] for i in glob.glob("data/benchmark/*")]
            self.datasets = datasets
        else:
            self.datasets = dataset_repository



[docs]
    def get_tables_repository(self):
        """
        Retrieves the tables from the repository.

        Returns:
            tables (list): A list of table paths.
        """
        tables = []
        for dataset in self.datasets:
            for table in glob.glob("data/benchmark/" + dataset + "/*.csv"):
                tables.append((table.split("/")[-2]) + "/" + table.split("/")[-1])
        return tables



[docs]
    def add_table(self, table: str):
        """
        Adds an extra table to the list of tables used for feature generation.

        Args:
            table (str): The name of the table to be added.
        """
        self.extra_tables.append(table)
        if self.relationship_threshold is not None and self.matcher is not None:
            relationship_functions.rerun(self, self.relationship_threshold, self.matcher)



[docs]
    def remove_table(self, table: str):
        """
        Removes a table from the list of extra tables and adds it to the list of excluded tables.
        
        Args:
            table (str): The name of the table to be removed.
        """
        if table in self.extra_tables:
            self.extra_tables.remove(table)
        self.exclude_tables.append(table)
        if self.relationship_threshold is not None and self.matcher is not None:
            relationship_functions.rerun(self, self.relationship_threshold, self.matcher)



[docs]
    def get_weights_from_table(self, table: str):
        """
        Returns a list of weights from the specified table.

        Args:
            table (str): The name of the table.

        Returns:
            list: A list of weights from the specified table.
        """
        return [i for i in self.weights if i.from_table == table]

    

[docs]
    def get_weights_from_and_to_table(self, from_table, to_table):
        """
        Returns a list of weights that have the specified 'from_table' and 'to_table' values.

        Parameters:
            from_table (str): The source table name.
            to_table (str): The destination table name.

        Returns:
            list: A list of weights that match the specified 'from_table' and 'to_table' values.
        """
        return [i for i in self.weights if i.from_table == from_table and i.to_table == to_table]



[docs]
    def find_relationships(self, matcher="coma", relationship_threshold: float = 0.5, explain=False,
                           use_cache=True, verbose=True):
        """
        Finds relationships between features in the dataset.

        The chosen matcher and threshold are remembered on the instance so that
        ``add_table``/``remove_table`` can re-run the matching later; the work
        itself is done by ``relationship_functions.find_relationships``.

        Args:
            matcher (str, optional): The name of the matcher to use for finding relationships. Defaults to "coma".
            relationship_threshold (float, optional): The threshold value for determining the strength of a relationship. Defaults to 0.5.
            explain (bool, optional): Whether to provide an explanation for the relationships found. Defaults to False.
            use_cache (bool, optional): Whether to use a cache for storing previously computed relationships. Defaults to True.
            verbose (bool, optional): Whether to print verbose output during the process. Defaults to True.
        """
        self.matcher = matcher
        self.relation_threshold = relationship_threshold
        # add_table()/remove_table() read the threshold under this spelling;
        # store it under both names so the re-run logic can find it.
        self.relationship_threshold = relationship_threshold
        relationship_functions.find_relationships(self, relationship_threshold, matcher, explain,
                                                  use_cache=use_cache, verbose=verbose)



[docs]
    def read_relationships(self, file_path):
        """
        Load previously saved relationships into this object.

        Delegates to ``relationship_functions.read_relationships``.

        Args:
            file_path (str): Location of the file that holds the relationships.
        """
        relationship_functions.read_relationships(
            self,
            file_path,
        )



[docs]
    def display_best_relationships(self):
        """
        Show the best relationships known to this FeatureDiscovery instance.

        Delegates to ``relationship_functions.display_best_relationships``.
        """
        relationship_functions.display_best_relationships(
            self,
        )



[docs]
    def add_relationship(self, table1: str, col1: str, table2: str, col2: str, weight: float):
        """
        Register a weighted relationship between a column of one table and a column of another.

        Delegates to ``relationship_functions.add_relationship``.

        Args:
            table1 (str): Name of the first table.
            col1 (str): Column of the first table taking part in the relationship.
            table2 (str): Name of the second table.
            col2 (str): Column of the second table taking part in the relationship.
            weight (float): Weight assigned to the relationship.
        """
        relationship_functions.add_relationship(
            self, table1, col1, table2, col2, weight
        )



[docs]
    def remove_relationship(self, table1: str, col1: str, table2: str, col2: str):
        """
        Drop the relationship between a column of one table and a column of another.

        Delegates to ``relationship_functions.remove_relationship``.

        Args:
            table1 (str): Name of the first table.
            col1 (str): Column of the first table.
            table2 (str): Name of the second table.
            col2 (str): Column of the second table.
        """
        relationship_functions.remove_relationship(
            self, table1, col1, table2, col2
        )



[docs]
    def update_relationship(self, table1: str, col1: str, table2: str, col2: str, weight: float):
        """
        Assign a new weight to the relationship between two table columns.

        Delegates to ``relationship_functions.update_relationship``.

        Args:
            table1 (str): Name of the first table.
            col1 (str): Column of the first table.
            table2 (str): Name of the second table.
            col2 (str): Column of the second table.
            weight (float): Weight to store for the relationship.
        """
        relationship_functions.update_relationship(
            self, table1, col1, table2, col2, weight
        )

    

[docs]
    def display_table_relationship(self, table1: str, table2: str):
        """
        Show how two tables are related.

        Delegates to ``relationship_functions.display_table_relationship``.

        Args:
            table1 (str): Name of the first table.
            table2 (str): Name of the second table.
        """
        relationship_functions.display_table_relationship(
            self, table1, table2
        )



[docs]
    def compute_join_trees(self, top_k_features: int = 10, non_null_threshold=0.5, explain=False, verbose=True):
        """
        Build the join trees used for feature selection.

        Delegates to ``tree_functions.compute_join_trees``; ``non_null_threshold``
        is forwarded under the keyword ``non_null_ratio_threshold``.

        Args:
            top_k_features (int): How many top features to select. Defaults to 10.
            non_null_threshold (float): Non-null ratio threshold. Defaults to 0.5.
            explain (bool): Whether the join trees should be explained. Defaults to False.
            verbose (bool): Whether verbose output is printed. Defaults to True.
        """
        tree_functions.compute_join_trees(
            self,
            top_k_features,
            non_null_ratio_threshold=non_null_threshold,
            explain=explain,
            verbose=verbose,
        )



[docs]
    def show_features(self, tree_id: int, show_discarded_features: bool = False):
        """
        Show the features belonging to one tree.

        Delegates to ``feature_functions.show_features``.

        Args:
            tree_id (int): Identifier of the tree.
            show_discarded_features (bool): Also show the discarded features. Default is False.
        """
        feature_functions.show_features(
            self, tree_id, show_discarded_features
        )



[docs]
    def display_join_trees(self, top_k: int = None):
        """
        Show the join trees held by this instance.

        Delegates to ``tree_functions.display_join_trees``.

        Args:
            top_k (int): How many join trees to show. None shows all of them.
        """
        tree_functions.display_join_trees(
            self,
            top_k,
        )

    

[docs]
    def display_join_tree(self, tree_id):
        """
        Show a single join tree.

        Delegates to ``tree_functions.display_join_tree``.

        Parameters:
        - tree_id: Identifier of the join tree to show.
        """
        tree_functions.display_join_tree(
            self,
            tree_id,
        )



[docs]
    def explain_relationship(self, table1: str, table2: str):
        """
        Give an explanation of how two tables are related.

        Delegates to ``relationship_functions.explain_relationship``.

        Args:
            table1 (str): Name of the first table.
            table2 (str): Name of the second table.
        """
        relationship_functions.explain_relationship(
            self, table1, table2
        )

    

[docs]
    def explain_tree(self, tree_id: int):
        """
        Give an explanation of one tree.

        Delegates to ``tree_functions.explain_tree``.

        Args:
            tree_id (int): Identifier of the tree to explain.
        """
        tree_functions.explain_tree(
            self,
            tree_id,
        )



[docs]
    def remove_join_path_from_tree(self, tree_id: int, table: str):
        """
        Take the join involving a table out of a tree.

        Delegates to ``tree_functions.remove_join_from_tree``.

        Args:
            tree_id (int): Identifier of the tree.
            table (str): Table whose join path is removed.
        """
        tree_functions.remove_join_from_tree(
            self, tree_id, table
        )



[docs]
    def explain_result(self, tree_id: int, model: str = 'GBM'):
        """
        Give an explanation of the result obtained for one tree.

        Delegates to ``evaluation_functions.explain_result``.

        Args:
            tree_id (int): Identifier of the tree whose result is explained.
            model (str, optional): Model the explanation refers to. Defaults to 'GBM'.
        """
        evaluation_functions.explain_result(
            self, tree_id, model
        )



[docs]
    def inspect_join_tree(self, tree_id: int):
        """
        Inspect a single join tree.

        Delegates to ``tree_functions.inspect_join_tree``.

        Parameters:
            tree_id (int): Identifier of the join tree to inspect.
        """
        tree_functions.inspect_join_tree(
            self,
            tree_id,
        )



[docs]
    def evaluate_trees(self, algorithm='GBM', top_k_paths: int = 3, verbose=True, explain=False):
        """
        Evaluate how well the generated trees perform.

        Delegates to ``evaluation_functions.evalute_trees`` (the misspelling is
        the name used by that module).

        Parameters:
        - algorithm (str): Algorithm used for the evaluation. Default is 'GBM'.
        - top_k_paths (int): How many top paths are considered. Default is 3.
        - verbose (bool): Whether verbose output is printed. Default is True.
        - explain (bool): Whether the evaluation results are explained. Default is False.
        """
        evaluation_functions.evalute_trees(
            self,
            algorithm,
            top_k_paths,
            verbose=verbose,
            explain=explain,
        )



[docs]
    def get_best_result(self):
        """
        Return the best result, as determined by ``evaluation_functions.get_best_result``.
        """
        best = evaluation_functions.get_best_result(self)
        return best



[docs]
    def evaluate_augmented_table(self, tree_id: int, algorithm='GBM', verbose=False):
        """
        Evaluate the table augmented by one tree, then explain the outcome.

        Two steps run in order: ``evaluation_functions.evaluate_table`` followed
        by ``evaluation_functions.explain_result`` for the same tree and algorithm.

        Parameters:
        - tree_id (int): Identifier of the tree to evaluate.
        - algorithm (str): Algorithm used for the evaluation. Default is 'GBM'.
        - verbose (bool): Whether verbose output is printed. Default is False.
        """
        # Step 1: evaluate (note the argument order expected by evaluate_table).
        evaluation_functions.evaluate_table(
            self, algorithm, tree_id, verbose
        )
        # Step 2: explain the result that was just produced.
        evaluation_functions.explain_result(
            self, tree_id, algorithm
        )



[docs]
    def adjust_relevance_value(self, tree_id: int, feature: str, value: float):
        """
        Set a new relevance value for one feature of one tree.

        Delegates to ``feature_functions.adjust_relevance_value``.

        Args:
            tree_id (int): Identifier of the tree.
            feature (str): Name of the feature.
            value (float): Relevance value to apply.

        Returns:
            None
        """
        feature_functions.adjust_relevance_value(
            self, tree_id, feature, value
        )



[docs]
    def adjust_redundancy_value(self, tree_id: int, feature: str, value: float):
        """
        Set a new redundancy value for one feature of one tree.

        Delegates to ``feature_functions.adjust_redundancy_value``.

        Args:
            tree_id (int): Identifier of the tree.
            feature (str): Name of the feature.
            value (float): Redundancy value to apply.
        """
        feature_functions.adjust_redundancy_value(
            self, tree_id, feature, value
        )



[docs]
    def adjust_non_null_ratio(self, tree_id: int, table: str, value: float):
        """
        Set a new non-null ratio for one table of one tree.

        Delegates to ``feature_functions.adjust_non_null_ratio``.

        Args:
            tree_id (int): Identifier of the tree.
            table (str): Name of the table.
            value (float): Non-null ratio value to apply.
        """
        feature_functions.adjust_non_null_ratio(
            self, tree_id, table, value
        )



[docs]
    def move_features_to_discarded(self, tree_id: int, features: List[str]):
        """
        Moves the specified features to the discarded list for the given tree.

        Delegates to ``feature_functions.move_features_to_discarded``.

        Args:
            tree_id (int): The ID of the tree.
            features (List[str]): The features to be moved to the discarded list.
        """
        feature_functions.move_features_to_discarded(self, tree_id, features)



[docs]
    def move_features_to_selected(self, tree_id: int, features: List[str]):
        """
        Moves the specified features from discarded to the selected features list for the given tree.

        Delegates to ``feature_functions.move_features_to_selected``.

        Args:
            tree_id (int): The ID of the tree.
            features (List[str]): The features to be moved.
        """
        feature_functions.move_features_to_selected(self, tree_id, features)



[docs]
    def materialise_join_tree(self, tree_id: int):
        """
        Materialise one join tree.

        Delegates to ``tree_functions.materialise_join_tree`` and hands back
        whatever that function returns.

        Args:
            tree_id (int): Identifier of the join tree to materialise.

        Returns:
            The materialised join tree.
        """
        materialised = tree_functions.materialise_join_tree(self, tree_id)
        return materialised



[docs]
    def augment_dataset(self, algorithm="GBM", relation_threshold: float = 0.5, non_null_threshold=0.5, matcher="coma", 
                        top_k_features: int = 10, 
                        top_k_paths: int = 3, explain=True, verbose=True, use_cache=True):
        """
        Augments the dataset by finding relationships between features, computing join trees, and evaluating the trees.
        
        Args:
            algorithm (str): The algorithm to use for tree evaluation. Default is "GBM".
            relation_threshold (float): The threshold for considering a relationship between features. Default is 0.5.
            non_null_threshold: The threshold for considering a feature as non-null. Default is 0.5.
            matcher (str): The matcher to use for finding relationships. Default is "coma".
            top_k_features (int): The number of top features to select. Default is 10.
            top_k_paths (int): The number of top paths to select. Default is 3.
            explain (bool): Whether to explain the process. Default is True.
            verbose (bool): Whether to print verbose output. Default is True.
            use_cache (bool): Whether to use cached relationship weights. Default is True.
        """
        if use_cache:
            if os.path.isfile(f"saved_weights/{self.base_table}_{relation_threshold}_{matcher}_weights.txt"):
                if verbose:
                    print("Reading from cache file: " + f"saved_weights/{self.base_table}_{relation_threshold}_{matcher}_weights.txt")
                    self.read_relationships(f"saved_weights/{self.base_table}_{relation_threshold}_{matcher}_weights.txt")
            else:
                self.find_relationships(relationship_threshold=relation_threshold, matcher=matcher, 
                                        explain=explain, verbose=verbose)
        else:
            self.find_relationships(relationship_threshold=relation_threshold, matcher=matcher, 
                                    explain=explain, verbose=verbose)
        self.compute_join_trees(top_k_features=top_k_features, explain=explain, non_null_threshold=non_null_threshold, 
                                verbose=verbose)
        print(self.trees)
        self.evaluate_trees(algorithm=algorithm, top_k_paths=top_k_paths, explain=explain)




if __name__ == "__main__":
    # Example run on the "school" dataset: predict the "class" column of
    # school/base.csv and let augment_dataset() drive the whole pipeline.
    # The individual steps (find_relationships, read_relationships,
    # compute_join_trees, display_join_trees, show_features,
    # materialise_join_tree, evaluate_trees, ...) can also be called one by one.
    autofeat = FeatureDiscovery()
    autofeat.set_base_table("school/base.csv", "class")
    autofeat.set_dataset_repository(["school"])
    autofeat.augment_dataset(
        algorithm="LR",
        non_null_threshold=0.65,
        top_k_features=15,
        top_k_paths=30,
    )