import logging
import glob
import tempfile
import os.path
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
import src.autofeatinsights.functions.relationship_functions as relationship_functions
import src.autofeatinsights.functions.tree_functions as tree_functions
import src.autofeatinsights.functions.feature_functions as feature_functions
import src.autofeatinsights.functions.evaluation_functions as evaluation_functions
from src.autofeatinsights.functions.helper_functions import RelevanceRedundancy, get_df_with_prefix
from typing import List, Set
from src.autofeatinsights.functions.classes import Weight, Tree, Result
import pandas as pd
from sklearn.model_selection import train_test_split
logging.getLogger().setLevel(logging.WARNING)
class FeatureDiscovery:
    """Entry point that ties together relationship discovery, join trees and evaluation.

    The instance is the shared state object: most methods simply hand ``self``
    to the ``relationship_functions``, ``tree_functions``, ``feature_functions``
    and ``evaluation_functions`` modules, which read and update its attributes.
    """

    # Project/pandas types are written as strings so the annotations are not
    # evaluated when the class body runs.
    targetColumn: str
    threshold: float
    paths: "List[Tree]"
    weights: "List[Weight]"
    results: "List[Result]"
    base_dataset: str
    partial_join: "pd.DataFrame"
    extra_tables: list
    exclude_tables: list
    partial_join_selected_features: dict
    join_keys: dict
    explore: bool
    non_null_ratio_threshold: float

    def __init__(self):
        """Create an instance with empty state; no base table or repository is set yet."""
        self.datasets = []
        self.weights = []
        self.results = []
        self.discovered: Set[str] = set()
        self.extra_tables = []
        # ``remove_table`` appends here. The historical misspelling
        # ``exlude_tables`` stays reachable through the property below.
        self.exclude_tables = []
        self.definite_features = []
        self.exclude_features = []
        self.temp_dir = tempfile.TemporaryDirectory()
        self.trees = []
        self.join_name_mapping = {}
        # Created per instance: as class-level ``{}`` defaults these dicts were
        # shared (and mutated) across every FeatureDiscovery object.
        self.partial_join_selected_features = {}
        self.join_keys = {}
        # Unknown until ``find_relationships`` runs; ``add_table`` and
        # ``remove_table`` test these against None before re-running.
        self.matcher = None
        self.relationship_threshold = None
        self.relation_threshold = None

    @property
    def exlude_tables(self):
        """Backward-compatible alias (original misspelling) of ``exclude_tables``."""
        return self.exclude_tables

    @exlude_tables.setter
    def exlude_tables(self, value):
        self.exclude_tables = value

    def set_base_table(self, base_table: str, target_column: str):
        """
        Sets the base table and target column for feature generation.

        The table is obtained through ``get_df_with_prefix``; every column except
        the target is registered as a selected feature of the base table.

        Args:
            base_table (str): The name of the base table.
            target_column (str): The name of the target column.

        Raises:
            ValueError: If ``target_column`` is not among the loaded columns
                (raised by ``list.remove``).
        """
        self.base_table = base_table
        self.targetColumn = target_column
        X_train = get_df_with_prefix(self.base_table, self.targetColumn)
        self.partial_join = X_train.copy()
        features = list(X_train.columns)
        features.remove(target_column)
        self.partial_join_selected_features[str(base_table)] = features
        self.join_keys[str(base_table)] = []
        self.tree_hash = {}
        self.rel_red = RelevanceRedundancy(target_column)

    def set_dataset_repository(self, dataset_repository: List[str] = None, all_tables: bool = False):
        """
        Sets the dataset repository for this FeatureDiscovery object.

        Args:
            dataset_repository (List[str]): Dataset directory names. Defaults to
                None, which is treated as an empty list.
            all_tables (bool): Use every entry found under ``data/benchmark/``.

        Raises:
            ValueError: If both dataset_repository and all_tables are specified.
            ValueError: If neither dataset_repository nor all_tables are specified.
        """
        # None instead of a shared mutable ``[]`` default.
        repository = [] if dataset_repository is None else dataset_repository
        if len(repository) > 0 and all_tables:
            raise ValueError("You can't set all_tables to True and specify a dataset repository.")
        if len(repository) == 0 and not all_tables:
            raise ValueError("You need to specify a dataset repository or set all_tables to True.")
        if all_tables:
            # basename instead of split("/") so the platform separator is handled.
            self.datasets = [os.path.basename(entry) for entry in glob.glob("data/benchmark/*")]
        else:
            self.datasets = repository

    def get_tables_repository(self):
        """
        Retrieves the CSV tables of every configured dataset.

        Returns:
            list: Entries of the form ``"<parent directory>/<file name>"`` for each
            ``*.csv`` found under ``data/benchmark/<dataset>/``.
        """
        tables = []
        for dataset in self.datasets:
            for path in glob.glob("data/benchmark/" + dataset + "/*.csv"):
                parent = os.path.basename(os.path.dirname(path))
                tables.append(parent + "/" + os.path.basename(path))
        return tables

    def add_table(self, table: str):
        """
        Adds an extra table to the list of tables used for feature generation.

        If a matcher and threshold are already known (``find_relationships`` has
        run), the relationships are re-run with them.

        Args:
            table (str): The name of the table to be added.
        """
        self.extra_tables.append(table)
        if self.relationship_threshold is not None and self.matcher is not None:
            relationship_functions.rerun(self, self.relationship_threshold, self.matcher)

    def remove_table(self, table: str):
        """
        Removes a table from the list of extra tables and adds it to the list of excluded tables.

        The table is added to the excluded list whether or not it was an extra
        table. Relationships are re-run when a matcher and threshold are known.

        Args:
            table (str): The name of the table to be removed.
        """
        if table in self.extra_tables:
            self.extra_tables.remove(table)
        self.exclude_tables.append(table)
        if self.relationship_threshold is not None and self.matcher is not None:
            relationship_functions.rerun(self, self.relationship_threshold, self.matcher)

    def get_weights_from_table(self, table: str):
        """
        Returns the weights whose ``from_table`` equals the specified table.

        Args:
            table (str): The name of the table.

        Returns:
            list: The matching entries of ``self.weights``.
        """
        return [i for i in self.weights if i.from_table == table]

    def get_weights_from_and_to_table(self, from_table, to_table):
        """
        Returns the weights that have the specified 'from_table' and 'to_table' values.

        Args:
            from_table (str): The source table name.
            to_table (str): The destination table name.

        Returns:
            list: The matching entries of ``self.weights``.
        """
        return [i for i in self.weights if i.from_table == from_table and i.to_table == to_table]

    def find_relationships(self, matcher="coma", relationship_threshold: float = 0.5, explain=False,
                           use_cache=True, verbose=True):
        """
        Finds relationships via ``relationship_functions.find_relationships``.

        The matcher and threshold are remembered so that ``add_table`` and
        ``remove_table`` can re-run the search later.

        Args:
            matcher (str, optional): The name of the matcher to use. Defaults to "coma".
            relationship_threshold (float, optional): The threshold value for a relationship. Defaults to 0.5.
            explain (bool, optional): Whether to provide an explanation. Defaults to False.
            use_cache (bool, optional): Whether to use a cache of previously computed relationships.
                Defaults to True.
            verbose (bool, optional): Whether to print verbose output. Defaults to True.
        """
        self.matcher = matcher
        self.relationship_threshold = relationship_threshold
        # Previous attribute name, kept for any code that still reads it.
        self.relation_threshold = relationship_threshold
        relationship_functions.find_relationships(self, relationship_threshold, matcher, explain,
                                                  use_cache=use_cache, verbose=verbose)

    def read_relationships(self, file_path):
        """
        Reads the relationships from a file via ``relationship_functions.read_relationships``.

        Args:
            file_path (str): The path to the file containing the relationships.
        """
        relationship_functions.read_relationships(self, file_path)

    def display_best_relationships(self):
        """
        Displays the best relationships found by FeatureDiscovery.
        """
        relationship_functions.display_best_relationships(self)

    def add_relationship(self, table1: str, col1: str, table2: str, col2: str, weight: float):
        """
        Adds a relationship between two columns in different tables.

        Args:
            table1 (str): The name of the first table.
            col1 (str): The name of the column in the first table.
            table2 (str): The name of the second table.
            col2 (str): The name of the column in the second table.
            weight (float): The weight of the relationship.
        """
        relationship_functions.add_relationship(self, table1, col1, table2, col2, weight)

    def remove_relationship(self, table1: str, col1: str, table2: str, col2: str):
        """
        Removes a relationship between two columns in different tables.

        Args:
            table1 (str): The name of the first table.
            col1 (str): The name of the column in the first table.
            table2 (str): The name of the second table.
            col2 (str): The name of the column in the second table.
        """
        relationship_functions.remove_relationship(self, table1, col1, table2, col2)

    def update_relationship(self, table1: str, col1: str, table2: str, col2: str, weight: float):
        """
        Updates the relationship between two tables and their respective columns with a given weight.

        Args:
            table1 (str): The name of the first table.
            col1 (str): The name of the column in the first table.
            table2 (str): The name of the second table.
            col2 (str): The name of the column in the second table.
            weight (float): The weight of the relationship.
        """
        relationship_functions.update_relationship(self, table1, col1, table2, col2, weight)

    def display_table_relationship(self, table1: str, table2: str):
        """
        Displays the relationship between two tables.

        Args:
            table1 (str): The name of the first table.
            table2 (str): The name of the second table.
        """
        relationship_functions.display_table_relationship(self, table1, table2)

    def compute_join_trees(self, top_k_features: int = 10, non_null_threshold=0.5, explain=False, verbose=True):
        """
        Computes join trees via ``tree_functions.compute_join_trees``.

        Args:
            top_k_features (int): Number of top features to select. Defaults to 10.
            non_null_threshold (float): Threshold for non-null ratio. Defaults to 0.5.
            explain (bool): Whether to explain the join trees. Defaults to False.
            verbose (bool): Whether to print verbose output. Defaults to True.
        """
        tree_functions.compute_join_trees(self, top_k_features, non_null_ratio_threshold=non_null_threshold,
                                          explain=explain, verbose=verbose)

    def show_features(self, tree_id: int, show_discarded_features: bool = False):
        """
        Displays the features for a given tree ID.

        Args:
            tree_id (int): The ID of the tree.
            show_discarded_features (bool): Whether to show discarded features or not. Default is False.
        """
        feature_functions.show_features(self, tree_id, show_discarded_features)

    def display_join_trees(self, top_k: int = None):
        """
        Displays the join trees of this FeatureDiscovery instance.

        Args:
            top_k (int): The number of join trees to display. If None, display all join trees.
        """
        tree_functions.display_join_trees(self, top_k)

    def display_join_tree(self, tree_id):
        """
        Displays the join tree with the given tree_id.

        Args:
            tree_id: The ID of the join tree to display.
        """
        tree_functions.display_join_tree(self, tree_id)

    def explain_relationship(self, table1: str, table2: str):
        """
        Explains the relationship between two tables.

        Args:
            table1 (str): The name of the first table.
            table2 (str): The name of the second table.
        """
        relationship_functions.explain_relationship(self, table1, table2)

    def explain_tree(self, tree_id: int):
        """
        Explains the tree identified by the given tree_id.

        Args:
            tree_id (int): The ID of the tree to explain.
        """
        tree_functions.explain_tree(self, tree_id)

    def remove_join_path_from_tree(self, tree_id: int, table: str):
        """
        Removes a join from the tree via ``tree_functions.remove_join_from_tree``.

        Args:
            tree_id (int): The ID of the tree.
            table (str): The name of the table to remove the join path from.
        """
        tree_functions.remove_join_from_tree(self, tree_id, table)

    def explain_result(self, tree_id: int, model: str = 'GBM'):
        """
        Explains the result of a specific tree.

        Args:
            tree_id (int): The ID of the tree to explain.
            model (str, optional): The model to use for explanation. Defaults to 'GBM'.
        """
        evaluation_functions.explain_result(self, tree_id, model)

    def inspect_join_tree(self, tree_id: int):
        """
        Inspects the join tree with the given tree_id.

        Args:
            tree_id (int): The ID of the join tree to inspect.
        """
        tree_functions.inspect_join_tree(self, tree_id)

    def evaluate_trees(self, algorithm='GBM', top_k_paths: int = 3, verbose=True, explain=False):
        """
        Evaluates the performance of the generated trees.

        Args:
            algorithm (str): The algorithm to use for evaluation. Default is 'GBM'.
            top_k_paths (int): The number of top paths to consider. Default is 3.
            verbose (bool): Whether to print verbose output. Default is True.
            explain (bool): Whether to explain the evaluation results. Default is False.
        """
        # NOTE(review): "evalute_trees" is the spelling this code has always called;
        # confirm it matches the name defined in evaluation_functions.
        evaluation_functions.evalute_trees(self, algorithm, top_k_paths, verbose=verbose, explain=explain)

    def get_best_result(self):
        """
        Returns the best result as reported by ``evaluation_functions.get_best_result``.
        """
        return evaluation_functions.get_best_result(self)

    def evaluate_augmented_table(self, tree_id: int, algorithm='GBM', verbose=False):
        """
        Evaluates the augmented table for one tree and then explains the result.

        Args:
            tree_id (int): The ID of the tree to use for evaluation.
            algorithm (str): The algorithm to use for evaluation. Default is 'GBM'.
            verbose (bool): Whether to print verbose output. Default is False.
        """
        evaluation_functions.evaluate_table(self, algorithm, tree_id, verbose)
        evaluation_functions.explain_result(self, tree_id, algorithm)

    def adjust_relevance_value(self, tree_id: int, feature: str, value: float):
        """
        Adjusts the relevance value of a feature for a specific tree.

        Args:
            tree_id (int): The ID of the tree.
            feature (str): The name of the feature.
            value (float): The new relevance value.
        """
        feature_functions.adjust_relevance_value(self, tree_id, feature, value)

    def adjust_redundancy_value(self, tree_id: int, feature: str, value: float):
        """
        Adjusts the redundancy value for a specific feature in a given tree.

        Args:
            tree_id (int): The ID of the tree.
            feature (str): The name of the feature.
            value (float): The new redundancy value.
        """
        feature_functions.adjust_redundancy_value(self, tree_id, feature, value)

    def adjust_non_null_ratio(self, tree_id: int, table: str, value: float):
        """
        Adjusts the non-null ratio for a specific tree and table.

        Args:
            tree_id (int): The ID of the tree.
            table (str): The name of the table.
            value (float): The new non-null ratio value.
        """
        feature_functions.adjust_non_null_ratio(self, tree_id, table, value)

    def move_features_to_discarded(self, tree_id: int, features: List[str]):
        """
        Moves the specified features to the discarded list for the given tree.

        Args:
            tree_id (int): The ID of the tree.
            features (list[str]): The list of features to be moved to the discarded list.
        """
        feature_functions.move_features_to_discarded(self, tree_id, features)

    def move_features_to_selected(self, tree_id: int, features: List[str]):
        """
        Moves the specified features from discarded to the selected features list for the given tree.

        Args:
            tree_id (int): The ID of the tree.
            features (list[str]): The list of features to be moved.
        """
        feature_functions.move_features_to_selected(self, tree_id, features)

    def materialise_join_tree(self, tree_id: int):
        """
        Materializes the join tree with the given tree_id.

        Args:
            tree_id (int): The ID of the join tree to materialize.

        Returns:
            Whatever ``tree_functions.materialise_join_tree`` returns for the tree.
        """
        return tree_functions.materialise_join_tree(self, tree_id)

    def augment_dataset(self, algorithm="GBM", relation_threshold: float = 0.5, non_null_threshold=0.5, matcher="coma",
                        top_k_features: int = 10,
                        top_k_paths: int = 3, explain=True, verbose=True, use_cache=True):
        """
        Runs the full pipeline: relationships, join trees, then tree evaluation.

        Relationships are read from ``saved_weights/<base_table>_<threshold>_<matcher>_weights.txt``
        when ``use_cache`` is True and that file exists; otherwise they are computed
        with ``find_relationships``. ``set_base_table`` must have been called first
        because the cache file name uses ``self.base_table``.

        Args:
            algorithm (str): The algorithm to use for tree evaluation. Default is "GBM".
            relation_threshold (float): The threshold for considering a relationship. Default is 0.5.
            non_null_threshold: The non-null ratio threshold. Default is 0.5.
            matcher (str): The matcher to use for finding relationships. Default is "coma".
            top_k_features (int): The number of top features to select. Default is 10.
            top_k_paths (int): The number of top paths to select. Default is 3.
            explain (bool): Whether to explain the process. Default is True.
            verbose (bool): Whether to print verbose output. Default is True.
            use_cache (bool): Whether to use cached relationship weights. Default is True.
        """
        cache_file = f"saved_weights/{self.base_table}_{relation_threshold}_{matcher}_weights.txt"
        if use_cache and os.path.isfile(cache_file):
            if verbose:
                print("Reading from cache file: " + cache_file)
            self.read_relationships(cache_file)
        else:
            # Forward use_cache so that use_cache=False is honoured downstream;
            # previously find_relationships always ran with its default of True.
            self.find_relationships(relationship_threshold=relation_threshold, matcher=matcher,
                                    explain=explain, use_cache=use_cache, verbose=verbose)
        self.compute_join_trees(top_k_features=top_k_features, explain=explain, non_null_threshold=non_null_threshold,
                                verbose=verbose)
        if verbose:
            print(self.trees)
        self.evaluate_trees(algorithm=algorithm, top_k_paths=top_k_paths, verbose=verbose, explain=explain)
if __name__ == "__main__":
    # Demo run when the module is executed as a script: use "school/base.csv"
    # as the base table with "class" as the target, restrict the repository to
    # the "school" dataset, then run the whole augmentation pipeline.
    # (Older commented-out scratch calls that lived here were dropped; several
    # referred to method names this class does not define.)
    discovery = FeatureDiscovery()
    discovery.set_base_table(base_table="school/base.csv", target_column="class")
    discovery.set_dataset_repository(dataset_repository=["school"])
    discovery.augment_dataset(
        algorithm="LR",
        non_null_threshold=0.65,
        top_k_features=15,
        top_k_paths=30,
    )