# Current Path : /home/deltalab/PMS/recommendations/recomsys-datapreparation-batch/_library/
# Current File : /home/deltalab/PMS/recommendations/recomsys-datapreparation-batch/_library/mine_assRules_utils.py
from collections import defaultdict
from json import dumps
from pickle import TRUE
from pandas import Timedelta
from PAMI.frequentPattern.basic import FPGrowth
from mlxtend.frequent_patterns import association_rules
from pandas import ExcelWriter, Series
from os import path
import numpy as np
import pandas as pd
def preProcessing_transaction(baskets, item_separator, product_identifier):
    """Prepare a basket DataFrame for the pattern miner.

    Renames the product column to 'Transactions' (the column name the mining
    library requires) and suffixes every item with item_separator so that the
    items inside the mined pattern strings can be split apart again later.
    The input DataFrame is left untouched; a prepared copy is returned.
    """
    target_col = 'Transactions'  # workaround: the library wants exactly this column name
    prepared_df = baskets.rename(columns = {product_identifier: target_col})
    # Append the separator to each item of each basket (new lists, input untouched)
    prepared_df[target_col] = prepared_df[target_col].apply(
        lambda items: [single_item + item_separator for single_item in items])
    return prepared_df
def compute_frequentPatterns(transactions, item_separator, min_support = 0.1, limit_dim_pattern = 5):
    """Mine frequent co-purchase patterns with PAMI's FPGrowth.

    Parameters
    ----------
    transactions : transaction data accepted by FPGrowth — presumably the
        DataFrame produced by preProcessing_transaction (a 'Transactions'
        column of item lists); len(transactions) is used as the order total.
    item_separator : str
        Separator appended to each item during pre-processing; used here to
        split the mined pattern strings back into item lists.
    min_support : minimum support forwarded to FPGrowth (default 0.1).
    limit_dim_pattern : int
        Keep only patterns with at most this many items; -1 disables the filter.

    Returns a DataFrame with columns 'Patterns' (list of items),
    'Num. pattern items', 'PercentageSupport', 'Support', sorted by pattern
    size and support (descending).
    """
    data_miner = FPGrowth.FPGrowth(transactions, minSup = min_support)
    data_miner.startMine()
    print('Runtime:', Timedelta(data_miner.getRuntime(), unit = 'sec'))
    # Get the patterns as a DataFrame (expected columns: 'Patterns', 'Support')
    frequentPatterns_df = data_miner.getPatternsAsDataFrame() #.getPatterns()
    # Split each pattern string back into its items.
    # NOTE(review): str.rstrip strips any trailing chars from the separator SET,
    # not the exact suffix — fine for a single-char separator; verify otherwise.
    frequentPatterns_df.loc[:, 'Patterns'] = frequentPatterns_df['Patterns'].str.rstrip(item_separator).str.split(item_separator.strip())
    # Remove whitespaces within each element within each basket
    frequentPatterns_df.loc[:, 'Patterns'] = frequentPatterns_df['Patterns'].apply(
        lambda item_list: [item.strip() for item in item_list])
    # Count the items of each pattern
    frequentPatterns_df.insert(loc = 1, column = 'Num. pattern items',
                               value = frequentPatterns_df['Patterns'].apply(lambda items: len(items)))
    # Sort: biggest patterns first, then by support
    frequentPatterns_df.sort_values(by = ['Num. pattern items', 'Support'], ascending = False, inplace = True)
    frequentPatterns_df.reset_index(inplace = True, drop = True)
    # Compute the relative support: 'Support' appears to be an absolute count,
    # hence the division by the total number of transactions — TODO confirm with PAMI docs
    total_orders = len(transactions)
    frequentPatterns_df.insert(2 ,'PercentageSupport', round(frequentPatterns_df['Support'] / total_orders, 4))
    # Optionally keep only patterns up to the requested size (-1 = no limit)
    if limit_dim_pattern != -1:
        filter_cond = frequentPatterns_df['Num. pattern items'] <= limit_dim_pattern
        frequentPatterns_df = frequentPatterns_df[filter_cond]
        frequentPatterns_df = frequentPatterns_df.reset_index(drop = True)
    return frequentPatterns_df
def generate_associationRules(frequentPatterns, min_confidence, min_lift, verbose = False):
    """Derive, filter and format association rules from frequent itemsets.

    Parameters
    ----------
    frequentPatterns : DataFrame accepted by mlxtend's association_rules
        (frequent itemsets with a 'support' column).
    min_confidence : float — minimum confidence threshold (condition 1).
    min_lift : float — rules must have lift strictly greater than this (condition 2).
    verbose : bool — print the whole rules DataFrame instead of just the count.

    Returns the sorted rules with antecedent/consequent dimensions added and
    the metric columns reordered to end with lift, confidence, support.
    """
    # Mine association rules (Condition 1: confidence)
    association_rules_df = association_rules(frequentPatterns, metric = "confidence", min_threshold = min_confidence)
    # Filter the rules (Condition 2: lift). The .copy() detaches the slice so the
    # column assignments below do not hit a view (SettingWithCopyWarning / silent no-op).
    association_rules_df = association_rules_df[association_rules_df['lift'] > min_lift].copy()
    # Turn the frozensets into plain lists
    association_rules_df['antecedents'] = association_rules_df['antecedents'].map(list)
    association_rules_df['consequents'] = association_rules_df['consequents'].map(list)
    # Round numerical values ('lift' gets a coarser rounding)
    association_rules_df = association_rules_df.round(decimals = 4)
    association_rules_df['lift'] = association_rules_df['lift'].round(2)
    # Add the dimension of the antecedent/consequent baskets
    association_rules_df.insert(loc = 2, column='antecedent dimension', value = association_rules_df['antecedents'].map(len))
    association_rules_df.insert(loc = 4, column='consequents dimension', value = association_rules_df['consequents'].map(len))
    # Sort the values: smallest antecedents first, then strongest metrics
    association_rules_df.sort_values(by = ['antecedent dimension', 'confidence', 'lift', 'support'],
                                     ascending = [True, False, False, False], inplace = True)
    # Reset the index
    association_rules_df = association_rules_df.reset_index(drop = True)
    # Drop unnecessary columns and move the main metrics to the end
    association_rules_df.drop(columns = ['leverage', 'conviction'], inplace = True)
    new_col_order = association_rules_df.columns[:6].tolist() + ['lift', 'confidence', 'support']
    association_rules_df = association_rules_df.reindex(columns = new_col_order)
    # Report the outcome
    if len(association_rules_df) > 0:
        if verbose:
            print(association_rules_df)
        else:
            print("\nAssociation rules -->", len(association_rules_df), "\n")
    else:
        print("No association rules available using these parameters"\
              f"\n MIN CONFIDENCE: {min_confidence}\nMIN LIFT: {min_lift}")
    return association_rules_df
def find_commonLinkedCollections(products_w_linkedCollections, products, force_common_collections = True):
    """Return the sorted names of the collections linked to the given products.

    products_w_linkedCollections maps each product to an iterable of collection
    names. With force_common_collections=True only collections linked to EVERY
    product in `products` are kept; otherwise any collection linked to at least
    one of them qualifies.
    """
    # Group the given products under every collection they are linked to
    collection_to_products = defaultdict(list)
    for current_product in products:
        for current_collection in products_w_linkedCollections[current_product]:
            collection_to_products[current_collection].append(current_product)
    # Keep either only the collections shared by ALL products, or every touched one
    if force_common_collections:
        matching = {name for name, members in collection_to_products.items() if members == products}
    else:
        matching = set(collection_to_products.keys())
    return sorted(matching)
def assRules_extractLinkedCollections(associationRules, grouped_collections, orders, product_identifier,
                                      assRules_colNames, name_mapping):
    """Annotate association rules with the collections linked to their products.

    For every link type in grouped_collections and every rule column in
    assRules_colNames (e.g. 'antecedents'/'consequents'), adds a new column
    '<COL NAME> - <link name>' holding the collections common to all products
    of that rule side (via find_commonLinkedCollections).

    Fixes vs. the previous revision:
    - 'consequent support' was listed twice among the dropped columns;
    - the product -> collections mapping was rebuilt once per rule column even
      though it only depends on the link type (loop-invariant work hoisted).
    """
    collection_divider = '|'
    # Keep only the relevant columns
    unnecessary_columns = ['antecedent support', 'consequent support']
    enhancedAssociationRules_df = associationRules.drop(columns = unnecessary_columns).copy()
    # Build one product -> linked-collections mapping per link type
    # (independent of the rule column, so computed only once each)
    linkedCollections_by_type = {}
    for linked_type in grouped_collections.keys():
        # Retrieve the link name within the dataset
        link_name = name_mapping[linked_type]
        # Most recent transactions first, then drop rows without a product
        df = orders[[product_identifier, link_name, 'Transaction id']].copy()
        df = df.sort_values(by = 'Transaction id', ascending = False).drop(columns = 'Transaction id')
        df = df.dropna(subset = [product_identifier])
        products_w_linkedCollections = defaultdict(set)
        for order in df.to_dict(orient = 'records'):
            collection_string = order[link_name]
            # Skip NaN / non-string link entries
            if isinstance(collection_string, str):
                products_w_linkedCollections[order[product_identifier]].update(
                    collection_string.split(collection_divider))
        linkedCollections_by_type[linked_type] = products_w_linkedCollections
    # Create the new columns in the same order as before (rule column outer, link type inner)
    for col_name in assRules_colNames:
        for linked_type in grouped_collections.keys():
            link_name = name_mapping[linked_type]
            new_col_name = col_name.upper() + ' - ' + link_name
            # Bind the mapping as a default argument to avoid the late-binding closure pitfall
            enhancedAssociationRules_df[new_col_name] = enhancedAssociationRules_df[col_name].apply(
                lambda products, mapping = linkedCollections_by_type[linked_type]:
                    find_commonLinkedCollections(mapping, products))
    return enhancedAssociationRules_df
def filter_enhancedAssRules(enhancedAssociationRules_df, grouped_collections, assRules_colNames, name_mapping, keep_all_cols = False):
    """Keep only the rules whose antecedents and consequents share a collection.

    For each link type, intersects the '<ANTECEDENTS> - <link>' and
    '<CONSEQUENTS> - <link>' columns row by row into a new 'Common - <link>'
    column; rules with no common collection for ANY link type are dropped.
    Unless keep_all_cols is True, the per-side columns are removed afterwards.
    NOTE: the input DataFrame is mutated in place by the column insertions.
    """
    columns_to_drop = set()
    for position, linked_type in enumerate(grouped_collections.keys()):
        link_name = name_mapping[linked_type]
        antecedent_col = assRules_colNames[0].upper() + ' - ' + link_name
        consequent_col = assRules_colNames[1].upper() + ' - ' + link_name
        columns_to_drop.update((antecedent_col, consequent_col))
        # Row-wise intersection of the two collection lists
        def _row_intersection(df_row):
            return np.intersect1d(np.array(df_row[antecedent_col]),
                                  np.array(df_row[consequent_col]))
        shared_collections = enhancedAssociationRules_df.apply(func = _row_intersection, axis = 1)
        # Empty intersections become NaN so they can be dropped below
        shared_collections = shared_collections.apply(
            lambda values: values if len(values) > 0 else np.nan)
        # Insert the intersections as a new column
        enhancedAssociationRules_df.insert(loc = 4 + position, column = 'Common' + ' - ' + link_name, value = shared_collections)
    # Discard rules without any common collection across every link type
    enhancedAssociationRules_df = enhancedAssociationRules_df.dropna(
        subset = ['Common - ' + link for link in name_mapping.values()],
        how = 'all').reset_index(drop = True)
    # Sort the surviving rules
    enhancedAssociationRules_df.sort_values(by = ['antecedent dimension', 'lift', 'confidence', 'support'],
                                            ascending = [True, False, False, False], inplace = True)
    # Drop the per-side collection columns unless the caller wants them
    if not keep_all_cols:
        enhancedAssociationRules_df.drop(columns = columns_to_drop, inplace = True)
    print("\nEnhanced association rules -->", len(enhancedAssociationRules_df), "\n")
    return enhancedAssociationRules_df
def generate_saving_fileName(orders, product_identifier, timestamp_col = None, base_file_name = "CoPurchases"):
    """Build the output .xlsx file name, optionally tagged with the date range.

    When timestamp_col is None or absent from `orders`, returns
    '<base_file_name>.xlsx'. Otherwise the name also embeds the product
    identifier and the first/last period as abbreviated Italian month + year
    (e.g. '_Gen24').

    Raises Exception when the timestamp column holds neither pd.Period nor
    np.datetime64 values.
    """
    # Fall back to the plain name when no usable timestamp column is available
    # (fixed: 'is None' instead of '== None')
    if timestamp_col is None or timestamp_col not in orders.columns:
        return base_file_name + ".xlsx"
    order_period_dates = sorted(orders[timestamp_col].unique())
    first_period = order_period_dates[0]
    if isinstance(first_period, pd.Period):
        first_date = first_period.to_timestamp()
        last_date = order_period_dates[-1].to_timestamp()
    elif isinstance(first_period, np.datetime64):
        first_date = pd.Timestamp(first_period)
        last_date = pd.Timestamp(order_period_dates[-1])
    else:
        # Fixed: '.upper' was missing its call parentheses (printed a bound method)
        raise Exception(f"ERROR: {timestamp_col.upper()} --> {type(first_period)}")
    coPurchases_file_name = base_file_name
    coPurchases_file_name += "_by" + product_identifier.replace(' ', '')
    # NOTE(review): assumes the Italian locale is installed on the host — TODO confirm
    coPurchases_file_name += '_' + first_date.month_name(locale='it_IT.utf8')[:3] + str(first_date.year)[2:]
    coPurchases_file_name += '_' + last_date.month_name(locale='it_IT.utf8')[:3] + str(last_date.year)[2:]
    coPurchases_file_name += ".xlsx"
    return coPurchases_file_name
def create_paramsDF(min_support, min_confidence, min_lift):
    """Collect the mining thresholds into a one-column float DataFrame ('Params')."""
    threshold_values = {
        'min_support': min_support,
        'min_confidence': min_confidence,
        'min_lift': min_lift,
    }
    return Series(threshold_values, name = 'Params', dtype = 'float').to_frame()
def save_findings(frequentPatterns_df, associationRules_df, enhanced_associationRules, params_df, file_path, item_separator):
    """Persist all mining results into one Excel workbook at file_path.

    Sheets: frequent patterns, association rules, enhanced rules (with linked
    collections) and the mining parameters. List-valued cells are flattened to
    separator-joined strings, since Excel cells cannot hold Python lists.
    The input DataFrames are not modified (each sheet works on a copy).
    """
    with ExcelWriter(file_path) as excelFile:
        # A) Frequent patterns: re-join the item lists into a single string per cell
        savable_df = frequentPatterns_df.copy()
        savable_df['Patterns'] = savable_df['Patterns'].apply(lambda items: item_separator.join(items))
        savable_df.to_excel(excelFile, sheet_name = 'Patterns of co-purchases', index = False, freeze_panes = (1, 1))
        # B) Association rules: same flattening for both rule sides
        savable_df = associationRules_df.copy()
        for col_name in ['antecedents', 'consequents']:
            savable_df[col_name] = savable_df[col_name].apply(lambda items: item_separator.join(items))
        savable_df.to_excel(excelFile, sheet_name = 'Association rules', index = False, freeze_panes = (1, 2))
        # C) Enhanced association rules (fixed: local name 'salvable_df' unified to 'savable_df')
        savable_df = enhanced_associationRules.copy()
        # Non-numeric columns hold lists/arrays, except NaN (a float) for missing entries
        list_column_names = savable_df.select_dtypes(exclude = np.number).columns
        savable_df[list_column_names] = savable_df[list_column_names].applymap(
            lambda items: ' | '.join(items) if not isinstance(items, float) else items)
        savable_df.to_excel(excelFile, sheet_name = 'Ass. rules with collections', index = False, freeze_panes = (1, 2))
        # Save the parameters in a new sheet
        params_df.to_excel(excelFile, sheet_name='Params')
    print(f"The file '{path.basename(file_path)}' has been saved.")