# Source file: /home/deltalab/PMS/recommendations/recomsys-datapreparation-batch/_library/mine_assRules_utils.py
# (web-panel dump header removed: the original "Your IP / Current Path / Upload File"
#  lines were residue of the file-manager page this file was copied from and are not Python)

from collections import defaultdict
from json import dumps
from pickle import TRUE
from pandas import Timedelta
from PAMI.frequentPattern.basic import FPGrowth 
from mlxtend.frequent_patterns import association_rules 

from pandas import ExcelWriter, Series
from os import path

import numpy as np
import pandas as pd

def preProcessing_transaction(baskets,  item_separator, product_identifier):
    """Prepare a baskets DataFrame for the PAMI FP-Growth miner.

    Renames the product column to 'Transactions' (the column name the library
    expects) and appends `item_separator` to every item so the mined pattern
    strings can be split back into item lists afterwards.

    Parameters:
        baskets: DataFrame whose `product_identifier` column holds lists of items.
        item_separator: string appended to each item of every basket.
        product_identifier: name of the column containing the item lists.

    Returns:
        A new DataFrame; `baskets` itself is left unmodified.
    """
    col_name = 'Transactions'

    # Turnaround: the library wants this exact column name :/
    # (fix: use col_name consistently instead of repeating the literal)
    transactions_df = baskets.copy()
    transactions_df.rename(columns = {product_identifier: col_name}, inplace = True)

    # Turnaround to separate the items in the frequent pairs
    transactions_df.loc[:, col_name] = transactions_df[col_name].apply(
        func = lambda item_list: [item + item_separator for item in item_list])

    return transactions_df

def compute_frequentPatterns(transactions, item_separator, min_support = 0.1, limit_dim_pattern = 5):
    """Mine frequent co-purchase patterns from `transactions` with FP-Growth.

    Returns a DataFrame with the pattern item lists, their size, the absolute
    support and the support as a fraction of all transactions, sorted by
    pattern size and support. Patterns larger than `limit_dim_pattern` items
    are dropped (-1 disables the cap).
    """
    # Run the FP-Growth miner and report how long it took
    data_miner = FPGrowth.FPGrowth(transactions, minSup = min_support)
    data_miner.startMine()
    print('Runtime:', Timedelta(data_miner.getRuntime(), unit = 'sec'))

    patterns_df = data_miner.getPatternsAsDataFrame()

    # Each pattern comes back as one string: strip the trailing separator and
    # split it back into the individual items
    patterns_df.loc[:, 'Patterns'] = (patterns_df['Patterns']
                                      .str.rstrip(item_separator)
                                      .str.split(item_separator.strip()))

    # Trim whitespace left around every item of every pattern
    patterns_df.loc[:, 'Patterns'] = patterns_df['Patterns'].apply(
        lambda items: [single_item.strip() for single_item in items])

    # Record the pattern size, then sort by size and support (both descending)
    patterns_df.insert(loc = 1, column = 'Num. pattern items',
                       value = patterns_df['Patterns'].map(len))
    patterns_df.sort_values(by = ['Num. pattern items', 'Support'], ascending = False, inplace = True)
    patterns_df.reset_index(inplace = True, drop = True)

    # Express the support as a fraction of the total number of transactions
    total_orders = len(transactions)
    patterns_df.insert(2, 'PercentageSupport', round(patterns_df['Support'] / total_orders, 4))

    # Optionally cap the pattern size (-1 keeps everything)
    if limit_dim_pattern != -1:
        within_limit = patterns_df['Num. pattern items'] <= limit_dim_pattern
        patterns_df = patterns_df[within_limit].reset_index(drop = True)

    return patterns_df

def generate_associationRules(frequentPatterns, min_confidence, min_lift, verbose = False):
    """Derive association rules from the frequent patterns.

    Rules must reach `min_confidence` and strictly exceed `min_lift`. The
    returned DataFrame holds antecedents/consequents as plain lists, their
    sizes, and the rounded lift/confidence/support metrics, sorted by
    antecedent size (asc) then confidence, lift and support (desc).
    """
    # Mine association rules (Condition 1: confidence)
    association_rules_df = association_rules(frequentPatterns,  metric = "confidence",  min_threshold = min_confidence)

    # Filter the rules (Condition 2: Lift).
    # Fix: .copy() so the column assignments below write to an owned frame
    # instead of a filtered view (avoids pandas SettingWithCopyWarning and
    # silently lost writes under copy-on-write).
    association_rules_df = association_rules_df[association_rules_df['lift'] > min_lift].copy()

    # Turning the "frozensets" into "simple lists"
    association_rules_df['antecedents'] =  association_rules_df['antecedents'].map(list)
    association_rules_df['consequents'] =  association_rules_df['consequents'].map(list)

    # Round numerical values (lift is kept at 2 decimals)
    association_rules_df = association_rules_df.round(decimals = 4)
    association_rules_df['lift'] = association_rules_df['lift'].round(2)

    # Add dimension of antecedent/consequent baskets
    association_rules_df.insert(loc = 2, column='antecedent dimension', value = association_rules_df['antecedents'].map(len))
    association_rules_df.insert(loc = 4, column='consequents dimension', value = association_rules_df['consequents'].map(len))

    # Sort the values
    association_rules_df.sort_values(by = ['antecedent dimension', 'confidence', 'lift', 'support'],
                                    ascending = [True, False, False, False], inplace = True)
    # Reset the index
    association_rules_df = association_rules_df.reset_index(drop = True)

    # Drop unnecessary columns and move the metrics to the end
    association_rules_df.drop(columns = ['leverage', 'conviction'], inplace = True)
    new_col_order = association_rules_df.columns[:6].tolist() + ['lift', 'confidence', 'support']
    association_rules_df = association_rules_df.reindex(columns = new_col_order)

    # Visualize the association rules
    if len(association_rules_df) > 0:
        if verbose:
            print(association_rules_df)
        else:
            print("\nAssociation rules -->", len(association_rules_df), "\n")
    else:
        print("No association rules available using these parameters"\
            f"\n MIN CONFIDENCE: {min_confidence}\nMIN LIFT: {min_lift}")
    return association_rules_df

def find_commonLinkedCollections(products_w_linkedCollections, products, force_common_collections = True):
    """Return the sorted collection names linked to the given products.

    With `force_common_collections` set, only collections linked to *every*
    product in `products` are kept; otherwise any collection linked to at
    least one of them is returned.
    """
    # Group the products under each collection they are linked to
    by_collection = defaultdict(list)
    for item in products:
        for collection in products_w_linkedCollections[item]:
            by_collection[collection].append(item)

    # Keep either the collections shared by the whole basket, or all of them
    if force_common_collections:
        matches = {name for name, members in by_collection.items() if members == products}
    else:
        matches = set(by_collection)

    return sorted(matches)

def _build_products_linkMap(orders, product_identifier, link_name, collection_divider):
    """Map each product to the set of collections it is linked to in `orders`.

    Orders are scanned newest-first ('Transaction id' descending); rows with a
    missing product are dropped, and the collection string of each remaining
    row is split on `collection_divider`.
    """
    df = orders[[product_identifier] + [link_name] + ['Transaction id']].copy()
    df = df.sort_values(by = 'Transaction id', ascending = False).drop(columns = 'Transaction id')
    df = df.dropna(subset = [product_identifier])

    products_w_linkedCollections = defaultdict(set)
    for order in df.to_dict(orient = 'records'):
        collection_string = order[link_name]
        item_name = order[product_identifier]

        # Non-string (NaN) collection entries are skipped
        if isinstance(collection_string, str):
            products_w_linkedCollections[item_name].update(collection_string.split(collection_divider))

    return products_w_linkedCollections

def assRules_extractLinkedCollections(associationRules, grouped_collections, orders, product_identifier,
                                      assRules_colNames, name_mapping):
    """Enrich the association rules with the collections linked to each basket.

    For every rule column in `assRules_colNames` and every link type in
    `grouped_collections`, adds a '<COL> - <link name>' column holding the
    collections common to all products of that basket.
    """
    collection_divider = '|'

    # Keep only the relevant columns
    # (fix: the list previously contained 'consequent support' twice)
    unnecessary_columns = ['antecedent support', 'consequent support']
    enhancedAssociationRules_df = associationRules.drop(columns = unnecessary_columns).copy()

    # Build the product -> collections mapping once per link type
    # (fix: it was rebuilt for every output column although it only depends on the link type)
    linked_maps = {}
    for linked_type in grouped_collections.keys():
        link_name = name_mapping[linked_type]
        linked_maps[link_name] = _build_products_linkMap(orders, product_identifier, link_name, collection_divider)

    for col_name in assRules_colNames:
        for linked_type in grouped_collections.keys():

            # Retrieve the link name within the dataset
            link_name = name_mapping[linked_type]
            products_w_linkedCollections = linked_maps[link_name]

            # Extract collections for each product
            new_col_name = col_name.upper() + ' - ' + link_name
            enhancedAssociationRules_df[new_col_name] = enhancedAssociationRules_df[col_name].apply(
                lambda products : find_commonLinkedCollections(products_w_linkedCollections, products))

    return enhancedAssociationRules_df

def filter_enhancedAssRules(enhancedAssociationRules_df, grouped_collections, assRules_colNames, name_mapping, keep_all_cols = False):
    """Keep only the rules whose antecedent and consequent share collections.

    For every link type a 'Common - <link name>' column is inserted with the
    intersection of the antecedent/consequent collections; rules without any
    common collection across all link types are discarded. Unless
    `keep_all_cols` is set, the per-side collection columns are dropped.
    NOTE: the input DataFrame is mutated in place by the column insertions.
    """
    redundant_cols = set()
    for position, linked_type in enumerate(grouped_collections.keys()):

        # Column names holding the collections of each side of the rule
        link_name = name_mapping[linked_type]
        antecedent_col = assRules_colNames[0].upper() + ' - ' + link_name
        consequent_col = assRules_colNames[1].upper() + ' - ' + link_name
        redundant_cols.update((antecedent_col, consequent_col))

        # Intersect the two sides' collections, row by row
        def _row_intersection(df_row):
            return np.intersect1d(np.array(df_row[antecedent_col]),
                                  np.array(df_row[consequent_col]))
        common_collections = enhancedAssociationRules_df.apply(_row_intersection, axis = 1)

        # Empty intersections become NaN so dropna can discard them below
        common_collections = common_collections.apply(
            lambda values: values if len(values) > 0 else np.nan)

        # Insert the intersection as a dedicated column
        enhancedAssociationRules_df.insert(loc = 4 + position, column = 'Common' + ' - ' + link_name, value = common_collections)

    # Drop rules without any linked collections for any link type
    common_col_names = ['Common - ' + name for name in name_mapping.values()]
    enhancedAssociationRules_df = enhancedAssociationRules_df.dropna(
        subset = common_col_names, how = 'all').reset_index(drop = True)

    # Sort by antecedent size, then by the rule metrics
    enhancedAssociationRules_df.sort_values(by = ['antecedent dimension', 'lift', 'confidence', 'support'],
                                            ascending = [True, False, False, False], inplace = True)

    # Drop the per-side collection columns unless explicitly kept
    if not keep_all_cols:
        enhancedAssociationRules_df.drop(columns = redundant_cols, inplace = True)

    print("\nEnhanced association rules -->", len(enhancedAssociationRules_df), "\n")

    return enhancedAssociationRules_df

def generate_saving_fileName(orders, product_identifier, timestamp_col = None, base_file_name = "CoPurchases"):
    """Build the Excel file name for the co-purchase findings.

    Without a usable `timestamp_col` the plain base name is returned;
    otherwise the name embeds the product identifier and the first/last order
    period (abbreviated Italian month name + two-digit year).

    Raises:
        Exception: when the timestamp column holds neither pandas Periods nor
            numpy datetime64 values.
    """
    # Fall back to the plain base name when no usable timestamp column is given
    # (fix: identity comparison with None instead of ==)
    if timestamp_col is None or timestamp_col not in orders.columns:
        return base_file_name + ".xlsx"

    # e.g. "Order Month": find the first and last period covered by the data
    order_period_dates = sorted(orders[timestamp_col].unique())
    if isinstance(order_period_dates[0], pd.Period):
        first_date = order_period_dates[0].to_timestamp()
        last_date = order_period_dates[-1].to_timestamp()
    elif isinstance(order_period_dates[0], np.datetime64):
        first_date = pd.Timestamp(order_period_dates[0])
        last_date = pd.Timestamp(order_period_dates[-1])
    else:
        # fix: .upper was referenced without calling it, and the arguments
        # were passed unformatted to Exception
        raise Exception(f"ERROR: {timestamp_col.upper()} --> {type(order_period_dates[0])}")

    # Month names are localized in Italian and abbreviated to 3 letters
    coPurchases_file_name = base_file_name
    coPurchases_file_name += "_by" + product_identifier.replace(' ', '')
    coPurchases_file_name += '_' + first_date.month_name(locale='it_IT.utf8')[:3] + str(first_date.year)[2:]
    coPurchases_file_name += '_' + last_date.month_name(locale='it_IT.utf8')[:3] + str(last_date.year)[2:]
    coPurchases_file_name += ".xlsx"

    return coPurchases_file_name

def create_paramsDF(min_support, min_confidence, min_lift):
    """Pack the three mining thresholds into a one-column 'Params' DataFrame."""
    thresholds = {
        'min_support': min_support,
        'min_confidence': min_confidence,
        'min_lift': min_lift,
    }
    return Series(thresholds, name = 'Params', dtype = 'float').to_frame()

def save_findings(frequentPatterns_df, associationRules_df, enhanced_associationRules, params_df, file_path, item_separator):
    """Persist all mining outputs into one Excel workbook at `file_path`.

    One sheet each for the frequent patterns, the association rules, the
    collection-enhanced rules, and the mining parameters. Item lists are
    re-joined into single strings before writing.
    """
    with ExcelWriter(file_path) as excelFile:

        # A) Frequent patterns: re-join each item list into a single string
        patterns_sheet = frequentPatterns_df.copy()
        patterns_sheet['Patterns'] = patterns_sheet['Patterns'].apply(item_separator.join)
        patterns_sheet.to_excel(excelFile, sheet_name = 'Patterns of co-purchases', index = False, freeze_panes = (1, 1))

        # B) Association rules: same joining for both rule sides
        rules_sheet = associationRules_df.copy()
        for basket_col in ('antecedents', 'consequents'):
            rules_sheet[basket_col] = rules_sheet[basket_col].apply(item_separator.join)
        rules_sheet.to_excel(excelFile, sheet_name = 'Association rules', index = False,  freeze_panes = (1, 2))

        # C) Enhanced association rules: join every non-numeric (list) cell,
        #    leaving NaN floats untouched
        enhanced_sheet = enhanced_associationRules.copy()
        list_column_names = enhanced_sheet.select_dtypes(exclude = np.number).columns
        enhanced_sheet[list_column_names] = enhanced_sheet[list_column_names].applymap(
            lambda items: ' | '.join(items) if not isinstance(items, float) else items)
        enhanced_sheet.to_excel(excelFile, sheet_name = 'Ass. rules with collections', index = False, freeze_panes = (1, 2))

        # Save the parameters in a new sheet
        params_df.to_excel(excelFile, sheet_name='Params')

        print(f"The file '{path.basename(file_path)}' has saved.")