# Your IP : 216.73.216.220
#
#
# Current Path : /home/deltalab/PMS/recommendations/user_profiling/components/
# Upload File :
# Current File : //home/deltalab/PMS/recommendations/user_profiling/components/collection_analyzer.py

from datetime import datetime
from tabulate import tabulate
from itertools import combinations
import numpy as np
import pandas as pd

from _library import collectionBased_utils
from _library import choquetIntegral_utils

from abc import ABC,abstractmethod
from collections import defaultdict

class CollectionAnalyzer(ABC):
    """Base class that scores a collection of products for one user.

    The analysis enumerates bundles (combinations of the collection's
    products), lets the subclass attach utility columns to every bundle
    (columns whose name contains ``x_`` are treated as user-based, ``y_`` as
    item-based), aggregates those utilities with a weighted sum and with the
    Mobius form of the Choquet integral, and finally returns the collection
    score, the best bundle and a textual explanation for it.

    Subclasses must implement ``_compute_attributeUtilities`` and
    ``buildUpExplainationString``.
    """

    def __init__(self, collection_name, collection_items, orders, user_profile,
                 limit_bundleDim, type_recommendation):
        """Store the inputs and drop adult-only products for non-adult users.

        Args:
            collection_name: Display name of the collection (used in logs and
                handed to ``buildUpExplainationString``).
            collection_items: DataFrame of the products in the collection; it
                is read through its ``'SKU'`` and ``'isforadult'`` columns.
            orders: Raw orders, converted with
                ``collectionBased_utils.getTransactions``.
            user_profile: Mapping describing the user; ``'adult'`` is read
                here, other keys are read by the subclasses.
            limit_bundleDim: Maximum number of products per bundle, or ``-1``
                for no limit.
            type_recommendation: Recommendation type; also used as the name of
                the capacity dictionary cached in ``choquetIntegral_utils``.
        """
        self.collection_name = collection_name
        self.collection_items = collection_items
        self.transactions = collectionBased_utils.getTransactions(orders)
        self.user_profile = user_profile
        self.limit_bundleDim = limit_bundleDim

        self.product_identifier = 'SKU'
        self.type_recommendation = type_recommendation

        if self.user_profile['adult'] == 0:
            print("Filtering products to analyze (user not an adult)")
            # Element-wise comparison on a pandas column: `== False` is intentional.
            self.collection_items = self.collection_items[self.collection_items['isforadult'] == False]  # noqa: E712

    def _generateTuples(self):
        """Enumerate every bundle of products up to the allowed size.

        Also stores the product identifiers in ``self.all_products``.

        Returns:
            DataFrame with a single ``'Products'`` column; each cell is the
            list of product identifiers of one bundle, ordered by bundle size.
            The frame is empty when the collection has no products.
        """
        # Get the products of the collection
        self.all_products = self.collection_items[self.product_identifier].values
        print(f"[{self.collection_name.upper()}] ALL ITEMS ({len(self.all_products)}):",
              ' | '.join(self.all_products))

        # Get the maximum bundle dimension
        dim_tuple = len(self.all_products)
        if (self.limit_bundleDim != -1) and (dim_tuple > self.limit_bundleDim):
            print(f"--> Bundle dimension has been limited to {self.limit_bundleDim} items (original {dim_tuple})")
            dim_tuple = self.limit_bundleDim

        tuple_dim = np.arange(start = 1, stop = dim_tuple + 1)

        # Build the bundles directly as lists. This avoids the None-padded
        # intermediate table (and the truthiness filter needed to undo the
        # padding) and yields a well-formed empty frame for an empty collection.
        bundles = [
            list(bundle)
            for dim in tuple_dim
            for bundle in combinations(self.all_products, r = int(dim))
        ]
        tuples_df = pd.Series(bundles, dtype = object, name = 'Products').to_frame()

        print(f"--> {len(tuple_dim)} TUPLE DIMENTIONS:", tuple_dim, "--> COMBINATIONS:", len(tuples_df), "\n")

        return tuples_df

    @abstractmethod
    def _compute_attributeUtilities(self, tuples_df):
        """Return ``tuples_df`` enriched with the ``x_*`` / ``y_*`` utility columns."""
        raise NotImplementedError("Must override")

    def analyze_collections(self, verbose):
        """Score every bundle of the collection and select the best one.

        Args:
            verbose: When truthy, print the attribute names, timings, the
                scored bundle table and the final outcome.

        Returns:
            Tuple ``(collection_score, bestCollection_items,
            bestCollection_explaination)``: the sum of the Choquet scores of
            all bundles rounded to two decimals, the products of the
            top-scoring bundle and its explanation string. ``(0, [], "")``
            when no bundle can be generated.
        """
        # 0) Generate the tuples
        tuples_df = self._generateTuples()

        if len(tuples_df) == 0:
            return 0, [], ""

        # 1) Compute utilities of the attributes
        tuples_df = self._compute_attributeUtilities(tuples_df)

        # Split the attribute columns by their naming convention
        userBasedCond_colNames = [col_name for col_name in tuples_df.columns if 'x_' in col_name]
        itemBasedCond_colNames = [col_name for col_name in tuples_df.columns if 'y_' in col_name]
        cond_colNames = userBasedCond_colNames + itemBasedCond_colNames

        if verbose:
            print(f"\n{len(itemBasedCond_colNames)} ITEM-BASED ATTRIBUTES\n" + '-' * 30)
            print("-->", "\n--> ".join(itemBasedCond_colNames), "\n")
            print(f"\n{len(userBasedCond_colNames)} USER-BASED ATTRIBUTES\n" + '-' * 30)
            print("-->", "\n--> ".join(userBasedCond_colNames), "\n")

        newDict = False

        # Capacity dictionary: reuse the one cached at module level unless it
        # is missing or was generated for another recommendation type.
        # NOTE(review): the cache is keyed only by `type_recommendation`; it
        # presumably assumes that the attribute columns are the same for every
        # analyzer using that type -- confirm against choquetIntegral_utils.
        if not choquetIntegral_utils.CAPACITY_DICT or choquetIntegral_utils.CAPACITY_DICT_NAME != self.type_recommendation:
            capacityDict = choquetIntegral_utils.generate_capacityDict(cond_colNames, self.type_recommendation)
            choquetIntegral_utils.CAPACITY_DICT = capacityDict
            choquetIntegral_utils.CAPACITY_DICT_NAME = self.type_recommendation
            newDict = True
        else:
            capacityDict = choquetIntegral_utils.CAPACITY_DICT

        # Mobius values: recompute whenever the capacity dictionary changed
        if not choquetIntegral_utils.PRECOMPUTED_SUBSET_MOBIUSVALUES or newDict:
            print("Precomputing the mobius values...\n")

            preComputed_subsetMobiusValues = choquetIntegral_utils.preComputed_mobiusRepresentation(cond_colNames, capacityDict)
            choquetIntegral_utils.PRECOMPUTED_SUBSET_MOBIUSVALUES = preComputed_subsetMobiusValues
        else:
            preComputed_subsetMobiusValues = choquetIntegral_utils.PRECOMPUTED_SUBSET_MOBIUSVALUES

            print("Using precomputed mobius values")

        # 2) [Aggregate attribute utilities] Linear sum (weighted sum)
        start = datetime.now()
        tuples_df['agg_weightedSum'] = tuples_df[cond_colNames].apply(
            func = lambda conditions: choquetIntegral_utils.weightedSum(conditions, capacityDict, verbose),
            axis = 1)

        if verbose:
            weightedSum_compDuration = np.timedelta64(datetime.now() - start, 's')
            print("Weighted sum:", weightedSum_compDuration)

        # 3) [Aggregate attribute utilities] Choquet integral [Mobius]
        print("\nComputing the choque integral [Mobius form]...\n")
        start = datetime.now()

        tuples_df['agg_mobiusChoquetIntegral'] = tuples_df[cond_colNames].apply(
            func = lambda conditions: choquetIntegral_utils.choquet_integral_mobius(
                conditions, preComputed_subsetMobiusValues, False),
            axis = 1)

        if verbose:
            choquetIntegral_compDuration = np.timedelta64(datetime.now() - start, 's')
            print("CHOQUET INTEGRAL [Mobius]:", choquetIntegral_compDuration)

        # Unpack the Choquet outcomes: element 0 is used as the score,
        # element 1 as the attributes relevant for the explanation.
        tuples_df['agg_relevantAttributes'] = tuples_df['agg_mobiusChoquetIntegral'].apply(lambda outcome: outcome[1])
        tuples_df['agg_mobiusChoquetIntegral'] = tuples_df['agg_mobiusChoquetIntegral'].apply(lambda outcome: outcome[0])

        # Rank the bundles, best first
        tuples_df = tuples_df.sort_values(by = 'agg_mobiusChoquetIntegral', ascending = False).reset_index(drop = True)

        if verbose:
            print(tabulate(tuples_df.round(4), headers = tuples_df.columns, tablefmt= 'pretty'))

        # Compute the collection score
        collection_score = np.sum(tuples_df['agg_mobiusChoquetIntegral'])
        collection_score = np.round(collection_score, 2)

        # Retrieve the most preferred items
        bestCollection_subset = tuples_df.iloc[0]
        bestCollection_items = bestCollection_subset['Products']

        # Build up the explanation string
        bestCollection_explaination = self.buildUpExplainationString(
            relevantAttributes = bestCollection_subset['agg_relevantAttributes'],
            collectionName = self.collection_name)

        if verbose:
            print(f"\nPREFERRED ITEMS ({len(bestCollection_items)}/{len(self.collection_items)}) -->",
                ' || '.join(bestCollection_items))
            print("EXPLAINATION:", bestCollection_explaination)
            print('-' * 40)
            print("COLLECTION SCORE -->", collection_score)
            print('-' * 40)

        return collection_score, bestCollection_items, bestCollection_explaination

    @abstractmethod
    def buildUpExplainationString(self, relevantAttributes, collectionName):
        """Return the user-facing explanation for the given relevant attributes."""
        raise NotImplementedError("Must override")


class ProductionAreaCollectionAnalyzer(CollectionAnalyzer):
    """Analyzer for a collection of products coming from one production area.

    Besides the user-based utilities (novelty, categories, brands) it scores
    every bundle on how often it was bought together, its relative size, the
    dissimilarity of the product names, and whether its products share
    warehouse, conservation method and similar weights.
    """

    def __init__(self, collection_name, collection_items, orders, user_profile, limit_bundleDim, type_recommendation):
        """Forward every argument unchanged to ``CollectionAnalyzer.__init__``."""
        super().__init__(collection_name, collection_items, orders, user_profile, limit_bundleDim, type_recommendation)

    def _generateTuples(self):
        """Generate the bundles exactly as the base class does."""
        return super()._generateTuples()

    def _compute_relativeFrequency(self, tuple):
        """Return the share of transactions that contain every product of a bundle.

        Args:
            tuple: Products of the bundle. (The name shadows the builtin; it is
                kept so that the method signature stays unchanged.)

        Returns:
            Sum of ``'Frequency'`` over the transactions whose ``'Items'``
            include all the products, divided by the total ``'Frequency'``;
            ``0.0`` when there are no transactions to compare against.
        """
        num_transactions = self.transactions['Frequency'].sum()
        if num_transactions == 0:
            # No purchase history: avoid a 0/0 that would put NaN in the score.
            return 0.0

        # A transaction matches when all the products of the bundle are in it
        intersection_cond = self.transactions['Items'].apply(
            lambda items: len(np.intersect1d(items, tuple)) == len(tuple))
        tuple_counter = self.transactions[intersection_cond]['Frequency'].sum()

        return tuple_counter / num_transactions

    def _compute_attributeUtilities(self, tuples_df):
        """Attach the ``x_*`` / ``y_*`` utility columns to every bundle.

        Relies on ``self.all_products``, which ``_generateTuples`` sets.

        Args:
            tuples_df: DataFrame with a ``'Products'`` column.

        Returns:
            The enriched DataFrame, rounded to two decimals.
        """
        # 1A) [EXTRA ATTRIBUTES] Compute the relative frequency [0:1]
        tuples_df['y_relativeFreq'] = tuples_df['Products'].apply(func = self._compute_relativeFrequency)

        # 1B) [EXTRA ATTRIBUTES] Compute the relative dimension [0:1]
        tuples_df['y_numItems'] = tuples_df['Products'].apply(len) / len(self.all_products)

        # 2) [ATTRIBUTES] Check the utilities of the attributes

        # a) Retrieve the attributes attached to the products, keyed by the
        #    product identifier. `set_index` works on a copy, so the index of
        #    `self.collection_items` is no longer overwritten in place.
        products_info = self.collection_items.set_index(self.product_identifier, drop = False)
        products_info = products_info.drop(
            columns = [
                'Type id',
                'Seller',
                'inTrentino_source',
                'Frequency',
                'indaco_sku'])
        products_info = products_info.rename(
            columns = {
                'Vendor': 'brand',
                'Product Type': 'category',
                'Title': 'name'})
        products_info = products_info.to_dict(orient = 'index')

        # b) Compute the utilities
        tuples_df = tuples_df.apply(
            func = lambda df_row: self.compute_attributeUtilities(
                df_row, products_info, self.user_profile),
            axis = 1)

        tuples_df = tuples_df.round(2)

        return tuples_df

    def compute_attributeUtilities(self, df_rows,products_info,user_profile):
        """Compute the user-based and item-based utilities of one bundle.

        Args:
            df_rows: Row of the bundle table; its ``'Products'`` entry lists
                the product identifiers of the bundle.
            products_info: Mapping product identifier -> attribute dict
                (``'category'``, ``'brand'``, ``'name'``, ``'warehouses'``,
                ``'refrigerated'`` and ``'weight [grams]'`` are read).
            user_profile: Mapping with the user's ``'unique_products'``,
                ``'categories'`` and ``'brands'``.

        Returns:
            ``df_rows`` with the ``x_*`` and ``y_*`` entries added.
        """
        tuple_products = df_rows['Products']
        num_products = len(tuple_products)

        def pooled_values(attribute):
            """Collect the distinct values of one attribute across the bundle."""
            pooled = set()
            for product in tuple_products:
                product_attributes = products_info[product]
                if attribute not in product_attributes:
                    continue
                att_value = product_attributes[attribute]
                if isinstance(att_value, list):
                    pooled.update(att_value)
                else:
                    pooled.add(att_value)
            return list(pooled)

        # USER BASED

        # 1) ATTRIBUTE "userNovelty": share of products never purchased
        notPurchasedProducts = [product not in user_profile['unique_products'] for product in tuple_products]
        df_rows['x_userNovelty'] = np.sum(notPurchasedProducts) / num_products

        # 2) ATTRIBUTE "userCategories": share of products of purchased categories
        alreadyPurchasedCategory = [products_info[product]['category'] in user_profile['categories']
                                    for product in tuple_products]
        df_rows['x_userCategories'] = np.sum(alreadyPurchasedCategory) / num_products

        # 3) ATTRIBUTE "userBrands": share of products of purchased brands
        alreadyPurchasedBrands = [products_info[product]['brand'] in user_profile['brands']
                                  for product in tuple_products]
        df_rows['x_userBrands'] = np.sum(alreadyPurchasedBrands) / num_products

        # ITEM BASED (every attribute below is 0 for single-product bundles)
        isMultiProduct = num_products > 1

        # 1) ATTRIBUTE "itemDissimilarity" --> mean pairwise (1 - Jaccard) on names
        if isMultiProduct:
            product_names = [products_info[product]['name'] for product in tuple_products]
            pair_dissimilarities = [
                1 - collectionBased_utils.jaccard_similarity(first_name, second_name)
                for first_name, second_name in combinations(product_names, r = 2)]
            tuple_dissimilarity = np.mean(pair_dissimilarities)
        else:
            tuple_dissimilarity = 0
        df_rows['y_itemDissimilarity'] = tuple_dissimilarity

        # 2) ATTRIBUTE "sameWarehouse" --> warehouses
        warehouses = pooled_values('warehouses')
        if isMultiProduct and len(np.unique(warehouses)) == 1:
            sameWarehouse = 1
        else:
            sameWarehouse = 0
        df_rows['y_sameWarehouse'] = sameWarehouse

        # 3) ATTRIBUTE "sameConservationMethod" --> refrigerated
        refrigerated_products = pooled_values('refrigerated')
        if isMultiProduct and len(np.unique(refrigerated_products)) == 1:
            sameConservationMethod = 1
        else:
            sameConservationMethod = 0
        df_rows['y_sameConservationMethod'] = sameConservationMethod

        # 4) ATTRIBUTE "similarWeights" --> weight [grams]
        relative_weightSimilarity = 0
        if isMultiProduct:
            weights = pooled_values('weight [grams]')
            heaviest = np.max(weights)
            # A zero (or NaN) maximum would turn the ratio into NaN/inf and
            # poison the aggregated score; treat it as "no similarity".
            if heaviest > 0:
                relative_weightSimilarity = np.min(weights) / heaviest
        df_rows['y_similarWeights'] = relative_weightSimilarity

        return df_rows

    def analyze_collections(self, verbose):
        """Run the analysis exactly as the base class does."""
        return super().analyze_collections(verbose)

    def buildUpExplainationString(self,relevantAttributes, collectionName):
        """Compose the (Italian) explanation shown for the best bundle.

        A clause is added only when all the attributes it depends on are found
        in ``relevantAttributes`` (checked with ``np.isin``).

        Args:
            relevantAttributes: Attribute names deemed relevant for the bundle.
            collectionName: Name of the production area, printed in upper case.

        Returns:
            The explanation string.
        """
        def check_attributes(attributes):
            return all(np.isin(attributes, relevantAttributes))

        explainationString = "Ti potrebbero interessare prodotti"

        if check_attributes(['y_relativeFreq']):
            explainationString += " acquistati frequentemente"

        explainationString += f" provenienti dall'area {collectionName.upper()}"

        if check_attributes(['x_userNovelty', 'x_userCategories', 'x_userBrands']):
            explainationString += ', in linea con il tuo profilo'

        if check_attributes(['y_sameWarehouse', 'y_sameConservationMethod']):
            explainationString += " ed aggregati per ridurre l'impatto ambientale"

        return explainationString

class AttributeCollectionAnalyzer(CollectionAnalyzer):
    """Analyzer for collections built around a dietary attribute.

    Besides the user-based utilities (novelty, categories, brands) it adds one
    item-based utility chosen by ``type_recommendation``: the share of
    products in the bundle that carry the corresponding flag.
    """

    # type_recommendation -> (utility column, product attribute holding the flag)
    _DIET_ATTRIBUTES = {
        "bio_food_recom": ('y_bioPercentage', "biologic"),
        "glutenfree_food_recom": ('y_noglucidPercentage', "gluten_free"),
        "vegan_food_recom": ('y_veganPercentage', "vegan"),
        "biodynamic_food_recom": ('y_biodynamicPercentage', "biodinamic"),
    }

    def __init__(self, collection_name, collection_items, orders, user_profile, limit_bundleDim, type_recommendation):
        """Forward every argument unchanged to ``CollectionAnalyzer.__init__``."""
        super().__init__(collection_name, collection_items, orders, user_profile, limit_bundleDim, type_recommendation)

    def _generateTuples(self):
        """Generate the bundles exactly as the base class does."""
        return super()._generateTuples()

    def compute_attributeUtilities(self, df_rows,products_info,user_profile):
        """Compute the user-based and item-based utilities of one bundle.

        Args:
            df_rows: Row of the bundle table; its ``'Products'`` entry lists
                the product identifiers of the bundle.
            products_info: Mapping product identifier -> attribute dict
                (``'category'``, ``'brand'`` and the dietary flag selected by
                ``self.type_recommendation`` are read).
            user_profile: Mapping with the user's ``'unique_products'``,
                ``'categories'`` and ``'brands'``.

        Returns:
            ``df_rows`` with the ``x_*`` entries and, for a known
            recommendation type, one ``y_*`` entry added.
        """
        tuple_products = df_rows['Products']
        num_products = len(tuple_products)

        # USER BASED

        # 1) ATTRIBUTE "userNovelty": share of products never purchased
        notPurchasedProducts = [product not in user_profile['unique_products'] for product in tuple_products]
        df_rows['x_userNovelty'] = np.sum(notPurchasedProducts) / num_products

        # 2) ATTRIBUTE "userCategories": share of products of purchased categories
        alreadyPurchasedCategory = [products_info[product]['category'] in user_profile['categories']
                                    for product in tuple_products]
        df_rows['x_userCategories'] = np.sum(alreadyPurchasedCategory) / num_products

        # 3) ATTRIBUTE "userBrands": share of products of purchased brands
        alreadyPurchasedBrands = [products_info[product]['brand'] in user_profile['brands']
                                  for product in tuple_products]
        df_rows['x_userBrands'] = np.sum(alreadyPurchasedBrands) / num_products

        # ITEM BASED: share of products carrying the flag of this recommendation type
        diet_attribute = self._DIET_ATTRIBUTES.get(self.type_recommendation)
        if diet_attribute is not None:
            utility_column, flag_name = diet_attribute
            df_rows[utility_column] = sum(
                [products_info[product][flag_name] for product in tuple_products]) / num_products

        return df_rows

    def _compute_attributeUtilities(self, tuples_df):
        """Attach the ``x_*`` / ``y_*`` utility columns to every bundle.

        Args:
            tuples_df: DataFrame with a ``'Products'`` column.

        Returns:
            The enriched DataFrame, rounded to two decimals.
        """
        # a) Retrieve the attributes attached to the products, keyed by the
        #    product identifier. `set_index` works on a copy, so the index of
        #    `self.collection_items` is no longer overwritten in place.
        products_info = self.collection_items.set_index(self.product_identifier, drop = False)
        products_info = products_info.drop(
            columns = [
                'Type id',
                'Seller',
                'inTrentino_source',
                'Frequency',
                'indaco_sku'])
        products_info = products_info.rename(
            columns = {
                'Vendor': 'brand',
                'Product Type': 'category',
                'Title': 'name'})
        products_info = products_info.to_dict(orient = 'index')

        # b) Compute the utilities
        tuples_df = tuples_df.apply(
            func = lambda df_row: self.compute_attributeUtilities(
                df_row, products_info, self.user_profile),
            axis = 1)
        tuples_df = tuples_df.round(2)

        return tuples_df

    def analyze_collections(self, verbose):
        """Run the analysis exactly as the base class does."""
        return super().analyze_collections(verbose)

    def buildUpExplainationString(self,relevantAttributes,collectionName=None):
        """Compose the (Italian) explanation shown for the best bundle.

        A clause is added only when all the attributes it depends on are found
        in ``relevantAttributes`` (checked with ``np.isin``).

        Args:
            relevantAttributes: Attribute names deemed relevant for the bundle.
            collectionName: Unused; accepted for signature compatibility with
                the base class.

        Returns:
            The explanation string, without a dangling trailing comma.
        """
        def check_attributes(attributes):
            return all(np.isin(attributes, relevantAttributes))

        explainationString = "Ti potrebbero interessare prodotti"

        if check_attributes(['x_userNovelty']):
            explainationString += " nuovi"

        if check_attributes(['y_noglucidPercentage']):
            explainationString += ", senza glutine,"
        else:
            explainationString += ","

        if check_attributes(['x_userCategories', 'x_userBrands']):
            explainationString += ' in linea con il tuo profilo'

        # Dietary factors that turned out to be relevant, in a fixed order
        factor_labels = [
            ('y_bioPercentage', "biologico"),
            ('y_veganPercentage', "vegano"),
            ('y_biodynamicPercentage', "biodinamico"),
        ]
        expl = [label for attribute, label in factor_labels if check_attributes([attribute])]
        if expl:
            explainationString += " considerando il fattore"
            explainationString += " " + ", ".join(expl)

        # The comma above only introduces a following clause; when none was
        # appended the sentence used to end with a stray ",".
        return explainationString.rstrip(',')