# Current Path : /home/deltalab/PMS/recommendations/user_profiling/components/
# Current File : //home/deltalab/PMS/recommendations/user_profiling/components/collection_analyzer.py
from datetime import datetime
from tabulate import tabulate
from itertools import combinations
import numpy as np
import pandas as pd
from _library import collectionBased_utils
from _library import choquetIntegral_utils
from abc import ABC,abstractmethod
from collections import defaultdict
class CollectionAnalyzer(ABC):
    """Score the bundles (item subsets) of one collection for a given user.

    The base class enumerates the bundles, asks the subclass for the
    attribute utilities (columns whose name contains ``x_`` are treated as
    user-based, ``y_`` as item-based) and aggregates them with a weighted sum
    and with a Choquet integral in Mobius form. Subclasses must implement
    ``_compute_attributeUtilities`` and ``buildUpExplainationString``.
    """

    def __init__(self, collection_name, collection_items, orders, user_profile, limit_bundleDim, type_recommendation):
        """Store the collection data and drop adult-only items for minors.

        Args:
            collection_name: Name of the collection (used in log lines and
                passed to the explanation builder).
            collection_items: DataFrame of the collection's products; must
                expose the ``SKU`` and ``isforadult`` columns read here.
            orders: Raw orders, converted through
                ``collectionBased_utils.getTransactions``.
            user_profile: Mapping with at least the ``adult`` key.
            limit_bundleDim: Maximum bundle size, ``-1`` for no limit.
            type_recommendation: Recommendation type; also keys the cached
                capacity dictionary.
        """
        self.collection_name = collection_name
        self.collection_items = collection_items
        self.transactions = collectionBased_utils.getTransactions(orders)
        self.user_profile = user_profile
        self.limit_bundleDim = limit_bundleDim
        self.product_identifier = 'SKU'
        self.type_recommendation = type_recommendation
        if self.user_profile['adult'] == 0:
            print("Filtering products to analyze (user not an adult)")
            # Element-wise comparison on purpose: this is a pandas mask.
            not_for_adults = self.collection_items['isforadult'] == False  # noqa: E712
            self.collection_items = self.collection_items[not_for_adults]

    def _generateTuples(self):
        """Enumerate every bundle of 1..N products of the collection.

        N is the number of products, capped by ``limit_bundleDim`` when that
        limit is not ``-1``. Also stores the product identifiers in
        ``self.all_products``.

        Returns:
            DataFrame with a single ``Products`` column holding one list of
            product identifiers per row. It is empty (zero rows) when the
            collection has no products or the limit is 0.
        """
        # Get the product identifiers
        self.all_products = self.collection_items[self.product_identifier].values
        print(f"[{self.collection_name.upper()}] ALL ITEMS ({len(self.all_products)}):",
              ' | '.join(self.all_products))

        # Get the tuple dimensions
        dim_tuple = len(self.all_products)
        if (self.limit_bundleDim != -1) and (dim_tuple > self.limit_bundleDim):
            print(f"--> Bundle dimension has been limited to {self.limit_bundleDim} items (original {dim_tuple})")
            dim_tuple = self.limit_bundleDim
        tuple_dim = np.arange(start = 1, stop = dim_tuple + 1)

        # Generate the combinations. The column is built directly from the
        # combinations instead of round-tripping through a padded DataFrame,
        # so an empty collection yields an empty frame (handled by the
        # caller) and falsy identifiers are never filtered out by accident.
        bundles = [
            list(bundle)
            for dim in tuple_dim
            for bundle in combinations(self.all_products, r = int(dim))
        ]
        tuples_df = pd.DataFrame({'Products': bundles})
        print(f"--> {len(tuple_dim)} TUPLE DIMENTIONS:", tuple_dim, "--> COMBINATIONS:", len(tuples_df), "\n")
        return tuples_df

    @abstractmethod
    def _compute_attributeUtilities(self, tuples_df):
        """Return ``tuples_df`` enriched with the attribute utility columns."""
        raise NotImplementedError("Must override")

    def _resolveCapacities(self, cond_colNames):
        """Return the capacity dictionary and the precomputed Mobius values.

        Both are cached in module-level variables of ``choquetIntegral_utils``.
        The capacity dictionary is regenerated when none is cached or when the
        cached one belongs to another recommendation type; the Mobius values
        are recomputed whenever the dictionary is new or none are cached.
        """
        needs_newDict = (
            not choquetIntegral_utils.CAPACITY_DICT
            or choquetIntegral_utils.CAPACITY_DICT_NAME != self.type_recommendation
        )
        if needs_newDict:
            capacityDict = choquetIntegral_utils.generate_capacityDict(cond_colNames, self.type_recommendation)
            choquetIntegral_utils.CAPACITY_DICT = capacityDict
            choquetIntegral_utils.CAPACITY_DICT_NAME = self.type_recommendation
        else:
            capacityDict = choquetIntegral_utils.CAPACITY_DICT

        if needs_newDict or not choquetIntegral_utils.PRECOMPUTED_SUBSET_MOBIUSVALUES:
            print("Precomputing the mobius values...\n")
            preComputed_subsetMobiusValues = choquetIntegral_utils.preComputed_mobiusRepresentation(
                cond_colNames, capacityDict)
            choquetIntegral_utils.PRECOMPUTED_SUBSET_MOBIUSVALUES = preComputed_subsetMobiusValues
        else:
            preComputed_subsetMobiusValues = choquetIntegral_utils.PRECOMPUTED_SUBSET_MOBIUSVALUES
            print("Using precomputed mobius values")
        return capacityDict, preComputed_subsetMobiusValues

    def analyze_collections(self, verbose):
        """Score the collection and pick its best bundle.

        Args:
            verbose: When truthy, print the attributes, timings, the scored
                table and the final outcome.

        Returns:
            Tuple ``(collection_score, best_items, explanation)``. The score
            is the sum of the Mobius Choquet integrals of all bundles rounded
            to 2 decimals; ``best_items`` is the product list of the
            top-ranked bundle. ``(0, [], "")`` is returned when there is no
            bundle to analyze.
        """
        # 0) Generate the tuples
        tuples_df = self._generateTuples()
        if tuples_df.empty:
            return 0, [], ""

        # 1) Compute utilities of the attributes
        tuples_df = self._compute_attributeUtilities(tuples_df)
        userBasedCond_colNames = [col_name for col_name in tuples_df.columns if 'x_' in col_name]
        itemBasedCond_colNames = [col_name for col_name in tuples_df.columns if 'y_' in col_name]
        cond_colNames = userBasedCond_colNames + itemBasedCond_colNames
        if verbose:
            print(f"\n{len(itemBasedCond_colNames)} ITEM-BASED ATTRIBUTES\n" + '-' * 30)
            print("-->", "\n--> ".join(itemBasedCond_colNames), "\n")
            print(f"\n{len(userBasedCond_colNames)} USER-BASED ATTRIBUTES\n" + '-' * 30)
            print("-->", "\n--> ".join(userBasedCond_colNames), "\n")

        # 2) Capacity dictionary and Mobius values (cached across analyzers)
        capacityDict, preComputed_subsetMobiusValues = self._resolveCapacities(cond_colNames)

        # 3) Aggregate the attribute utilities: linear (weighted) sum
        start = datetime.now()
        tuples_df['agg_weightedSum'] = tuples_df[cond_colNames].apply(
            func = lambda conditions: choquetIntegral_utils.weightedSum(conditions, capacityDict, verbose),
            axis = 1)
        if verbose:
            choquetIntegral_compDuration = np.timedelta64(datetime.now() - start, 's')
            print("Weighted sum:", choquetIntegral_compDuration)

        # 4) Aggregate the attribute utilities: Choquet integral [Mobius form]
        #    (the classic Choquet implementation is intentionally not used here)
        print("\nComputing the choque integral [Mobius form]...\n")
        start = datetime.now()
        tuples_df['agg_mobiusChoquetIntegral'] = tuples_df[cond_colNames].apply(
            func = lambda conditions: choquetIntegral_utils.choquet_integral_mobius(
                conditions, preComputed_subsetMobiusValues, False),
            axis = 1)
        if verbose:
            choquetIntegral_compDuration = np.timedelta64(datetime.now() - start, 's')
            print("CHOQUET INTEGRAL [Mobius]:", choquetIntegral_compDuration)

        # Unpack the outcomes: index 0 is the integral value, index 1 the
        # relevant attributes handed to the explanation builder.
        tuples_df['agg_relevantAttributes'] = tuples_df['agg_mobiusChoquetIntegral'].apply(
            lambda outcome: outcome[1])
        tuples_df['agg_mobiusChoquetIntegral'] = tuples_df['agg_mobiusChoquetIntegral'].apply(
            lambda outcome: outcome[0])

        # Rank the tuples, best first
        tuples_df = tuples_df.sort_values(by = 'agg_mobiusChoquetIntegral', ascending = False).reset_index(drop = True)
        if verbose:
            print(tabulate(tuples_df.round(4), headers = tuples_df.columns, tablefmt= 'pretty'))

        # Compute the collection score
        collection_score = np.round(np.sum(tuples_df['agg_mobiusChoquetIntegral']), 2)

        # Retrieve the most preferred items
        bestCollection_subset = tuples_df.iloc[0]
        bestCollection_items = bestCollection_subset['Products']

        # Build up the explanation string
        bestCollection_explaination = self.buildUpExplainationString(
            relevantAttributes = bestCollection_subset['agg_relevantAttributes'],
            collectionName = self.collection_name)

        if verbose:
            print(f"\nPREFERRED ITEMS ({len(bestCollection_items)}/{len(self.collection_items)}) -->",
                  ' || '.join(bestCollection_items))
            print("EXPLAINATION:", bestCollection_explaination)
            print('-' * 40)
            print("COLLECTION SCORE -->", collection_score)
            print('-' * 40)
        return collection_score, bestCollection_items, bestCollection_explaination

    @abstractmethod
    def buildUpExplainationString(self, relevantAttributes, collectionName):
        """Return the user-facing explanation for the given relevant attributes."""
        raise NotImplementedError("Must override")
class ProductionAreaCollectionAnalyzer(CollectionAnalyzer):
    """Analyzer for collections that group products by production area.

    Construction, bundle generation and ``analyze_collections`` are inherited
    unchanged from ``CollectionAnalyzer``; this class only supplies the
    attribute utilities and the explanation text.
    """

    # Bookkeeping columns of ``collection_items`` that are not attributes.
    _NON_ATTRIBUTE_COLUMNS = ['Type id', 'Seller', 'inTrentino_source', 'Frequency', 'indaco_sku']
    # Source column name -> attribute name used by the utility computation.
    _ATTRIBUTE_NAMES = {'Vendor': 'brand', 'Product Type': 'category', 'Title': 'name'}
    # The only product attributes aggregated over a bundle.
    _AGGREGATED_ATTRIBUTES = ('warehouses', 'refrigerated', 'weight [grams]')

    def _build_productsInfo(self):
        """Return ``{product identifier: {attribute: value}}`` for the collection.

        A re-indexed copy is used so ``self.collection_items`` (possibly the
        caller's own DataFrame) keeps its original index. Bookkeeping columns
        that happen to be absent are ignored instead of raising.
        """
        products_info = self.collection_items.set_index(self.product_identifier, drop = False)
        products_info = products_info.drop(columns = self._NON_ATTRIBUTE_COLUMNS, errors = 'ignore')
        products_info = products_info.rename(columns = self._ATTRIBUTE_NAMES)
        return products_info.to_dict(orient = 'index')

    def _compute_relativeFrequency(self, tuple):
        """Return the share of transactions that contain every product of the bundle.

        Each transaction row is weighted by its ``Frequency``. Returns 0.0
        when there are no transactions at all, instead of dividing by zero.
        (The parameter name shadows the builtin but is kept for compatibility.)
        """
        num_transactions = self.transactions['Frequency'].sum()
        if not num_transactions:
            return 0.0
        bundle = set(tuple)
        contains_bundle = self.transactions['Items'].apply(lambda items: bundle.issubset(items))
        tuple_counter = self.transactions.loc[contains_bundle, 'Frequency'].sum()
        return tuple_counter / num_transactions

    def _compute_attributeUtilities(self, tuples_df):
        """Add the ``x_*`` / ``y_*`` utility columns to every bundle.

        Relies on ``self.all_products`` set by ``_generateTuples``. Numeric
        columns of the returned frame are rounded to 2 decimals.
        """
        # 1A) [EXTRA ATTRIBUTES] Relative frequency of the bundle in the transactions
        tuples_df['y_relativeFreq'] = tuples_df['Products'].apply(func = self._compute_relativeFrequency)
        # 1B) [EXTRA ATTRIBUTES] Bundle size relative to the whole collection
        tuples_df['y_numItems'] = tuples_df['Products'].apply(len) / len(self.all_products)

        # 2) [ATTRIBUTES] Utilities derived from the product attributes
        products_info = self._build_productsInfo()
        tuples_df = tuples_df.apply(
            func = lambda df_row: self.compute_attributeUtilities(
                df_row, products_info, self.user_profile),
            axis = 1)
        return tuples_df.round(2)

    def compute_attributeUtilities(self, df_rows, products_info, user_profile):
        """Compute the user-based and item-based utilities of one bundle.

        Args:
            df_rows: Row of the bundle table; must hold the ``Products`` list.
            products_info: ``{product: {attribute: value}}`` mapping.
            user_profile: Mapping with ``unique_products``, ``categories`` and
                ``brands``.

        Returns:
            The same row with the ``x_*`` and ``y_*`` entries added.
        """
        tuple_products = df_rows['Products']
        num_products = len(tuple_products)
        is_bundle = num_products > 1

        # USER BASED
        # 1) "userNovelty": share of products the user has not purchased yet
        not_purchased = [product not in user_profile['unique_products'] for product in tuple_products]
        df_rows['x_userNovelty'] = np.sum(not_purchased) / num_products
        # 2) "userCategories": share of products in already purchased categories
        in_userCategories = [products_info[product]['category'] in user_profile['categories']
                             for product in tuple_products]
        df_rows['x_userCategories'] = np.sum(in_userCategories) / num_products
        # 3) "userBrands": share of products of already purchased brands
        in_userBrands = [products_info[product]['brand'] in user_profile['brands']
                         for product in tuple_products]
        df_rows['x_userBrands'] = np.sum(in_userBrands) / num_products

        # ITEM BASED
        # Collect the distinct values of the attributes that are read below.
        # Only those are hashed, so an unhashable unrelated attribute cannot
        # break the computation. List values are flattened.
        item_attributes = defaultdict(set)
        for product in tuple_products:
            product_info = products_info[product]
            for product_att in self._AGGREGATED_ATTRIBUTES:
                if product_att not in product_info:
                    continue
                att_value = product_info[product_att]
                if isinstance(att_value, list):
                    item_attributes[product_att].update(att_value)
                else:
                    item_attributes[product_att].add(att_value)

        # 1) "itemDissimilarity": mean pairwise (1 - Jaccard) over the product names
        if is_bundle:
            product_names = [products_info[product]['name'] for product in tuple_products]
            pair_dissimilarities = [
                1 - collectionBased_utils.jaccard_similarity(first_name, second_name)
                for first_name, second_name in combinations(product_names, r = 2)]
            tuple_dissimilarity = np.mean(pair_dissimilarities)
        else:
            tuple_dissimilarity = 0
        df_rows['y_itemDissimilarity'] = tuple_dissimilarity

        # 2) "sameWarehouse": 1 when the whole bundle shares a single warehouse
        warehouses = list(item_attributes['warehouses'])
        df_rows['y_sameWarehouse'] = 1 if is_bundle and len(np.unique(warehouses)) == 1 else 0

        # 3) "sameConservationMethod": 1 when all products share the refrigerated flag
        refrigerated_products = list(item_attributes['refrigerated'])
        df_rows['y_sameConservationMethod'] = (
            1 if is_bundle and len(np.unique(refrigerated_products)) == 1 else 0)

        # 4) "similarWeights": lightest / heaviest weight of the bundle.
        #    Falls back to 0 when no usable weight is available (missing, zero
        #    or NaN maximum) instead of raising or propagating NaN/inf.
        weights = list(item_attributes['weight [grams]'])
        heaviest = np.max(weights) if weights else 0
        if is_bundle and heaviest > 0:
            relative_weightSimilarity = np.min(weights) / heaviest
        else:
            relative_weightSimilarity = 0
        df_rows['y_similarWeights'] = relative_weightSimilarity
        return df_rows

    def buildUpExplainationString(self, relevantAttributes, collectionName):
        """Compose the explanation sentence shown to the user (Italian text).

        A clause is added only when all the attributes it depends on are
        found in ``relevantAttributes``.
        """
        check_attributes = lambda attributes: all(np.isin(attributes, relevantAttributes))

        explainationString = "Ti potrebbero interessare prodotti"
        if check_attributes(['y_relativeFreq']):
            explainationString += " acquistati frequentemente"
        explainationString += f" provenienti dall'area {collectionName.upper()}"
        if check_attributes(['x_userNovelty', 'x_userCategories', 'x_userBrands']):
            explainationString += ', in linea con il tuo profilo'
        if check_attributes(['y_sameWarehouse', 'y_sameConservationMethod']):
            explainationString += " ed aggregati per ridurre l'impatto ambientale"
        return explainationString
class AttributeCollectionAnalyzer(CollectionAnalyzer):
    """Analyzer for collections built around a product attribute.

    The item-based attribute depends on ``type_recommendation`` (bio,
    gluten-free, vegan or biodynamic). Construction, bundle generation and
    ``analyze_collections`` are inherited unchanged from ``CollectionAnalyzer``.
    """

    # Bookkeeping columns of ``collection_items`` that are not attributes.
    _NON_ATTRIBUTE_COLUMNS = ['Type id', 'Seller', 'inTrentino_source', 'Frequency', 'indaco_sku']
    # Source column name -> attribute name used by the utility computation.
    _ATTRIBUTE_NAMES = {'Vendor': 'brand', 'Product Type': 'category', 'Title': 'name'}
    # Recommendation type -> (utility column to create, product attribute to average).
    _ITEM_ATTRIBUTE_BY_RECOMMENDATION = {
        "bio_food_recom": ('y_bioPercentage', "biologic"),
        "glutenfree_food_recom": ('y_noglucidPercentage', "gluten_free"),
        "vegan_food_recom": ('y_veganPercentage', "vegan"),
        "biodynamic_food_recom": ('y_biodynamicPercentage', "biodinamic"),
    }

    def compute_attributeUtilities(self, df_rows, products_info, user_profile):
        """Compute the user-based and item-based utilities of one bundle.

        Args:
            df_rows: Row of the bundle table; must hold the ``Products`` list.
            products_info: ``{product: {attribute: value}}`` mapping.
            user_profile: Mapping with ``unique_products``, ``categories`` and
                ``brands``.

        Returns:
            The same row with the ``x_*`` entries added, plus one ``y_*``
            entry when ``type_recommendation`` is one of the known types.
        """
        tuple_products = df_rows['Products']
        num_products = len(tuple_products)

        # USER BASED
        # 1) "userNovelty": share of products the user has not purchased yet
        not_purchased = [product not in user_profile['unique_products'] for product in tuple_products]
        df_rows['x_userNovelty'] = np.sum(not_purchased) / num_products
        # 2) "userCategories": share of products in already purchased categories
        in_userCategories = [products_info[product]['category'] in user_profile['categories']
                             for product in tuple_products]
        df_rows['x_userCategories'] = np.sum(in_userCategories) / num_products
        # 3) "userBrands": share of products of already purchased brands
        in_userBrands = [products_info[product]['brand'] in user_profile['brands']
                         for product in tuple_products]
        df_rows['x_userBrands'] = np.sum(in_userBrands) / num_products

        # ITEM BASED
        # Mean of the attribute tied to the recommendation type over the bundle.
        item_attribute = self._ITEM_ATTRIBUTE_BY_RECOMMENDATION.get(self.type_recommendation)
        if item_attribute is not None:
            utility_name, product_att = item_attribute
            df_rows[utility_name] = sum(
                [products_info[product][product_att] for product in tuple_products]) / num_products
        return df_rows

    def _compute_attributeUtilities(self, tuples_df):
        """Add the utility columns to every bundle.

        A re-indexed copy of ``self.collection_items`` is used so that frame
        (possibly the caller's own DataFrame) keeps its original index;
        bookkeeping columns that happen to be absent are ignored instead of
        raising. Numeric columns of the returned frame are rounded to 2
        decimals.
        """
        # a) Retrieve the attributes attached to the products
        products_info = self.collection_items.set_index(self.product_identifier, drop = False)
        products_info = products_info.drop(columns = self._NON_ATTRIBUTE_COLUMNS, errors = 'ignore')
        products_info = products_info.rename(columns = self._ATTRIBUTE_NAMES)
        products_info = products_info.to_dict(orient = 'index')

        # b) Compute the utilities
        tuples_df = tuples_df.apply(
            func = lambda df_row: self.compute_attributeUtilities(
                df_row, products_info, self.user_profile),
            axis = 1)
        return tuples_df.round(2)

    def buildUpExplainationString(self, relevantAttributes, collectionName=None):
        """Compose the explanation sentence shown to the user (Italian text).

        A clause is added only when all the attributes it depends on are
        found in ``relevantAttributes``. ``collectionName`` is accepted for
        interface compatibility and is not used.
        """
        check_attributes = lambda attributes: all(np.isin(attributes, relevantAttributes))

        explainationString = "Ti potrebbero interessare prodotti"
        if check_attributes(['x_userNovelty']):
            explainationString += " nuovi"
        if check_attributes(['y_noglucidPercentage']):
            explainationString += ", senza glutine,"
        else:
            explainationString += ","
        if check_attributes(['x_userCategories', 'x_userBrands']):
            explainationString += ' in linea con il tuo profilo'

        factor_labels = [
            label
            for attribute, label in (
                ('y_bioPercentage', "biologico"),
                ('y_veganPercentage', "vegano"),
                ('y_biodynamicPercentage', "biodinamico"))
            if check_attributes([attribute])]
        if factor_labels:
            explainationString += " considerando il fattore"
            explainationString += " " + ", ".join(factor_labels)

        # The separator comma is appended before knowing whether another
        # clause follows; drop it when it ends up dangling at the end.
        return explainationString.rstrip(",")