# Current Path : /home/deltalab/PMS/recommendations/recommender-system-batch/components/
# Current File : /home/deltalab/PMS/recommendations/recommender-system-batch/components/collection_analyzer.py
from datetime import datetime
from tabulate import tabulate
from itertools import combinations
import numpy as np
import pandas as pd
from _library.collectionAnalyzer_utils import choquetIntegral_utils
from _library.collectionAnalyzer_utils.collectionAnalyzer_utils import jaccard_similarity,getTransactions
from abc import ABC,abstractmethod
from collections import defaultdict
class CollectionAnalyzer(ABC):
    """Abstract base class that scores a collection of products.

    Workflow (see ``analyze_collections``): generate every item tuple up to a
    size limit, let the subclass attach per-tuple attribute utilities
    (columns prefixed ``y_`` for subset-based and ``z_`` for reference-item
    based attributes), then aggregate them with a Choquet integral in Mobius
    form to rank the tuples and derive an overall collection score.
    """
    # Class-level caches. analyze_collections reads/writes them explicitly via
    # `CollectionAnalyzer.<name>`, so they are shared across all subclasses and
    # refreshed only when `type_recommendation` changes.
    PRECOMPUTED_SUBSET_MOBIUSVALUES = None
    CAPACITY_DICT = None
    CAPACITY_DICT_NAME = None

    def __init__(self, collection_name, collection_items, orders, user_profile, limit_bundleDim,product_identifier,type_recommendation):
        """Store the collection data and the analysis configuration.

        Args:
            collection_name: label of the collection, used in prints and in
                the explanation string.
            collection_items: DataFrame describing the collection's products.
            orders: raw orders; converted to a transactions DataFrame via
                ``getTransactions`` (assumed to yield 'Items'/'Frequency'
                columns — see subclass usage; TODO confirm).
            user_profile: user profile data (not used directly in this class).
            limit_bundleDim: maximum tuple dimension; ``-1`` disables the cap.
            product_identifier: column identifying a product; the value
                ``'Product Type'`` switches the analyzer to category mode.
            type_recommendation: key selecting the capacity dictionary.
        """
        self.collection_name = collection_name
        self.collection_items = collection_items
        self.transactions = getTransactions(orders)
        self.user_profile = user_profile
        self.limit_bundleDim = limit_bundleDim
        self.product_identifier = product_identifier
        self.type_recommendation = type_recommendation
        # print("\nTRANSACTIONS:", len(self.transactions))
        # print(tabulate(self.transactions, headers = self.transactions.columns, tablefmt= 'pretty'))
        # Category mode: products are identified by their type, not by SKU/name.
        self.category_based = True if self.product_identifier == 'Product Type' else False

    def _generateTuples(self):
        """Return a DataFrame of all item combinations up to the size limit.

        The result has a single ``'Products'`` column whose cells are lists of
        product identifiers. Also sets ``self.all_products`` as a side effect.
        """
        # Get the number of products
        self.all_products = self.collection_items[self.product_identifier].unique()
        print(f"[{self.collection_name.upper()}] ALL ITEMS ({len(self.all_products)}):",
              ' | '.join(self.all_products))
        # Get the tuple dimensions (optionally capped by limit_bundleDim)
        dim_tuple = len(self.all_products)
        if (self.limit_bundleDim != -1) and (dim_tuple > self.limit_bundleDim):
            print(f"--> Bundle dimension has been limited to {self.limit_bundleDim} items (original {dim_tuple})")
            dim_tuple = self.limit_bundleDim
        tuple_dim = np.arange(start = 1, stop = dim_tuple + 1)
        # Generate the combinations of every dimension 1..dim_tuple
        tuples = list()
        for dim in tuple_dim:
            tuples.extend(list(combinations(self.all_products, r = dim)))
        # Turn the list of combinations into a dataframe. Shorter tuples leave
        # trailing padding cells in the ragged DataFrame; the truthiness filter
        # below drops them (presumably they are None — TODO confirm no falsy
        # product identifiers exist, as those would be dropped too).
        tuples_df = pd.DataFrame(tuples)
        tuples_df = tuples_df.agg(list, axis = 1).apply(
            lambda items: [item for item in items if item]).rename('Products').to_frame()
        print(f"--> {len(tuple_dim)} TUPLE DIMENTIONS:", tuple_dim, "--> COMBINATIONS:", len(tuples_df), "\n")
        return tuples_df

    @abstractmethod
    def _compute_attributeUtilities(self, tuples_df):
        """Subclass hook: attach the ``y_*``/``z_*`` utility columns to ``tuples_df``."""
        raise NotImplementedError("Must override")

    def analyze_collections(self, verbose):
        """Score the collection and pick its best item tuple.

        Args:
            verbose: when truthy, print attribute listings, timing and tables.

        Returns:
            tuple: ``(collection_score, bestCollection_items,
            bestCollection_explaination)`` — the rounded sum of all tuples'
            Choquet values, the product list of the top-ranked tuple, and the
            user-facing explanation string built by the subclass.
        """
        # 0) Generate the tuples
        tuples_df = self._generateTuples()
        # 1) Compute utilities of the attributes
        tuples_df = self._compute_attributeUtilities(tuples_df)
        # Split the utility columns by prefix (user-based 'x_' is disabled)
        # userBasedCond_colNames = [col_name for col_name in tuples_df.columns if 'x_' in col_name ]
        subsetBasedCond_colNames = [col_name for col_name in tuples_df.columns if 'y_' in col_name ]
        referenceItemBasedCond_colNames = [col_name for col_name in tuples_df.columns if 'z_' in col_name ]
        cond_colNames = subsetBasedCond_colNames + referenceItemBasedCond_colNames #userBasedCond_colNames
        if verbose:
            print(f"\n{len(subsetBasedCond_colNames)} SUBSET-BASED ATTRIBUTES\n" + '-' * 30)
            print("-->", "\n--> ".join(subsetBasedCond_colNames), "\n")
            print(f"\n{len(referenceItemBasedCond_colNames)} REFERENCEITEMS-BASED ATTRIBUTES\n" + '-' * 30)
            print("-->", "\n--> ".join(referenceItemBasedCond_colNames), "\n")
        newDict = False
        # Define capacity dictionary (cached at class level; rebuilt only when
        # the recommendation type changes)
        if not CollectionAnalyzer.CAPACITY_DICT or CollectionAnalyzer.CAPACITY_DICT_NAME != self.type_recommendation:
            capacityDict = choquetIntegral_utils.generate_capacityDict(cond_colNames,self.type_recommendation)
            CollectionAnalyzer.CAPACITY_DICT = capacityDict
            CollectionAnalyzer.CAPACITY_DICT_NAME = self.type_recommendation
            newDict = True
        else:
            capacityDict = CollectionAnalyzer.CAPACITY_DICT
        # Pre-compute the capacities of all the combinations (also cached; must
        # be refreshed together with the capacity dictionary)
        if not CollectionAnalyzer.PRECOMPUTED_SUBSET_MOBIUSVALUES or newDict:
            if verbose:
                print("Precomputing the mobius values...\n")
            preComputed_subsetMobiusValues = choquetIntegral_utils.preComputed_mobiusRepresentation(cond_colNames, capacityDict)
            CollectionAnalyzer.PRECOMPUTED_SUBSET_MOBIUSVALUES = preComputed_subsetMobiusValues
        else:
            preComputed_subsetMobiusValues = CollectionAnalyzer.PRECOMPUTED_SUBSET_MOBIUSVALUES
            if verbose:
                print("Using precomputed mobius values")
        # 1) ----------- [Aggregate attribute utilities] Linear sum (weighted sum) ------------------
        #start = datetime.now()
        #tuples_df['agg_weightedSum'] = tuples_df[cond_colNames].apply(
        #    func = lambda conditions: choquetIntegral_utils_OLD.weightedSum(conditions, capacityDict, verbose),
        #    axis = 1)
        #if verbose:
        #    choquetIntegral_compDuration = np.timedelta64(datetime.now() - start, 's')
        #    print("Weighted sum:", choquetIntegral_compDuration)
        # -----------------------------------------------------------------------------------------
        # 2) ----------- [CLASSIC implementation] Choquet integral ------------------
        # tuples_df['agg_choquetIntegral'] = tuples_df[cond_colNames].apply(
        #     func = lambda conditions: choquet_integral(conditions, capacityDict, verbose = True),
        #     axis = 1)
        # ---------------------------------------------------------------------------
        # 3) ----------- [Aggregate attribute utilities] Choquet integral [Mobius] ------------------
        print("\nComputing the choque integral [Mobius form]...\n")
        start = datetime.now()
        tuples_df['agg_mobiusChoquetIntegral'] = tuples_df[cond_colNames].apply(
            func = lambda conditions: choquetIntegral_utils.choquet_integral_mobius(
                conditions, preComputed_subsetMobiusValues, verbose),
            axis = 1)
        if verbose:
            choquetIntegral_compDuration = np.timedelta64(datetime.now() - start, 's')
            print("CHOQUET INTEGRAL [Mobius]:", choquetIntegral_compDuration)
        # -----------------------------------------------------------------------------------------
        # Unpack the choquet integral outcomes: each cell holds
        # (aggregated value, relevant attributes) — see the [0]/[1] accesses.
        tuples_df['agg_relevantAttributes'] = tuples_df['agg_mobiusChoquetIntegral'].apply(lambda outcome: outcome[1])
        tuples_df['agg_mobiusChoquetIntegral'] = tuples_df['agg_mobiusChoquetIntegral'].apply(lambda outcome: outcome[0])
        # Rank the tuples by aggregated utility (best first)
        tuples_df = tuples_df.sort_values(by = 'agg_mobiusChoquetIntegral', ascending = False).reset_index(drop = True)
        if verbose:
            print(tabulate(tuples_df.round(4), headers = tuples_df.columns, tablefmt= 'pretty'))
        # Compute the collection score as the sum over all tuples
        collection_score = np.sum(tuples_df['agg_mobiusChoquetIntegral'])
        collection_score = np.round(collection_score, 2)
        # Retrieve the most preferred items (top-ranked tuple)
        bestCollection_subset = tuples_df.iloc[0]
        bestCollection_items = bestCollection_subset['Products']
        # Build up the explanation string from the attributes that drove the score
        bestCollection_explaination = self.buildUpExplainationString(
            relevantAttributes = bestCollection_subset['agg_relevantAttributes'],
            collectionName = self.collection_name)
        if verbose:
            print(f"\nPREFERRED ITEMS ({len(bestCollection_items)}/{len(self.collection_items)}) -->",
                  ' || '.join(bestCollection_items))
            print("EXPLAINATION:", bestCollection_explaination)
            print('-' * 40)
            print("COLLECTION SCORE -->", collection_score)
            print('-' * 40)
        #raise Exception("Analyzed production areas")
        return collection_score, bestCollection_items, bestCollection_explaination

    @abstractmethod
    def buildUpExplainationString(self,relevantAttributes, collectionName):
        """Subclass hook: build the user-facing explanation from the relevant attributes."""
        raise NotImplementedError("Must override")
class RecomSystemBatchAnalyzer(CollectionAnalyzer):
    """Concrete analyzer used by the batch recommender system.

    Computes subset-based utilities (``y_*``: purchase frequency, item
    dissimilarity, warehouse/conservation homogeneity, weight similarity,
    biologic share) and reference-item based utilities (``z_*``: overlap of
    the tuple with the reference items' names, categories and brands).

    Fixes vs. the previous revision: removed a leftover ``pdb.set_trace()``
    debugger breakpoint, renamed a parameter that shadowed the builtin
    ``tuple``, and guarded the relative-frequency division against an empty
    transaction log. The redundant pass-through overrides of
    ``_generateTuples`` and ``analyze_collections`` were dropped — the
    inherited implementations behave identically.
    """
    # NOTE(review): the base class caches via `CollectionAnalyzer.<name>`
    # explicitly, so these subclass attributes appear to be unused shadows;
    # kept as-is for backward compatibility — confirm before removing.
    PRECOMPUTED_SUBSET_MOBIUSVALUES = None
    CAPACITY_DICT = None
    CAPACITY_DICT_NAME = None

    def __init__(self, collection_name, collection_items, orders, user_profile, limit_bundleDim, product_identifier, type_recommendation, reference_items):
        """Same arguments as CollectionAnalyzer.__init__, plus:

        Args:
            reference_items: list of dicts describing the items the
                recommendation is anchored to; each dict is read with keys
                'item_name', 'item_type', 'vendor' and 'biologic'.
        """
        super().__init__(collection_name, collection_items, orders, user_profile, limit_bundleDim, product_identifier, type_recommendation)
        self.reference_items = reference_items

    def _compute_relativeFrequency(self, tuple_items):
        """Return the frequency-weighted share [0, 1] of transactions that
        contain every item of ``tuple_items``.

        Returns 0.0 when the transaction log is empty (previously this
        divided by zero).
        """
        # A transaction matches when the tuple is fully contained in its items.
        tuple_size = len(tuple_items)
        intersection_cond = self.transactions['Items'].apply(
            lambda items: len(np.intersect1d(items, tuple_items)) == tuple_size)
        tuple_counter = self.transactions[intersection_cond]['Frequency'].sum()
        # Compute the relative frequency, guarding the empty/zero-weight case.
        num_transactions = self.transactions['Frequency'].sum()
        if num_transactions == 0:
            return 0.0
        return tuple_counter / num_transactions

    def _compute_attributeUtilities(self, tuples_df):
        """Attach every ``y_*``/``z_*`` utility column to ``tuples_df`` and
        return it, rounded to 2 decimals."""
        # 1A) [EXTRA ATTRIBUTES] Compute the relative frequency [0:1]
        tuples_df['y_relativeFreq'] = tuples_df['Products'].apply(func = self._compute_relativeFrequency)
        # 1B) [EXTRA ATTRIBUTES] Compute the relative dimension [0:1]
        tuples_df['y_numItems'] = tuples_df['Products'].apply(len) / len(self.all_products)
        # 2) [ATTRIBUTES] Build a per-product attribute lookup
        products_info = self.collection_items
        # Discard some irrelevant columns
        products_info = products_info.drop(
            columns = ['Type id', 'Seller', 'inTrentino_source', 'Frequency', 'indaco_sku'])
        # a) Retrieve the attributes attached to the products
        if self.category_based:
            # Category mode: one row per product type, attribute values
            # aggregated into sorted, de-duplicated lists.
            products_info = products_info.drop(columns = ['Title', 'SKU'])
            products_info = products_info.explode(column = 'warehouses')
            products_info = products_info.groupby(by = self.product_identifier).agg(
                lambda values: sorted(set(values)))
        else:
            products_info.index = products_info[self.product_identifier]
        # Rename columns to the attribute names used downstream
        products_info = products_info.rename(
            columns = {'Vendor': 'brand', 'Product Type': 'category' ,'Title':'name'})
        products_info = products_info.to_dict(orient = 'index')
        # b) Compute the utilities row by row
        tuples_df = tuples_df.apply(
            func = lambda df_row: self.compute_attributeUtilies(
                df_row, products_info, self.reference_items, self.user_profile, self.category_based),
            axis = 1)
        return tuples_df.round(2)

    def buildUpExplainationString(self,relevantAttributes, collectionName):
        """Compose the Italian explanation string from the attributes that
        drove the Choquet integral of the best tuple."""
        def check_attributes(attributes):
            # True when ALL the given attribute names are relevant.
            return all(np.isin(attributes, relevantAttributes))
        explainationString = "Ti potrebbero interessare"
        if check_attributes(['z_hasReferenceBrands']):
            filler = ""
            if check_attributes(['z_isBio', 'y_bioPercentage']):
                filler = " biologici"
            explainationString += f" altri prodotti{filler} del marchio"
        if check_attributes(['y_bioPercentage']) and not np.isin(['z_isBio'],relevantAttributes):
            explainationString += " con riguardo verso l'agricoltura biologica,"
        if(not(check_attributes(['z_hasReferenceBrands'])) and not(check_attributes(['y_bioPercentage']) and not np.isin(['z_isBio'],relevantAttributes))):
            explainationString += " prodotti"
        if check_attributes(['y_relativeFreq']):
            explainationString += " acquistati frequentemente, "
        explainationString += f" provenienti dall'area {collectionName.upper()}"
        if check_attributes(['y_sameWarehouse', 'y_sameConservationMethod']):
            explainationString += " ed aggregati per ridurre l'impatto ambientale"
        return explainationString

    def compute_referenceItemBasedUtilities(self,df_rows, referenceItems, products_info, categoryBased):
        """Fill the ``z_*`` columns of ``df_rows`` (one tuples_df row) in
        place: bio flag of the reference item plus the overlap between the
        tuple and the reference items' names/categories/brands."""
        tuple_products = df_rows['Products']
        # Fixed: removed a leftover `import pdb; pdb.set_trace()` that fired
        # whenever more than one reference item was supplied. As before, only
        # the FIRST reference item drives 'z_isBio' — TODO confirm the
        # intended semantics for multiple reference items.
        df_rows['z_isBio'] = 1 if referenceItems[0]['biologic'] == 1 else 0
        # Map each reference-item attribute to the utility column it feeds.
        newFeatures = dict()
        if categoryBased:
            newFeatures['item_name'] = 'z_hasReferenceCategories'
        else:
            newFeatures['item_name'] = 'z_includeReferenceItems'
            newFeatures['item_type'] = 'z_hasReferenceCategories'
        newFeatures['vendor'] = 'z_hasReferenceBrands'
        mappingProductsAtt = {'vendor': 'brand', 'item_type': 'category'}
        for attName, colName in newFeatures.items():
            referenceValues = np.unique([item[attName] for item in referenceItems])
            if attName == 'item_name':
                # The tuple's products themselves are the names to compare.
                subsetValues = tuple_products
            else:
                # Collect the attribute values of every product in the tuple;
                # category mode stores them as lists.
                subsetValues = []
                for item in tuple_products:
                    value = products_info[item][mappingProductsAtt[attName]]
                    if isinstance(value, list):
                        subsetValues.extend(value)
                    else:
                        subsetValues.append(value)
            subsetValues = np.unique(subsetValues)
            # Fraction of the reference values covered by the tuple.
            df_rows[colName] = len(np.intersect1d(subsetValues, referenceValues)) / len(referenceValues)

    def compute_subsetBasedUtilities(self,df_rows, products_info, categoryBased):
        """Fill the ``y_*`` columns of ``df_rows`` in place and return the row."""
        tuple_products = df_rows['Products']
        # Aggregate the attribute values of all products in the tuple.
        item_attributes = defaultdict(set)
        for product in tuple_products:
            for product_att, att_value in products_info[product].items():
                if isinstance(att_value, list):
                    item_attributes[product_att].update(att_value)
                else:
                    item_attributes[product_att].add(att_value)
        # 1) ATTRIBUTE "itemDissimilarity" --> name --> Jaccard
        if categoryBased:
            product_names = tuple_products
        else:
            product_names = [products_info[product]['name'] for product in tuple_products]
        if len(product_names) > 1:
            # Mean pairwise (1 - Jaccard similarity) over all name pairs.
            pair_dissimilarities = [
                1 - jaccard_similarity(name_a, name_b)
                for name_a, name_b in combinations(product_names, r = 2)]
            df_rows['y_itemDissimilarity'] = np.mean(pair_dissimilarities)
        else:
            df_rows['y_itemDissimilarity'] = 0
        # 2) ATTRIBUTE "sameWarehouse" --> warehouses
        warehouses = list(item_attributes['warehouses'])
        if len(tuple_products) > 1:
            df_rows['y_sameWarehouse'] = 1 if len(np.unique(warehouses)) == 1 else 0
        else:
            df_rows['y_sameWarehouse'] = 0
        # 3) ATTRIBUTE "sameConservationMethod" --> refrigerated
        refrigerated_products = list(item_attributes['refrigerated'])
        if len(tuple_products) > 1:
            df_rows['y_sameConservationMethod'] = 1 if len(np.unique(refrigerated_products)) == 1 else 0
        else:
            df_rows['y_sameConservationMethod'] = 0
        # 4) ATTRIBUTE "similarWeights" --> weight [grams]
        weights = list(item_attributes['weight [grams]'])
        if len(tuple_products) > 1:
            df_rows['y_similarWeights'] = np.min(weights) / np.max(weights)
        else:
            df_rows['y_similarWeights'] = 0
        # 5) ATTRIBUTE "bioPercentage" --> biologic
        # NOTE(review): 'biologic' is a SET of distinct values, so this is the
        # share over distinct flags ({0, 1} -> 0.5), not over products —
        # behavior preserved from the original; confirm it is intended.
        bio_values = list(item_attributes['biologic'])
        df_rows['y_bioPercentage'] = sum(bio_values) / len(bio_values)
        df_rows['y_allBio'] = 1 if df_rows['y_bioPercentage'] == 1 else 0
        return df_rows

    def compute_attributeUtilies(self,df_rows, products_info, referenceItems, user_profile, categoryBased):
        """Compute every attribute utility for one tuples_df row and return it.

        The method name (with its historical typo) is kept for backward
        compatibility. ``user_profile`` is currently unused because the
        user-based utilities are disabled.
        """
        # 1) subset-based (y): itemDissimilarity, sameConservationMethod, sameWarehouses, similarWeight
        self.compute_subsetBasedUtilities(df_rows, products_info, categoryBased)
        # 2) referenceItem-based (z)
        self.compute_referenceItemBasedUtilities(df_rows, referenceItems, products_info, categoryBased)
        # 3) User-based (x): userNovelty, userCategories, userBrands, userProductionAreas
        # compute_userBasedUtilities(df_rows, user_profile, products_info)
        return df_rows