# Current Path : /home/deltalab/PMS/recommendations/user_profiling/_library/
# Current File : /home/deltalab/PMS/recommendations/user_profiling/_library/collectionBased_utils.py
from collections import defaultdict
from itertools import combinations
import numpy as np
def jaccard_similarity(stringA, stringB):
    """Return the Jaccard similarity between the word sets of two strings.

    Each string is tokenized on runs of whitespace and the result is
    ``|A & B| / |A | B|`` over the two sets of distinct words.

    Args:
        stringA: First string.
        stringB: Second string.

    Returns:
        A float in ``[0, 1]``: 1.0 when both strings contain the same words,
        0.0 when they share none.
    """
    # split() without an argument drops the empty tokens that split(' ')
    # produces for leading, trailing or repeated spaces; those empty tokens
    # would otherwise count as a word shared by both strings.
    words_a = set(stringA.split())
    words_b = set(stringB.split())

    union = words_a | words_b
    if not union:
        # Neither string contains a word. Two word-less strings are treated
        # as identical, which is also what the split(' ') tokenization
        # returned for two empty strings.
        return 1.0

    return len(words_a & words_b) / len(union)
def getTransactions(orders, product_identifier='SKU'):
    """Count how often each distinct basket of products was ordered.

    Order lines are grouped by ``'Transaction id'``; each transaction is
    reduced to the set of its product identifiers, and identical sets are
    counted.

    Args:
        orders: DataFrame with a ``'Transaction id'`` column and a
            ``product_identifier`` column. Identifiers must be strings and
            must not contain ``'|'``, which is used internally as separator.
        product_identifier: Name of the column identifying a product.

    Returns:
        DataFrame with columns ``'Items'`` (sorted list of identifiers) and
        ``'Frequency'`` (number of transactions with exactly those items),
        ordered by frequency and then basket size, both descending.
    """
    # Reduce every transaction to one canonical key. Sorting is required:
    # set iteration order is not guaranteed to match between two equal sets,
    # so an unsorted join could split one basket into several keys.
    basket_keys = (
        orders[['Transaction id', product_identifier]]
        .groupby(by='Transaction id')[product_identifier]
        .agg(lambda products: '|'.join(sorted(set(products))))
    )

    # Name both columns explicitly. The names value_counts() gives to its
    # index and values differ between pandas versions, so relying on
    # reset_index() producing an 'index' column is not portable.
    transaction_df = (
        basket_keys.value_counts()
        .rename_axis('Items')
        .reset_index(name='Frequency')
    )
    transaction_df['Items'] = transaction_df['Items'].str.split('|')

    # Temporary helper column so larger baskets win ties on frequency.
    transaction_df['num_items'] = transaction_df['Items'].apply(len)
    transaction_df = transaction_df.sort_values(
        by=['Frequency', 'num_items'], ascending=False
    )
    transaction_df = transaction_df.reset_index(drop=True)
    return transaction_df.drop(columns='num_items')
def compute_userBasedUtilities(df_rows, user_profile, products_info):
    """Add the user-based (x) utility attributes to a product tuple, in place.

    For the products listed in ``df_rows['Products']`` three fractions are
    written back into ``df_rows``:

    * ``x_userNovelty``: share of products NOT found in
      ``user_profile['unique_products']``.
    * ``x_userCategories``: share of products whose ``'category'`` is in
      ``user_profile['categories']``.
    * ``x_userBrands``: share of products whose ``'brand'`` is in
      ``user_profile['brands']``.

    Args:
        df_rows: Item-addressable record holding the ``'Products'`` entry;
            presumably a DataFrame row or dict (confirm against the caller).
        user_profile: Mapping with ``'unique_products'``, ``'categories'``
            and ``'brands'`` containers.
        products_info: Mapping from product identifier to a mapping with at
            least ``'category'`` and ``'brand'``.

    Returns:
        None. ``df_rows`` is modified in place.
    """
    candidates = df_rows['Products']

    def fraction_of_tuple(flags):
        # Share of the tuple's products for which the flag is True.
        return np.sum(flags) / len(candidates)

    # 1) userNovelty: products the user has not purchased
    df_rows['x_userNovelty'] = fraction_of_tuple(
        [item not in user_profile['unique_products'] for item in candidates]
    )

    # 2) userCategories: products belonging to a purchased category
    df_rows['x_userCategories'] = fraction_of_tuple(
        [products_info[item]['category'] in user_profile['categories']
         for item in candidates]
    )

    # 3) userBrands: products belonging to a purchased brand
    df_rows['x_userBrands'] = fraction_of_tuple(
        [products_info[item]['brand'] in user_profile['brands']
         for item in candidates]
    )
def compute_itemBasedUtilities(df_rows, products_info):
    """Add the item-based (y) utility attributes to a product tuple, in place.

    For the products listed in ``df_rows['Products']`` four values are
    written back into ``df_rows``:

    * ``y_itemDissimilarity``: mean of ``1 - jaccard_similarity`` over all
      pairs of product names.
    * ``y_sameWarehouse``: 1 when the ``'warehouses'`` values collected over
      all products reduce to exactly one distinct value, else 0.
    * ``y_sameConservationMethod``: the same check on ``'refrigerated'``.
    * ``y_similarWeights``: smallest ``'weight [grams]'`` divided by the
      largest one.

    All four are 0 for a tuple of fewer than two products.
    ``y_similarWeights`` is also 0 when no product carries a weight or the
    largest weight is 0.

    Args:
        df_rows: Item-addressable record holding the ``'Products'`` entry;
            presumably a DataFrame row or dict (confirm against the caller).
        products_info: Mapping from product identifier to a mapping of
            attributes. ``'name'`` is required; ``'warehouses'``,
            ``'refrigerated'`` and ``'weight [grams]'`` are optional and may
            be a single value or a list of values.

    Returns:
        None. ``df_rows`` is modified in place.

    Raises:
        KeyError: If a product is missing from ``products_info`` or has no
            ``'name'``.
    """
    tuple_products = df_rows['Products']
    has_several_products = len(tuple_products) > 1

    def distinct_values(attribute):
        # Collect the distinct values of ONE attribute over the tuple. Only
        # the attributes used below are read, so an unrelated attribute with
        # an unhashable value cannot break the computation.
        values = set()
        for product in tuple_products:
            info = products_info[product]
            if attribute not in info:
                continue
            value = info[attribute]
            if isinstance(value, list):
                values.update(value)
            else:
                values.add(value)
        return list(values)

    def single_valued(values):
        # 1 when all collected values collapse to one distinct value.
        return 1 if len(np.unique(values)) == 1 else 0

    # 1) ATTRIBUTE "itemDissimilarity" --> name --> Jaccard
    product_names = [products_info[product]['name'] for product in tuple_products]
    if len(product_names) > 1:
        pair_dissimilarities = [
            1 - jaccard_similarity(name_a, name_b)
            for name_a, name_b in combinations(product_names, r=2)
        ]
        item_dissimilarity = np.mean(pair_dissimilarities)
    else:
        item_dissimilarity = 0
    df_rows['y_itemDissimilarity'] = item_dissimilarity

    # 2) ATTRIBUTE "sameWarehouse" --> warehouses
    warehouses = distinct_values('warehouses')
    df_rows['y_sameWarehouse'] = (
        single_valued(warehouses) if has_several_products else 0
    )

    # 3) ATTRIBUTE "sameConservationMethod" --> refrigerated
    refrigerated_products = distinct_values('refrigerated')
    df_rows['y_sameConservationMethod'] = (
        single_valued(refrigerated_products) if has_several_products else 0
    )

    # 4) ATTRIBUTE "similarWeights" --> weight [grams]
    weights = distinct_values('weight [grams]')
    relative_weight_similarity = 0
    # np.min/np.max raise on an empty sequence, so a tuple whose products
    # carry no weight keeps the neutral value 0.
    if has_several_products and weights:
        heaviest = np.max(weights)
        # A maximum of 0 would divide by zero; it is treated as "weight
        # unknown" and also keeps 0 (NOTE(review): confirm this convention).
        if heaviest != 0:
            relative_weight_similarity = np.min(weights) / heaviest
    df_rows['y_similarWeights'] = relative_weight_similarity
def compute_attributeUtilies(df_rows, products_info, user_profile):
    """Compute all utility attributes of a product tuple and return the row.

    Runs the item-based step and then the user-based step. Both write their
    results into ``df_rows`` itself: the ``y_*`` keys (itemDissimilarity,
    sameWarehouse, sameConservationMethod, similarWeights) and the ``x_*``
    keys (userNovelty, userCategories, userBrands).

    Args:
        df_rows: Record holding the ``'Products'`` entry; modified in place.
        products_info: Mapping from product identifier to its attributes.
        user_profile: Mapping describing the user's purchase history.

    Returns:
        The same ``df_rows`` object, now carrying the utility attributes.
    """
    # 1) Item-based (y) attributes
    compute_itemBasedUtilities(df_rows=df_rows, products_info=products_info)

    # 2) User-based (x) attributes
    compute_userBasedUtilities(
        df_rows=df_rows,
        user_profile=user_profile,
        products_info=products_info,
    )

    return df_rows