# Current Path : /home/deltalab/PMS/recommendations/recommender-system-batch/components/
# Current File : /home/deltalab/PMS/recommendations/recommender-system-batch/components/semiRandom_RS.py
from collections import defaultdict
import numpy as np
import pandas as pd
from tabulate import tabulate
from _library.toolkit import jaccard_similarity
class semiRandom_RS:
    """Semi-random recommender.

    Selects pseudo-random recommendations for a reference product, preferring
    items from the same vendor and topping up with products whose category is
    connected (via the category tree) to the reference product's category.
    """

    def __init__(self, products, categories, filter_source_platform):
        # products: catalogue DataFrame; the code reads the columns 'SKU',
        #   'Title', 'Vendor', 'Product Type', 'Seller', 'Frequency',
        #   'inTrentino_source'
        # categories: category-tree DataFrame with columns
        #   'categoryName', 'parent'
        # filter_source_platform: when True, candidates are restricted to the
        #   same 'inTrentino_source' platform as the reference product
        self.products = products.copy()
        self.categories = categories
        self.filter_source_platform = filter_source_platform
        self.rs_codeName = 'randomProducts'
        self.verbose = False

    def getParentCategories(self, products, category_name, minItems = 8):
        """Return category names connected to `category_name` covering at
        least `minItems` rows of `products`.

        The category tree is climbed from `category_name` up to the root; for
        every ancestor, either the ancestor itself (if it has products) or its
        populated leaves are collected. Ancestors are consumed from the
        closest to the farthest until `minItems` products are covered. If no
        connected category has any product, ALL product types are returned
        instead, ranked by Jaccard similarity with `category_name`.

        Returns a numpy array of category-name strings.
        """
        allParentCategories = defaultdict(list)
        # Visit the parents until the root category is reached
        category_to_visit = category_name.upper()
        while category_to_visit is not None:  # FIX: `is not None` instead of `!= None`
            cond = self.categories['categoryName'].str.upper() == category_to_visit
            parent_category = self.categories.loc[cond, 'parent']
            if len(parent_category) == 0:
                # Category has no entry -> we reached the root of the tree
                category_to_visit = None
            else:
                parent_category = parent_category.values[0]
                if self.verbose:  # FIX: leftover debug print gated behind verbose
                    print(category_to_visit, "-->", parent_category)
                # Collect the parent's leaves (its direct children)
                cond = self.categories['parent'].str.upper() == parent_category.upper()
                parent_leaves = self.categories.loc[cond, 'categoryName'].values
                allParentCategories[parent_category].extend(parent_leaves)
                category_to_visit = parent_category
        if self.verbose:
            print("\nCATEGORY:", category_name, f"(TARGET: {minItems} items)")
            print(f"--> ALL PARENTS ({len(allParentCategories.keys())}):",
                  ' | '.join(allParentCategories.keys()))
        # Check if there are products for these categories
        allConnectedCategories = defaultdict(dict)
        # `type_name` instead of `type`: do not shadow the builtin
        getItems = lambda df, type_name: df.loc[df['Product Type'].str.upper() == type_name.upper(), :]
        for idk, (parent_type, parent_leaves) in enumerate(allParentCategories.copy().items()):
            parent_items = getItems(products, parent_type)
            if len(parent_items) > 0:
                # NOTE(review): the root category is stored with a count of 0,
                # so its items never advance attachedItems_counter below —
                # confirm this is intended and not len(parent_items).
                allConnectedCategories[idk][parent_type] = 0
                if self.verbose:
                    print(f"{parent_type} [root]: {len(parent_items)} items")
            else:
                for leaf_type in parent_leaves:
                    leaf_items = getItems(products, leaf_type)
                    if len(leaf_items) > 0:
                        allConnectedCategories[idk][leaf_type] = len(leaf_items)
                        if self.verbose:
                            # FIX: was len(leaf_type) — the length of the
                            # category-name string, not the number of items
                            print(f"{leaf_type} [leaf]: {len(leaf_items)} items")
        # Keep the ancestors ordered from the closest to the farthest
        allConnectedCategories = dict(sorted(allConnectedCategories.items(),
                                             key = lambda item: item[0]))
        if self.verbose:  # FIX: leftover debug print gated behind verbose
            print("\nallConnectedCategories:", allConnectedCategories)
        if self.verbose:
            filteredParents = [type_name for parent_level in allConnectedCategories.values()
                               for type_name in parent_level.keys()]
            print(f"--> FILTERED PARENTS ({len(filteredParents)}):",
                  list(allConnectedCategories.values()))
        if len(allConnectedCategories.keys()) == 0:
            # No connected category has products: fall back to ranking every
            # product type by its similarity with the reference category
            allTypes = list(products['Product Type'].str.upper().unique())
            rankedTypes = dict()
            for type_name in allTypes:
                similarities = [jaccard_similarity(type_name, category_name)]
                # NOTE(review): allConnectedCategories is empty in this branch,
                # so this inner loop never runs — kept for parity with the
                # original; confirm before removing.
                for leavesCategories in allConnectedCategories.values():
                    similarities.extend([jaccard_similarity(type_name, leavesCategory)
                                         for leavesCategory in leavesCategories])
                rankedTypes[type_name] = np.max(similarities)
            # Sort by similarity (desc), ties broken by name (desc)
            rankedTypes = dict(sorted(rankedTypes.items(),
                                      key = lambda dictItem: (dictItem[1], dictItem[0]),
                                      reverse = True))
            if self.verbose:
                print("--> all ranked categories", rankedTypes)
            return np.array(list(rankedTypes.keys()))
        connectedCategories = []
        attachedItems_counter = 0
        for categories in allConnectedCategories.values():
            # Retrieve the category names (FIX: typo `categoriy_names`)
            category_names = list(categories.keys())
            connectedCategories.extend(category_names)
            # Count the attached items and stop once the target is reached
            attachedItems_counter += np.sum(list(categories.values()))
            if attachedItems_counter >= minItems:
                break
        if self.verbose:
            print(f"--> MIN FILTERED PARENTS ({len(connectedCategories)}):",
                  ' | '.join(connectedCategories))
        return np.array(connectedCategories)

    def generate_filtered_productsDF(self, product_sku, previous_recommendedSKUs, num_randomItems):
        """Build the candidate DataFrame for `product_sku`.

        Excludes the reference product and the already-recommended SKUs,
        optionally keeps only the reference product's source platform, then
        prefers same-vendor items, topping up with products from connected
        categories when fewer than `num_randomItems` are available. Adds the
        helper columns 'sameVendor' and 'reference_dissimilarity' (plus
        'type_weights' when the category fallback is used).

        Raises IndexError when `product_sku` is not found in self.products.
        """
        products_df = self.products
        # Get the reference product info
        item_info = products_df.loc[products_df['SKU'] == product_sku].iloc[0]
        item_vendor = item_info['Vendor']
        item_platformSource = item_info['inTrentino_source']
        item_type = item_info['Product Type']
        product_name = item_info['Title']
        if self.verbose:  # FIX: leftover debug prints gated behind verbose
            print("\n\nREFERENCE")
            print(item_info)
            print(f"\n[{item_type}, {item_vendor}] {product_sku}")
            print(f"--> Already recommended ({len(previous_recommendedSKUs)}):", previous_recommendedSKUs)
        # [Filter products] Exclude the reference product
        item_idx = products_df.loc[products_df['SKU'] == product_sku, :].index
        products_df = products_df.drop(index = item_idx)
        # [Filter products] Remove the items already recommended
        products_df = products_df.drop(index = products_df[products_df['SKU'].isin(previous_recommendedSKUs)].index)
        # [Filter products] Keep only the reference product's source platform
        if self.filter_source_platform:
            products_df = products_df[products_df['inTrentino_source'] == item_platformSource].reset_index(drop = True)
            if self.verbose:
                print("--> (A) Filtering the source platform")
                print(products_df)
        # [TRY] Select, if possible, the products of the original product vendor
        # --------------------------------------------------------------------------
        filtered_products_df = products_df[products_df['Vendor'] == item_vendor].reset_index(drop = True)
        # Min-max normalization with a small epsilon to avoid division by zero
        normalize = lambda item, max_val, min_val: (item - min_val + 0.001) / (max_val - min_val + 0.001)
        if len(filtered_products_df) > 0:
            # Number of items that should be filled from other categories
            num_missing_items = num_randomItems - len(filtered_products_df)
            if num_missing_items > 0:
                # Pool for the extra items: everything NOT from the reference
                # vendor. FIX: the original dropped rows by the *reset*
                # positional index of filtered_products_df, whose labels do not
                # match products_df's (wrong rows dropped or KeyError), and then
                # picked extras from the full pool, which could duplicate the
                # same-vendor items already selected.
                other_vendors_df = products_df[products_df['Vendor'] != item_vendor]
                # Get the parent categories
                connectedCategories = self.getParentCategories(
                    products = other_vendors_df,
                    category_name = item_type,
                    minItems = num_missing_items)
                extra_df = other_vendors_df[other_vendors_df['Product Type'].isin(connectedCategories)].copy()
                # Generate the weights: closer categories get a larger weight
                extra_df['type_weights'] = extra_df['Product Type'].apply(
                    lambda type_name: len(connectedCategories) - np.argwhere(connectedCategories == type_name).item())
                # FIX: hoist the min/max out of the per-row lambda
                w_max = extra_df['type_weights'].dropna().max()
                w_min = extra_df['type_weights'].dropna().min()
                extra_df['type_weights'] = extra_df['type_weights'].apply(
                    lambda weight: normalize(weight, w_max, w_min))
                if self.verbose:
                    print("LIST EXTRA PRODUCTS \n", extra_df)
                # Get the extra items (best category weight / frequency first)
                extra_df = extra_df.sort_values(by = ['type_weights', 'Frequency', 'Title'], ascending = False)
                extra_extra_items = extra_df.iloc[:num_missing_items, :]
                # Merge the extra items with the same-vendor ones
                filtered_products_df = pd.concat([filtered_products_df, extra_extra_items]).reset_index(drop = True)
                if self.verbose:
                    print(f"--> (B.2) Filtering using some items from the same "\
                          f"vendor with extra items ({len(extra_extra_items)}, {', '.join(connectedCategories)})")
                    print(filtered_products_df)
            else:
                if self.verbose:
                    print(f"--> (B.1) Filtering using {len(filtered_products_df)} items from the same vendor")
                    print(products_df)
            products_df = filtered_products_df
        # --------------------------------------------------------------------------
        else:
            # No same-vendor item available: use the connected categories only
            connectedCategories = self.getParentCategories(
                products = products_df.copy(),
                category_name = item_type,
                minItems = num_randomItems)
            # Filter the products df according to the connected categories.
            # FIX: .copy() so the column assignments below do not write through
            # a view of self.products (SettingWithCopyWarning)
            products_df = products_df[products_df['Product Type'].isin(connectedCategories)].copy()
            # Generate the weights: closer categories get a larger weight
            products_df['type_weights'] = products_df['Product Type'].apply(
                lambda type_name: len(connectedCategories) - np.argwhere(connectedCategories == type_name).item())
            w_max = products_df['type_weights'].dropna().max()
            w_min = products_df['type_weights'].dropna().min()
            products_df['type_weights'] = products_df['type_weights'].apply(
                lambda weight: normalize(weight, w_max, w_min))
            products_df = products_df.sort_values(by = 'type_weights', ascending = False)
            if self.verbose:
                print(f"--> (B.3) Filtering using the connected categories ({', '.join(connectedCategories)})")
                print(products_df)
        # Flag whether each candidate shares the reference product's vendor
        products_df['sameVendor'] = products_df['Vendor'] == item_vendor
        # Title dissimilarity w.r.t. the reference product (case-insensitive)
        dissimilarity_func = lambda item_name: 1 - jaccard_similarity(item_name.upper(), product_name.upper())
        products_df['reference_dissimilarity'] = products_df['Title'].apply(dissimilarity_func)
        if self.verbose:
            print(f"\nFiltered products ({len(products_df)}):")
            print(tabulate(products_df, headers = products_df.columns, tablefmt = 'pretty'))
            print("-" * 40)
        return products_df

    def retrieveProducts_randomly(self, products_df, num_randomItems):
        """Sample up to `num_randomItems` rows from `products_df`.

        Sampling is weighted by the mean of 'reference_dissimilarity' and
        (when present) 'type_weights'; products from a different vendor have
        their weight halved. The draw is deterministic (random_state = 101).
        NOTE: mutates `products_df` in place (renormalizes
        'reference_dissimilarity' and adds 'weight'). A frame with 0 or 1
        rows is returned unchanged.
        """
        if len(products_df) > 1:
            # Min-max normalize the dissimilarity (epsilon avoids div by zero)
            min_dissimilary = products_df['reference_dissimilarity'].min()
            max_dissimilary = products_df['reference_dissimilarity'].max()
            normalize = lambda item: (item - min_dissimilary + 0.001) / (max_dissimilary - min_dissimilary + 0.001)
            products_df['reference_dissimilarity'] = products_df['reference_dissimilarity'].apply(normalize)
            if 'type_weights' in products_df.columns:
                # nanmean: a missing type weight falls back to dissimilarity only
                products_df['weight'] = products_df.apply(
                    func = lambda df_row: np.nanmean([
                        df_row['reference_dissimilarity'],
                        df_row['type_weights']]),
                    axis = 1)
            else:
                products_df['weight'] = products_df['reference_dissimilarity']
            if products_df['weight'].isnull().values.all():
                # All weights undefined: fall back to a fixed decreasing
                # weighting (1, 1/2, 1/3, ...) over the current row order
                products_df['weight'] = [1/(idk + 1) for idk in range(len(products_df))]
            else:
                # Penalize products from other vendors
                cond = products_df['sameVendor'] == False
                products_df.loc[cond, 'weight'] /= 2
            # Never sample more rows than available
            n_to_sample = min(num_randomItems, len(products_df))
            random_items = products_df.sample(
                n = n_to_sample,
                weights = products_df['weight'].values,
                random_state = 101).reset_index(drop = True)
            # Present same-vendor / highest-weight items first
            random_items = random_items.sort_values(
                by = ['sameVendor', 'weight', 'Frequency', 'Title'],
                ascending = False).reset_index(drop = True)
        else:
            random_items = products_df
        if self.verbose:
            print(f"\nRandom products ({len(random_items)}):")
            print(tabulate(random_items, headers = random_items.columns, tablefmt = 'pretty'))
            print("-" * 40)
        return random_items

    def generate_connectedProducts(self, product_sku, previous_recommendedSKUs, num_randomItems):
        """Return up to `num_randomItems` recommendation records (dicts) for
        `product_sku`, excluding `previous_recommendedSKUs`.

        Each record carries: item_sku, product_name, item_type, item_vendor,
        seller, inTrentino_source, rank, item_name, explaination, rs_source.
        """
        if self.verbose:
            print("\n" + "-" * 60)
            print("-" * 15, f"PRODUCT: {product_sku}", "-" * 15)
            print("-" * 60)
        explaination_string = "Ti potrebbe interessare"
        # Filter the products df according to some criteria
        filtered_products_df = self.generate_filtered_productsDF(product_sku, previous_recommendedSKUs, num_randomItems)
        # Retrieve some products randomly
        random_items = self.retrieveProducts_randomly(filtered_products_df, num_randomItems)
        random_items = random_items[['Vendor', 'SKU', 'Title', 'Product Type', 'Seller', 'inTrentino_source']].copy()
        # Add the necessary attributes.
        # FIX: positional ranks instead of index labels — the single-row
        # shortcut in retrieveProducts_randomly can return a non-reset index
        random_items['rank'] = [len(previous_recommendedSKUs) + pos + 1 for pos in range(len(random_items))]
        random_items['item_name'] = random_items['Title']
        # Key spelling kept for downstream compatibility
        random_items['explaination'] = explaination_string
        random_items['rs_source'] = self.rs_codeName
        # Rename the columns to the shared recommendation schema
        random_items = random_items.rename(
            columns = {
                'SKU': 'item_sku',
                'Title': 'product_name',
                'Product Type': 'item_type',
                'Vendor': 'item_vendor',
                'Seller': 'seller'
            }
        )
        # Return the products as a list of dicts
        return random_items.to_dict(orient = 'records')