# Current Path : /home/deltalab/PMS/recommendations/recommender-system-batch/_library/data_utils/
# Current File : /home/deltalab/PMS/recommendations/recommender-system-batch/_library/data_utils/mongodb_utils.py
import numpy as np
import pandas as pd
import pymongo
from bson import ObjectId
from collections import defaultdict
from os import path
from json import dumps, load
from components.DbService import DbService
def connect_to_mongodb(hostname, port, user, password, db_name, verbose = False):
print("\n" + 120 * "-")
print("-" * 42,f"Connecting to MongoDB ({hostname})", "-" * 41)
print(120 * "-", "\n")
# Connect the the MongoDB server
db_server = pymongo.MongoClient(hostname, port, username=user, password=password, authSource=db_name, authMechanism='SCRAM-SHA-256')
# Connect tho the database
indaco_db = db_server[db_name]
if verbose:
collection_names = indaco_db.list_collection_names()
print(f"COLLECTIONS ({len(collection_names)}):", ', '.join(collection_names), "\n")
return indaco_db
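# A minimal usage sketch (hypothetical host, credentials, and collection name; assumes a
# reachable MongoDB instance whose users authenticate against the target database):
#   indaco_db = connect_to_mongodb("localhost", 27017, "indaco_user", "secret", "indaco", verbose = True)
#   product_collection = indaco_db["products"].find({})  # "products" is an assumed collection name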
def extractCategories(category_collection, language = 'it-IT'):
all_types = dict()
type_mapping = dict()
for category in category_collection:
category_name = [name_item['label'] for name_item in category['name'] if name_item['code'] == language][0]
type_mapping[category['_id']] = category_name
all_types[category_name] = {
'googleId': -1 if category['googleId'] is None else int(category['googleId']),
'isLeaf':category['isLeaf'],
'_id': category['_id'],
'parentId': category['parentId']
}
# Retrieve the category name of the parent
for type_metadata in all_types.values():
item_parentId = type_metadata.pop('parentId')
if item_parentId:
type_metadata['parent'] = type_mapping[item_parentId]
else:
type_metadata['parent'] = None
all_types_df = pd.DataFrame.from_dict(all_types, orient = 'index')
all_types_df = all_types_df.reset_index()
all_types_df = all_types_df.rename(columns = {'index': 'categoryName'})
return all_types_df
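# Illustrative input (hypothetical documents; only the fields read above are required,
# i.e., 'name', 'googleId', 'isLeaf', '_id', 'parentId'):
#   category_collection = [
#       {'_id': ObjectId('...'), 'googleId': '499989', 'isLeaf': True, 'parentId': None,
#        'name': [{'code': 'it-IT', 'label': 'Vino'}, {'code': 'en-US', 'label': 'Wine'}]}]
#   categories_df = extractCategories(category_collection, language = 'it-IT')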
def extractProducts(product_collection, consider_deletedProducts = False, consider_unavailableProducts = False, verbose = False):
print("\n" + 90 * "-")
print("-" * 29,f"Extracting the INDACO products", "-" * 29)
print(90 * "-", "\n")
# Get all products
all_products = pd.DataFrame(product_collection)
# Improve readability of some attributes
all_products['title'] = all_products['title'].str.strip()
all_products['brand'] = all_products['brand'].str.strip()
all_products.rename(columns = {'weight': 'weight [grams]'}, inplace=True)
# [FLAG: deleted] Decide whether to keep the products that have been deleted
if not consider_deletedProducts:
deleted_products = all_products.loc[all_products['deleted'] == True, 'title']
all_products = all_products.drop(index = deleted_products.index)
print(f'\nDeleted products ({len(deleted_products)}):', ', '.join(deleted_products.values))
# [FLAG: sellBelowZero] Skip products that are not available
if not consider_unavailableProducts:
unavailable_products = all_products.loc[all_products['sellBelowZero'] == False, 'title']
all_products = all_products.drop(index = unavailable_products.index).reset_index(drop = True)
print(f'Unavailable products ({len(unavailable_products)}):', ', '.join(unavailable_products.values), '\n')
if verbose:
print(f"ATTRIBUTES ({len(all_products.columns)}):")
print('-->', '\n--> '.join(all_products.columns), "\n")
# [ATTRIBUTES] Extract the production areas
all_products['production_areas'] = all_products['attributes'].apply(
lambda attributes: [att['value'] for att in attributes
if att['attribute']['name'] == "indaco_general_productionarea"])
all_products['production_areas'] = all_products['production_areas'].apply(
lambda values: values[0] if (len(values) > 0) and values[0] is not None else -1)
# [Warehouse]
all_products['warehouse_id'] = all_products['inventoryLevels'].apply(
lambda items: [item['warehouseId'] for item in items if 'warehouseId' in item])
# [ATTRIBUTES] Extract the biologic, biodynamic, vegan, and gluten-free information
all_products['biologic'] = all_products['attributes'].apply(
lambda attributes: [att['value'] for att in attributes
if att['attribute']['name'] == "indaco_generalfood_biological"])
all_products['biologic'] = all_products['biologic'].apply(
lambda attribute: 0 if len(attribute)==0 else attribute[0])
all_products['vegan'] = all_products['attributes'].apply(
lambda attributes: [att['value'] for att in attributes
if att['attribute']['name'] == "indaco_generalfood_vegan"])
all_products['vegan'] = all_products['vegan'].apply(
lambda attribute: 0 if len(attribute)==0 else attribute[0])
all_products['biodinamic'] = all_products['attributes'].apply(
lambda attributes: [att['value'] for att in attributes
if att['attribute']['name'] == "indaco_generalfood_biodinamic"])
all_products['biodinamic'] = all_products['biodinamic'].apply(
lambda attribute: 0 if len(attribute)==0 else attribute[0])
all_products['gluten_free'] = all_products['attributes'].apply(
lambda attributes: [att['value'] for att in attributes
if att['attribute']['name'] == "indaco_generalfood_glutenfree"])
all_products['gluten_free'] = all_products['gluten_free'].apply(
lambda attribute: 0 if len(attribute)==0 else attribute[0])
# [ATTRIBUTES] is for adult
all_products['isforadult'] = all_products['attributes'].apply(
lambda attributes: [att['value'] for att in attributes
if att['attribute']['name'] == "indaco_general_isforadult"])
all_products['isforadult'] = all_products['isforadult'].apply(
lambda attribute: 0 if len(attribute)==0 else attribute[0])
# Convert the production area codes into names
file_path = path.join('_library', 'INDACO_collectionCodes.json')
with open(file_path) as json_file:
collectionTypes = load(json_file)
productionArea_codes = collectionTypes['production_areas']
all_products['production_areas'] = all_products['production_areas'].apply(
lambda area_code: "" if area_code == -1
else productionArea_codes[str(area_code)].capitalize() if str(area_code) in productionArea_codes
else f"Unknown (code: {area_code})")
# Keep only the columns of interest
selected_columns = ['title', 'sku', 'brand', 'refrigerated', 'weight [grams]',
'_id', 'categoryId', 'partnerId', 'warehouse_id', 'production_areas','biologic','vegan',
'biodinamic','gluten_free','isforadult']
all_products = all_products[selected_columns]
return all_products
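# Typical call (the collection is materialized into a DataFrame, so any iterable of
# product documents works; the collection name "products" is an assumption):
#   all_products = extractProducts(indaco_db["products"].find({}), verbose = True)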
def enhanced_products_df(products, categories, sellers, warehouses, orders):
products = products.merge(sellers, left_on = "partnerId", right_on = "_id", suffixes = ('', '_seller'))
enhanced_df = products.merge(categories, how = 'left', left_on = "categoryId", right_on = "_id", suffixes = ('', '_category'))
enhanced_df.drop(columns = ['_id_seller', '_id_category', 'isLeaf'], inplace = True)
# 1) Retrieve the warehouse names
warehouse_names = {item['_id']: item['name'] for item in warehouses.to_dict(orient = 'records')}
enhanced_df['warehouses'] = enhanced_df['warehouse_id'].apply(
lambda warehouses: [warehouse_names[warehouse_id] for warehouse_id in warehouses])
# 2) Fill the missing category names with the default value
emptyCategory_value = ''
enhanced_df['categoryName'] = enhanced_df['categoryName'].fillna(value = emptyCategory_value)
# 3) Fill the missing google ids with -1
enhanced_df['googleId'] = enhanced_df['googleId'].fillna(value = -1)
# Show products that have been dropped by the merges
if len(enhanced_df) != len(products):
discarded_products = np.setdiff1d(products["title"].to_numpy(),
enhanced_df["title"].to_numpy())
print(f'{len(discarded_products)} products have been discarded due to an issue with the "categoryId"')
print("-" * 75)
print("-->", "\n--> ".join(sorted(discarted_products)))
item_w_unknownCategory = sorted(enhanced_df.loc[enhanced_df['categoryName'] == emptyCategory_value, 'title'].tolist())
if len(item_w_unknownCategory) > 0:
print("-" * 70, "\n" + "-" * 70)
print(f'[WARNING] The attribute "categoryId" has not been set for {len(item_w_unknownCategory)} products. \n'
f'It has been filled with a category equal to "{emptyCategory_value}"')
print("-" * 70, "\n" + "-" * 70)
print("-->", "\n--> ".join(['(' + str(idk + 1) + ') '+ item
for idk, item in enumerate(item_w_unknownCategory)]))
# Rename the columns
enhanced_df.rename(
columns = {
'title': 'Title',
'sku': 'indaco_sku',
'brand' : 'Vendor',
'categoryName': 'Product Type',
'googleId': 'Type id',
'companyName': 'Seller',
'_id': 'productId'},
inplace = True)
# [NEW ATTRIBUTE] inTrentino flag
enhanced_df['inTrentino_source'] = enhanced_df['Seller'].apply(lambda name: name == "Vendi24")
# [NEW ATTRIBUTE] Item frequency within transactions/orders
num_transactions = len(orders['Transaction id'].unique())
normalizedFreq_func = lambda sku: len(orders.loc[orders['sku'] == sku, 'Transaction id'].unique()) / num_transactions
enhanced_df['Frequency'] = enhanced_df['indaco_sku'].apply(lambda sku: np.round(normalizedFreq_func(sku), 4))
# Minor normalization
#enhanced_df['indaco_sku'] = enhanced_df['indaco_sku'].str.capitalize()
enhanced_df['productId'] = enhanced_df['productId'].astype(str)
enhanced_df['Vendor'] = enhanced_df['Vendor'].fillna(value = "")
# Reorder columns
relevant_columns = ['Vendor', 'Title', 'Product Type', 'Type id', 'refrigerated', 'weight [grams]', 'warehouses',
'Seller', 'production_areas', 'inTrentino_source', 'Frequency', 'indaco_sku','biologic',
'isforadult','vegan','biodinamic','gluten_free'] #,'productId'
enhanced_df = enhanced_df[relevant_columns]
return enhanced_df
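# Sketch of how the pieces fit together (hypothetical DataFrame names; 'orders_df' must
# already expose the 'Transaction id' and 'sku' columns used for the frequency above):
#   products_df = enhanced_products_df(all_products, categories_df, sellers_df, warehouses_df, orders_df)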
def enhanceCustomerProfiles(userProfiles, platformProducts, platformCategories):
file_path = path.join('_library', 'INDACO_collectionCodes.json')
with open(file_path) as json_file:
collectionTypes = load(json_file)
productionArea_codes = collectionTypes['production_areas']
# Retrieve information: (A) production_areas
userProfiles['production_areas'] = userProfiles['production_areas'].dropna().apply(
lambda productionAreasCodes: [productionArea_codes[code] for code in productionAreasCodes])
# Retrieve information: (B) categories
category_mapping = platformCategories.set_index('_id')['categoryName'].to_dict()
userProfiles['categories'] = userProfiles['categories'].apply(
lambda categoryIds: sorted([category_mapping[ObjectId(categoryId)] for categoryId in categoryIds]))
# Retrieve information: (C) unique_products
product_mapping = platformProducts.set_index(platformProducts['_id'].map(str))['sku'].to_dict()
userProfiles['unique_products'] = userProfiles['unique_products'].apply(
lambda productIds: sorted([product_mapping[str(productId)] for productId in productIds]))
# Retrieve information: (D) shopping_baskets
userProfiles['shopping_baskets'] = userProfiles['shopping_baskets'].apply(
lambda shopping_baskets: {transactionId: sorted([product_mapping[str(productId)] for productId in shopping_basket])
for transactionId, shopping_basket in shopping_baskets.items()})
return userProfiles
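# Hypothetical call: 'userProfiles' carries the raw codes/ids, while the platform
# DataFrames must expose at least the '_id'/'sku' and '_id'/'categoryName' columns:
#   userProfiles = enhanceCustomerProfiles(userProfiles, all_products, categories_df)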
def map_indacoSKUs(indaco_products, inTrentino_products, product_identifier):
print("\n" + 120 * "-")
print("-" * 39, "Mapping the OLD SKUs with the INDACO SKUs", "-" * 38)
print(120 * "-", "\n")
old_SKUs = inTrentino_products['SKU'].map(str.lower).values
mapping_func = lambda new_sku: [old_sku for old_sku in old_SKUs if old_sku in new_sku]
mapped_SKUs = indaco_products['indaco_sku'].map(str.lower).apply(mapping_func)
indaco_products.insert(loc = 1, column = 'SKU', value = mapped_SKUs)
# Identify the new products (i.e., without a matching old SKU)
idk_new_products = indaco_products[indaco_products['SKU'].apply(len) == 0].index
# Resolve the SKU: keep the single match, otherwise derive it from the INDACO SKU
indaco_products['SKU'] = indaco_products.apply(
func = lambda df_row: df_row['SKU'][0].upper() if len(df_row['SKU']) == 1 \
else df_row['indaco_sku'].split('-')[0].upper(),
axis = 1)
# Visualize new products
new_products = indaco_products.loc[idk_new_products, ['Title', 'Product Type', 'SKU']]\
.sort_values(by = ['Product Type', 'Title'])
print("-" * 90, "\n" + "-" * 90)
print("-" * 25, f'[INFO] {len(new_products)} new products have been found', "-" * 25)
print("-" * 90, "\n" + "-" * 90)
print("-->", "\n--> ".join([f"({idk + 1}) [{item['Product Type']}] {item['Title']} ({item['SKU']})"
for idk, item in new_products.iterrows()]))
print("-" * 90, "\n" + "-" * 90)
print("-" * 90, "\n" + "-" * 90, "\n")
# New products whose title also appears among the inTrentino products (i.e., their SKUs failed to match)
unmatchedProducts = np.intersect1d(new_products['Title'].str.capitalize().values,
inTrentino_products['Title'].str.capitalize().values)
print(f"\nMISMATCHED SKUs ({len(unmatchedProducts)}):\n-->", '\n--> '.join(unmatchedProducts))
# Product names
product_names = indaco_products[product_identifier].tolist()
# SKU mapping
sku_mapping = {item['SKU']: item['indaco_sku'] for item in indaco_products[['SKU', 'indaco_sku']].to_dict(orient = "records")}
return indaco_products, product_names, sku_mapping
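# Hypothetical call ('inTrentino_products' must expose the old 'SKU' and 'Title' columns):
#   products_df, product_names, sku_mapping = map_indacoSKUs(products_df, inTrentino_products, 'Title')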
def simplified_SKUs(indaco_products, product_identifier):
print("\n" + 120 * "-")
print("-" * 39, "Simplify the SKU", "-" * 38)
print(120 * "-", "\n")
# Build the simplified SKU from the first token of the INDACO SKU
indaco_products['SKU'] = indaco_products['indaco_sku'].apply(lambda indaco_sku: indaco_sku.split('-')[0].upper())
# Avoid duplicated simplified SKUs by including the second part
duplicated_products = indaco_products[indaco_products['SKU'].duplicated(keep = False)].index
indaco_products.loc[duplicated_products, 'SKU'] = indaco_products.loc[duplicated_products,'indaco_sku'].apply(
lambda indaco_sku: ''.join(indaco_sku.split('-')[:2]).upper())
# Product names
product_names = indaco_products[product_identifier].tolist()
# SKU mapping
sku_mapping = {item['SKU']: item['indaco_sku'] for item in indaco_products[['SKU', 'indaco_sku']].to_dict(orient = "records")}
return indaco_products, product_names, sku_mapping
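# Worked example: 'abc123-0500-bio' is simplified to 'ABC123'; if another SKU also starts
# with 'abc123', both are disambiguated with their second token, e.g., 'ABC1230500':
#   products_df, product_names, sku_mapping = simplified_SKUs(products_df, 'Title')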
def find_bestExplaination(bundle_explainations):
if len(bundle_explainations) == 0:
return ""
unique_explainations, counter = np.unique(bundle_explainations, return_counts = True)
# Find the most frequent explanations
most_frequent_explainations = unique_explainations[np.argwhere(counter == np.max(counter)).reshape(-1)]
# Approach A: pick the single most frequent explanation
if len(most_frequent_explainations) == 1:
most_frequent_explaination = most_frequent_explainations[0]
else:
# Approach B: pick the explanation that appears at the lowest ranks of the recommendations (i.e., the most generic explanation)
explainations_meanRankPos = {explaination: np.mean(np.where(bundle_explainations == explaination)[0])
for explaination in most_frequent_explainations}
explainations_meanRankPos = dict(sorted(explainations_meanRankPos.items(), key = lambda item: item[1], reverse = False))
most_frequent_explaination = list(explainations_meanRankPos.keys())[0]
return most_frequent_explaination
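# Worked example (a tie on frequency is broken by the lowest mean rank position):
#   find_bestExplaination(np.array(['a', 'a', 'b', 'b']))
#   'a' and 'b' both appear twice; 'a' occupies positions (0 + 1) / 2 = 0.5 on average,
#   'b' occupies (2 + 3) / 2 = 2.5, so 'a' is returned.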
def prepareData_for_saving(recommendations_byUser, sku_mapping, verbose):
recommendations_byUser = dict(sorted(recommendations_byUser.items()))
db_recommendations_byProduct = defaultdict(dict)
for user_id, recommendations_byProduct in recommendations_byUser.items():
for product_sku, linked_products in recommendations_byProduct.items():
if verbose:
print("user_id:", user_id)
print(f"PRODUCT: {product_sku} --> {len(linked_products)} linked products")
if user_id == -2:
allProducts = [sku_mapping[sku_product] for sku_product in linked_products]
db_recommendations_byProduct[product_sku]['all_linkedProducts'] = sorted(list(set(allProducts)))
db_recommendations_byProduct[product_sku]['linkedProducts_permutations'] = dict()
else:
# Retrieve the linked products
all_linkedProducts = db_recommendations_byProduct[product_sku]['all_linkedProducts']
# Retrieve the personalized order of the products (i.e., SKUs)
personalized_skuOrder = [sku_mapping[product['item_sku']] for product in linked_products]
# Retrieve the explanation of the bundle
bundle_explainations = np.array([product['explaination'] for product in linked_products])
personalized_explaination = find_bestExplaination(bundle_explainations)
if verbose:
print(f"\nSTANDARD ({len(all_linkedProducts)}):", dumps(all_linkedProducts, indent = 4))
print(f"\nPERSONALIZED ({len(personalized_skuOrder)}) [user: {user_id}]:",
dumps(personalized_skuOrder, indent = 4), "\n")
if user_id == -1:
user_id = 'generic_user'
# Generate the personalized permutation
user_permutation = [all_linkedProducts.index(sku) for sku in personalized_skuOrder]
db_recommendations_byProduct[product_sku]['linkedProducts_permutations'][user_id] = {
'product_permutation':user_permutation,
'explaination': personalized_explaination}
return db_recommendations_byProduct
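# Resulting structure (sketch, with hypothetical SKUs): one entry per reference product,
# holding the shared list of linked products plus one permutation per user, e.g.:
#   {'SKU1': {'all_linkedProducts': ['A', 'B', 'C'],
#             'linkedProducts_permutations': {
#                 'user42': {'product_permutation': [2, 0, 1], 'explaination': '...'}}}}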
def write_recommendations_to_mongodb(db_e, recommendations_byUser, sku_mapping, dropExistingCollection = True, verbose = False):
print("\n" + 90 * "-")
print("-" * 35, "Writing to MongoDB", "-" * 35)
print(90 * "-")
# Change the data representation and prepare the data to be written
db_recommendations_byProduct = prepareData_for_saving(recommendations_byUser, sku_mapping, verbose)
# Connect to the db
db = DbService("mongodb")
db.ENCRYPTING_KEY = db_e.ENCRYPTING_KEY
# Decrypt user ids
decryptedUsers = db_e.get_decryptedUsers(recommendations_byUser.keys())
# Collection name in which the recommendations are going to be saved
recommendation_collectionName = 'productbasedrecommendations'
if dropExistingCollection:
db.drop_existingTable(recommendation_collectionName)
print(f'Overwriting the existing collection "{recommendation_collectionName}"...\n')
else:
print(f'Writing the recommendations to the new collection "{recommendation_collectionName}"...\n')
# Write the items to MongoDB
for reference_sku, product_info in db_recommendations_byProduct.items():
# 0) Retrieve the object id of the reference item
referenceProduct_indacoId = db.get_dBproduct(reference_sku, as_dict = True)
if not referenceProduct_indacoId:
continue
referenceProduct_indacoId = referenceProduct_indacoId['_id']
# 1) Retrieve the object ids of all the linked products
linked_ids = []
for item_sku in product_info['all_linkedProducts']:
product_indacoId = db.get_dBproduct(item_sku, as_dict = True)
if product_indacoId:
product_indacoId = product_indacoId['_id']
linked_ids.append(product_indacoId)
# 2) Map the decrypted user ids
userPermutations = dict()
for user_id, user_permutation in product_info['linkedProducts_permutations'].items():
# Decrypt the user id
if user_id in decryptedUsers.keys():
decrypted_userId = decryptedUsers[user_id]
else:
decrypted_userId = user_id
userPermutations[decrypted_userId] = user_permutation
# 3) Write the item to MongoDB
new_dbItem = {'product_id': referenceProduct_indacoId, 'all_linkedProducts' : linked_ids, 'linkedProducts_permutations': userPermutations}
db.write_newDbItem(recommendation_collectionName, new_dbItem)
print(f"\nFinished, linked products added to {len(db_recommendations_byProduct)} products.\n")
def write_recommendations_to_mongodb_legacy(recommendations_byProduct, sku_mapping, verbose = False):
# Retrieve the linked products (SKUs) and the explanations
db_recommendations_byProduct = dict()
for reference_name, recommendations in recommendations_byProduct.items():
indaco_sku = sku_mapping[reference_name]
db_recommendations_byProduct[indaco_sku] = defaultdict(list)
for recommendation in recommendations:
recom_indaco_sku = sku_mapping[recommendation['item_sku']]
# Retrieve the similar products
if "similar_products" in recommendation.keys():
similar_products = [sku_mapping[sku_product] for sku_product in recommendation["similar_products"].keys()]
else:
similar_products = []
enveloped_recom = {'sku': recom_indaco_sku, 'similar_products': similar_products}
# Append the recommendations
db_recommendations_byProduct[indaco_sku]['linked_products'].append(enveloped_recom)
db_recommendations_byProduct[indaco_sku]['explainations'].append(recommendation['explaination'])
# Find the best explanation for the bundle
bundle_explainations = np.array(db_recommendations_byProduct[indaco_sku].pop('explainations'))
personalized_explaination = find_bestExplaination(bundle_explainations)
db_recommendations_byProduct[indaco_sku]['explaination'] = personalized_explaination
# Connect to the db
db = DbService("mongodb")
# Write to MongoDB
print("[LEGACY APPROACH] Writing linked products...\n")
for reference_sku, recommendations in db_recommendations_byProduct.items():
# Get info
linked_products = recommendations['linked_products']
explaination = recommendations['explaination']
# Retrieve the ids of the linked products
linked_ids = []
for idk, linked_product in enumerate(linked_products):
# Retrieve the product ID
item_sku = linked_product['sku']
try:
product_indacoId = db.get_dBproduct(item_sku, as_dict = True)['_id']
except Exception:
print(f"{item_sku} not found in the database!")
continue
# Retrieve the product IDs of the similar products
try:
similar_product_ids = [db.get_dBproduct(item_sku, as_dict = True)['_id']
for item_sku in linked_product['similar_products']]
except Exception:
similar_product_ids = []
print(f"[SIMILAR PRODUCTS] {item_sku} not found in the database!")
print("-->", product_indacoId)
db_product = {'rank': idk + 1, 'productId': product_indacoId, 'similarProducts': similar_product_ids}
linked_ids.append(db_product)
# Save the linked products (IDs)
db.update_attributeDbItem(object = reference_sku, attribute_name = "linkedProducts",
attribute_value = linked_ids)
# Save the explanation
db.update_attributeDbItem(object = reference_sku, attribute_name = "recomExplanation",
attribute_value = explaination)
if verbose:
print("\nREFERENCE:", reference_sku)
print(f"--> LINKED SKUs ({len(linked_products)}):-->"
, '\n--> '.join([item['sku'] + ': ' + ','.join(linked_product['similar_products'])
for item in linked_products]))
print("\n--> EXPLAINATION:", explaination)
print("\n--> Linked ids:", linked_ids)
if not verbose:
print(f"Finished, linked products added to {len(db_recommendations_byProduct)} products.")