# Path : /home/deltalab/PMS/recommendations/user_profiling/_library/
# File : /home/deltalab/PMS/recommendations/user_profiling/_library/mongodb_utils.py
import numpy as np
import pandas as pd
import pymongo
from os import path
from _library.io_toolkit import load_collectionTypes
def connect_to_mongodb(hostname, port, user, password, db_name, verbose = False):
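    """Open an authenticated connection to a MongoDB server and return the 'pms' database.

    The credentials are checked with SCRAM-SHA-256 against `db_name`, which acts as
    the authentication source; the returned handle always points to the 'pms' database.
    When `verbose` is True, the names of the available collections are printed.
    """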
print("\n" + 120 * "-")
print("-" * 42,f"Connecting to MongoDB ({hostname})", "-" * 41)
print(120 * "-", "\n")
    # Connect to the MongoDB server
db_server = pymongo.MongoClient(hostname, port, username=user, password=password, authSource=db_name, authMechanism='SCRAM-SHA-256')
    # Open the 'pms' database (db_name is only used as the authentication source above)
    indaco_db = db_server['pms']
if verbose:
collection_names = indaco_db.list_collection_names()
print(f"COLLECTIONS ({len(collection_names)}):", ', '.join(collection_names), "\n")
return indaco_db
def extractCategories(category_collection, language = 'it-IT'):
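    """Build a DataFrame of category metadata from the raw category documents.

    Each document is expected to carry a localized 'name' list, a 'googleId', an
    'isLeaf' flag and a 'parentId'; the parent id is resolved into the parent
    category's human-readable name. Returns one row per category with the columns
    ['categoryName', 'googleId', 'isLeaf', '_id', 'parent'].
    """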
all_types = dict()
type_mapping = dict()
for category in category_collection:
category_name = [name_item['label'] for name_item in category['name'] if name_item['code'] == language][0]
type_mapping[category['_id']] = category_name
all_types[category_name] = {
'googleId': -1 if category['googleId'] is None else int(category['googleId']),
'isLeaf':category['isLeaf'],
'_id': category['_id'],
'parentId': category['parentId']
}
    # Retrieve the category name of the parent
for type_metadata in all_types.values():
item_parentId = type_metadata.pop('parentId')
if item_parentId:
type_metadata['parent'] = type_mapping[item_parentId]
else:
type_metadata['parent'] = None
    all_types_df = pd.DataFrame.from_dict(all_types, orient = 'index')
all_types_df = all_types_df.reset_index()
all_types_df = all_types_df.rename(columns = {'index': 'categoryName'})
return all_types_df
def extractProducts(product_collection, consider_deletedProducts = False, consider_unavailableProducts = False, verbose = False):
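    """Convert the raw product documents into a tidy DataFrame.

    Deleted products (flag 'deleted') and unavailable products ('sellBelowZero'
    equal to False) are dropped unless the corresponding keyword argument is True.
    The nested attributes (production area, biologic/vegan/biodynamic/gluten-free/
    adult-only flags) and the warehouse ids are unpacked into dedicated columns,
    and the production area codes are translated into names through
    _library/INDACO_collectionCodes.json.
    """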
print("\n" + 90 * "-")
print("-" * 29,f"Extracting the INDACO products", "-" * 29)
print(90 * "-", "\n")
# Get all products
all_products = pd.DataFrame(product_collection)
# Improve readability of some attributes
all_products['title'] = all_products['title'].str.strip()
all_products['brand'] = all_products['brand'].str.strip()
all_products.rename(columns = {'weight': 'weight [grams]'}, inplace=True)
    # [FLAG: deleted] Optionally drop the products that have been deleted
    if not consider_deletedProducts:
        deleted_products = all_products.loc[all_products['deleted'] == True, 'title']
        all_products = all_products.drop(index = deleted_products.index)
        print(f'\nDeleted products ({len(deleted_products)}):', ', '.join(deleted_products.values))
    # [FLAG: sellBelowZero] Optionally drop the products that are not available
if not consider_unavailableProducts:
unavailable_products = all_products.loc[all_products['sellBelowZero'] == False, 'title']
all_products = all_products.drop(index = unavailable_products.index).reset_index(drop = True)
print(f'Unavailable products ({len(unavailable_products)}):', ', '.join(unavailable_products.values), '\n')
if verbose:
print(f"ATTRIBUTES ({len(all_products.columns)}):")
print('-->', '\n--> '.join(all_products.columns), "\n")
# [ATTRIBUTES] Extract the production areas
all_products['production_areas'] = all_products['attributes'].apply(
lambda attributes: [att['value'] for att in attributes
if att['attribute']['name'] == "indaco_general_productionarea"])
    all_products['production_areas'] = all_products['production_areas'].apply(
        lambda values: values[0] if (len(values) > 0) and values[0] is not None else -1)
    # [ATTRIBUTES] Extract the biologic, vegan, biodynamic, gluten-free and adult-only flags.
    # Every flag follows the same pattern: collect the values of the matching attribute,
    # then keep the first one (defaulting to 0 when the attribute is missing).
    flag_attributes = {
        'biologic': 'indaco_generalfood_biological',
        'vegan': 'indaco_generalfood_vegan',
        'biodinamic': 'indaco_generalfood_biodinamic',
        'gluten_free': 'indaco_generalfood_glutenfree',
        'isforadult': 'indaco_general_isforadult'
    }
    for column_name, attribute_name in flag_attributes.items():
        # The default argument binds the current attribute name inside the lambda
        all_products[column_name] = all_products['attributes'].apply(
            lambda attributes, name = attribute_name: [att['value'] for att in attributes
                                                       if att['attribute']['name'] == name])
        all_products[column_name] = all_products[column_name].apply(
            lambda values: 0 if len(values) == 0 else values[0])
# [Warehouse]
all_products['warehouse_id'] = all_products['inventoryLevels'].apply(
lambda items: [item['warehouseId'] for item in items if 'warehouseId' in item])
    # Convert the production area codes into names
file_path = path.join('_library', 'INDACO_collectionCodes.json')
_, collectionTypes = load_collectionTypes(file_path, verbose = False)
productionArea_codes = collectionTypes['production_areas']
    all_products['production_areas'] = all_products['production_areas'].apply(
        lambda area_code:
            productionArea_codes[str(area_code)].capitalize()
            if str(area_code) in productionArea_codes
            else (f"Unknown (code:{area_code})" if area_code != -1 else "")
    )
    # Keep only the columns of interest
selected_columns = ['title', 'sku', 'brand', 'refrigerated', 'weight [grams]',
'_id', 'categoryId', 'partnerId', 'warehouse_id', 'production_areas','biologic','vegan',
'biodinamic','gluten_free','isforadult']
all_products = all_products[selected_columns]
return all_products
def enhanced_products_df(products, categories, sellers, warehouses, orders):
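    """Join the products with their sellers, categories and warehouses, then add derived attributes.

    Missing categories are patched with a small manual title-based mapping, the
    'inTrentino_source' flag marks the items sold by "Vendi24", and 'Frequency' is
    the fraction of distinct transactions in `orders` that contain each SKU.
    """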
products = products.merge(sellers, left_on = "partnerId", right_on = "_id")
enhanced_df = products.merge(categories, how = 'left', left_on = "categoryId", right_on = "_id")
# Retrieve the warehouse names
warehouse_names = {item['_id']: item['name'] for item in warehouses.to_dict(orient = 'records')}
    enhanced_df['warehouses'] = enhanced_df['warehouse_id'].apply(
        lambda warehouse_ids: [warehouse_names[warehouse_id] for warehouse_id in warehouse_ids])
# Fill the categories
# 1) Manual mapping
manual_catMapping = {
'affogato di sabbionara': 'Formaggio',
'casolét val di sole': 'Formaggio',
'cuor di fassa': 'Formaggio',
'fontal di cavalese': 'Formaggio',
'mezzano trentino di alta montagna': 'Formaggio',
'primiero fresco': 'Formaggio',
'puzzone di moena dop': 'Formaggio',
'trentingrana 1kg': 'Formaggio',
'trentingrana 250gr': 'Formaggio'
}
nanCond = enhanced_df['categoryName'].isnull()
    enhanced_df.loc[nanCond, 'categoryName'] = enhanced_df.loc[nanCond, 'title'].str.lower().apply(
        lambda product_name: manual_catMapping.get(product_name, np.nan))
# 2) Fill with the default name
emptyCategory_value = ''
enhanced_df['categoryName'] = enhanced_df['categoryName'].fillna(value = emptyCategory_value)
# 3) Fill the google id
enhanced_df['googleId'] = enhanced_df['googleId'].fillna(value = -1)
# Select only useful columns
enhanced_df.drop(columns = ['_id_y', 'isLeaf'], inplace = True)
enhanced_df.rename(columns = {'_id_x': '_id'}, inplace = True)
    # Report the products that went missing during the merges
    if len(enhanced_df) != len(products):
        discarded_products = np.setdiff1d(products["title"].to_numpy(),
                                          enhanced_df["title"].to_numpy())
        print(f'{len(discarded_products)} products have been discarded due to an issue with the "categoryId"')
        print("-" * 75)
        print("-->", "\n--> ".join(sorted(discarded_products)))
item_w_unknownCategory = sorted(enhanced_df.loc[enhanced_df['categoryName'] == emptyCategory_value, 'title'].tolist())
if len(item_w_unknownCategory) > 0:
print("-" * 70, "\n" + "-" * 70)
        print(f'[WARNING] The attribute "categoryId" has not been set for {len(item_w_unknownCategory)} products. \n'
              f'It has been filled with a category equal to "{emptyCategory_value}"')
        print("-" * 70, "\n" + "-" * 70)
        print("-->", "\n--> ".join(['(' + str(idx + 1) + ') ' + item
                                    for idx, item in enumerate(item_w_unknownCategory)]))
    # Rename the columns to their final, human-readable names
enhanced_df.rename(
columns = {
'title': 'Title',
'sku': 'indaco_sku',
'brand' : 'Vendor',
'categoryName': 'Product Type',
'googleId': 'Type id',
'companyName': 'Seller',
'_id': 'productId'},
inplace = True)
# [NEW ATTRIBUTE] inTrentino flag
    enhanced_df['inTrentino_source'] = enhanced_df['Seller'].apply(lambda name: name == "Vendi24")
# [NEW ATTRIBUTE] Item frequency within transactions/orders
num_transactions = len(orders['Transaction id'].unique())
normalizedFreq_func = lambda sku: len(orders.loc[orders['sku'] == sku, 'Transaction id'].unique()) / num_transactions
enhanced_df['Frequency'] = enhanced_df['indaco_sku'].apply(lambda sku: np.round(normalizedFreq_func(sku), 4))
# Minor normalization
#enhanced_df['indaco_sku'] = enhanced_df['indaco_sku'].str.capitalize()
enhanced_df['productId'] = enhanced_df['productId'].astype(str)
enhanced_df['Vendor'] = enhanced_df['Vendor'].fillna(value = "")
# Reorder columns
relevant_columns = ['Vendor', 'Title', 'Product Type', 'Type id', 'refrigerated', 'weight [grams]', 'warehouses',
'Seller', 'production_areas', 'inTrentino_source', 'Frequency', 'indaco_sku','biologic',
'isforadult','vegan','biodinamic','gluten_free'] #,'productId'
enhanced_df = enhanced_df[relevant_columns]
return enhanced_df
def simplified_SKUs(indaco_products, product_identifier):
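    """Derive a simplified 'SKU' column from 'indaco_sku'.

    The simplified SKU is the first dash-separated segment, uppercased; when two
    products collide, the second segment is appended to disambiguate them. Returns
    the updated DataFrame, the list of product identifiers (the `product_identifier`
    column) and a {SKU: indaco_sku} mapping.
    """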
print("\n" + 120 * "-")
print("-" * 39, "Simplify the SKU", "-" * 38)
print(120 * "-", "\n")
    # Build the simplified SKU: the first dash-separated segment, uppercased
    indaco_products['SKU'] = indaco_products['indaco_sku'].apply(lambda indaco_sku: indaco_sku.split('-')[0].upper())
# Avoid duplicated simplified SKUs by including the second part
duplicated_products = indaco_products[indaco_products['SKU'].duplicated(keep = False)].index
indaco_products.loc[duplicated_products, 'SKU'] = indaco_products.loc[duplicated_products,'indaco_sku'].apply(
lambda indaco_sku: ''.join(indaco_sku.split('-')[:2]).upper())
# Product names
product_names = indaco_products[product_identifier].tolist()
# Sku mapping
sku_mapping = {item['SKU']: item['indaco_sku'] for item in indaco_products[['SKU', 'indaco_sku']].to_dict(orient = "records")}
return indaco_products, product_names, sku_mapping
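
# Example pipeline (a minimal sketch, not part of the library): it chains the utilities
# above in their intended order. The hostname, credentials and collection names
# ('categories', 'products', 'sellers', 'warehouses') are illustrative assumptions,
# as is the shape of `orders`; adapt them to the actual deployment before running.
if __name__ == '__main__':
    indaco_db = connect_to_mongodb('localhost', 27017, 'reader', 'secret', 'pms', verbose = True)
    categories = extractCategories(indaco_db['categories'].find())            # assumed collection name
    products = extractProducts(indaco_db['products'].find(), verbose = True)  # assumed collection name
    sellers = pd.DataFrame(indaco_db['sellers'].find())                       # assumed collection name
    warehouses = pd.DataFrame(indaco_db['warehouses'].find())                 # assumed collection name
    # `orders` must expose the columns 'Transaction id' and 'sku'; a placeholder row
    # stands in for the real order history built elsewhere in the pipeline.
    orders = pd.DataFrame({'Transaction id': ['T-0'], 'sku': ['SKU-0']})
    enhanced = enhanced_products_df(products, categories, sellers, warehouses, orders)
    enhanced, product_names, sku_mapping = simplified_SKUs(enhanced, product_identifier = 'Title')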