Your IP : 216.73.217.13


Current Path : /home/deltalab/PMS/recommendations/recommender-system-batch/_library/data_utils/
Upload File :
Current File : //home/deltalab/PMS/recommendations/recommender-system-batch/_library/data_utils/mongodb_utils.py

import numpy as np
import pandas as pd
import pymongo
from bson import ObjectId
from collections import defaultdict
from os import path
from json import dumps, load

from components.DbService import DbService

def connect_to_mongodb(hostname, port, user, password, db_name, verbose = False):
    """Open an authenticated connection to a MongoDB server and return the
    handle to the requested database.

    Parameters
    ----------
    hostname, port : server address.
    user, password : credentials, authenticated against `db_name` with
        SCRAM-SHA-256.
    db_name : name of the database to open (also used as authSource).
    verbose : when True, print the collection names found in the database.
    """
    print("\n" + 120 * "-")
    print("-" * 42,f"Connecting to MongoDB ({hostname})", "-" * 41)
    print(120 * "-", "\n")

    # Authenticate directly against the target database
    client = pymongo.MongoClient(hostname, port, username=user, password=password,
                                 authSource=db_name, authMechanism='SCRAM-SHA-256')

    database = client[db_name]

    if verbose:
        names = database.list_collection_names()
        print(f"COLLECTIONS ({len(names)}):", ', '.join(names), "\n")

    return database

def extractCategories(category_collection, language = 'it-IT'):
    """Turn raw category documents into a flat DataFrame.

    Parameters
    ----------
    category_collection : iterable of dict
        Category documents with '_id', 'name' (list of {'code', 'label'}),
        'googleId', 'isLeaf' and 'parentId' fields.
    language : str
        Language code used to pick the category label
        (raises IndexError if a category has no label for it).

    Returns
    -------
    pd.DataFrame with columns: categoryName, googleId (-1 when unset),
    isLeaf, _id, parent (parent category name, or None for roots).
    """
    id_to_name = {}
    records = {}

    for cat in category_collection:
        # Label in the requested language (assumed to exist)
        labels = [entry['label'] for entry in cat['name'] if entry['code'] == language]
        label = labels[0]

        id_to_name[cat['_id']] = label
        records[label] = {
            'googleId': int(cat['googleId']) if cat['googleId'] is not None else -1,
            'isLeaf': cat['isLeaf'],
            '_id': cat['_id'],
            'parentId': cat['parentId'],
            }

    # Resolve each parentId into the parent's category name
    for meta in records.values():
        parent_id = meta.pop('parentId')
        meta['parent'] = id_to_name[parent_id] if parent_id else None

    frame = pd.DataFrame.from_dict(records, orient = 'index').reset_index()
    return frame.rename(columns = {'index': 'categoryName'})

def _first_attribute_value(attributes, attribute_name, default = 0, treat_none_as_missing = False):
    """Return the first value of `attribute_name` in a product's attribute
    list, or `default` when the attribute is absent (or None, when
    `treat_none_as_missing` is set)."""
    values = [att['value'] for att in attributes if att['attribute']['name'] == attribute_name]
    if len(values) == 0:
        return default
    if treat_none_as_missing and values[0] is None:
        return default
    return values[0]

def extractProducts(product_collection, consider_delatedProducts = False, consider_unavailableProducts = False,verbose = False):
    """Load the INDACO product collection into a tidy DataFrame.

    Parameters
    ----------
    product_collection : iterable of dict
        Raw product documents (e.g. a pymongo cursor or list of dicts).
    consider_delatedProducts : bool
        When False (default), drop products flagged 'deleted'.
    consider_unavailableProducts : bool
        When False (default), drop products with 'sellBelowZero' == False.
    verbose : bool
        Print the available attribute (column) names.

    Returns
    -------
    pd.DataFrame with one row per retained product, restricted to the
    columns listed in `selected_columns`. Production area codes are
    translated via '_library/INDACO_collectionCodes.json'.
    """
    print("\n" + 90 * "-")
    print("-" * 29,f"Extracting the INDACO products", "-" * 29)
    print(90 * "-", "\n")

    all_products = pd.DataFrame(product_collection)

    # Normalize free-text fields and clarify the weight unit
    all_products['title'] = all_products['title'].str.strip()
    all_products['brand'] = all_products['brand'].str.strip()
    all_products.rename(columns = {'weight': 'weight [grams]'}, inplace=True)

    # [FLAG: deleted] Optionally drop the products that have been deleted
    if not consider_delatedProducts:
        delated_products = all_products.loc[all_products['deleted'] == True, 'title']
        all_products = all_products.drop(index = delated_products.index)

        print(f'\nDeleted products ({len(delated_products)}):', ', '.join(delated_products.values))

    # [FLAG: sellBelowZero] Optionally drop the products that are not available
    if not consider_unavailableProducts:
        unavailable_products = all_products.loc[all_products['sellBelowZero'] == False, 'title']
        all_products = all_products.drop(index = unavailable_products.index).reset_index(drop = True)
        print(f'Unavailable products ({len(unavailable_products)}):', ', '.join(unavailable_products.values), '\n')

    if verbose:
        print(f"ATTRIBUTES ({len(all_products.columns)}):")
        print('-->', '\n--> '.join(all_products.columns), "\n")

    # [ATTRIBUTES] Production area: first declared value, -1 when absent or None
    all_products['production_areas'] = all_products['attributes'].apply(
        lambda attributes: _first_attribute_value(
            attributes, "indaco_general_productionarea", default = -1, treat_none_as_missing = True))

    # [WAREHOUSE] All warehouse ids holding the product
    all_products['warehouse_id'] = all_products['inventoryLevels'].apply(
        lambda items: [item['warehouseId'] for item in items if 'warehouseId' in item])

    # [ATTRIBUTES] Boolean-like product flags (0 when the attribute is absent)
    flag_columns = {
        'biologic': "indaco_generalfood_biological",
        'vegan': "indaco_generalfood_vegan",
        'biodinamic': "indaco_generalfood_biodinamic",
        'gluten_free': "indaco_generalfood_glutenfree",
        'isforadult': "indaco_general_isforadult",
    }
    for column_name, attribute_name in flag_columns.items():
        # `name = attribute_name` binds per-iteration (late-binding closure pitfall)
        all_products[column_name] = all_products['attributes'].apply(
            lambda attributes, name = attribute_name: _first_attribute_value(attributes, name))

    # Convert the production area codes into names
    file_path = path.join('_library', 'INDACO_collectionCodes.json')
    with open(file_path) as json_file:
        productionArea_codes = load(json_file)['production_areas']

    # Chained conditional: known code -> capitalized name;
    # unknown non-sentinel code -> "Unknown (...)"; sentinel -1 -> empty string
    all_products['production_areas'] = all_products['production_areas'].apply(
        lambda area_code: 
            productionArea_codes[str(area_code)].capitalize() 
            if str(area_code) in productionArea_codes.keys() else f"Unknown (code:{area_code})"
            if area_code != -1 else "")

    # Filter and keep only the columns of interest
    selected_columns = ['title', 'sku', 'brand', 'refrigerated', 'weight [grams]', 
                        '_id', 'categoryId', 'partnerId', 'warehouse_id', 'production_areas','biologic','vegan',
                        'biodinamic','gluten_free','isforadult']
    all_products = all_products[selected_columns]

    return all_products

def enhanced_products_df(products, categories, sellers, warehouses, orders):
    """Enrich the product DataFrame with seller, category and warehouse
    metadata plus derived columns (inTrentino flag, purchase frequency).

    Parameters
    ----------
    products : pd.DataFrame
        Output of extractProducts ('partnerId', 'categoryId', 'warehouse_id', ...).
    categories : pd.DataFrame
        Output of extractCategories ('_id', 'categoryName', 'googleId', 'isLeaf').
    sellers : pd.DataFrame
        Must contain '_id' and 'companyName'.
    warehouses : pd.DataFrame
        Must contain '_id' and 'name'.
    orders : pd.DataFrame
        Must contain 'Transaction id' and 'sku'; used to compute each
        product's normalized purchase frequency.

    Returns
    -------
    pd.DataFrame restricted and reordered to `relevant_columns`, with
    renamed presentation-ready column names (Title, Vendor, ...).
    """
    # Sellers: inner join (products without a matching seller are dropped);
    # categories: left join so products with an unknown categoryId survive.
    products = products.merge(sellers, left_on = "partnerId", right_on = "_id", suffixes = ('', '_seller'))
    enhanced_df = products.merge(categories, how = 'left', left_on = "categoryId", right_on = "_id", suffixes = ('', '_category'))
    enhanced_df.drop(columns = ['_id_seller', '_id_category', 'isLeaf'], inplace = True)    
    
    # 1) Map each warehouse id to its human-readable name
    warehouse_names = {item['_id']: item['name'] for item in warehouses.to_dict(orient = 'records')}
    enhanced_df['warehouses'] = enhanced_df['warehouse_id'].apply(
        lambda warehouses: [warehouse_names[warehouse_id] for warehouse_id in warehouses])
    
    # 2) Fill missing category names with the default (empty) name
    emptyCategory_value = ''
    enhanced_df['categoryName'] = enhanced_df['categoryName'].fillna(value = emptyCategory_value)

    # 3) Fill the google id of unmatched categories with the sentinel -1
    enhanced_df['googleId'] = enhanced_df['googleId'].fillna(value = -1)
    
    # Report products lost by the joins (compared to the seller-joined frame)
    if len(enhanced_df) != len(products): 
        discarted_products = np.setdiff1d(products["title"].to_numpy(),
                                          enhanced_df["title"].to_numpy())
        
        print(f'{len(discarted_products)} products have been discarted due to an issue with the "categoryId"')
        print("-" * 75)
        print("-->", "\n--> ".join(sorted(discarted_products)))
        
    # Warn about products whose categoryId resolved to no category name
    item_w_unknownCategory = sorted(enhanced_df.loc[enhanced_df['categoryName'] == emptyCategory_value, 'title'].tolist())
    if len(item_w_unknownCategory) > 0:
        print("-" * 70, "\n" + "-" * 70)
        print(f'[WARNING] The attribute "categoryId" has not been set to {len(item_w_unknownCategory)} products. \n'\
            f'It has been filled with a category equal to "{emptyCategory_value}"')
        print("-" * 70, "\n" + "-" * 70)
        print("-->", "\n--> ".join(['(' + str(idk + 1) + ') '+ item 
                                           for idk, item in enumerate(item_w_unknownCategory)]))
    
    # Rename to the presentation-ready column names
    enhanced_df.rename(
        columns = {
            'title': 'Title',
            'sku': 'indaco_sku',
            'brand' : 'Vendor',
            'categoryName': 'Product Type',
            'googleId': 'Type id',
            'companyName': 'Seller',
            '_id': 'productId'}, 
        inplace = True)
    
    # [NEW ATTRIBUTE] inTrentino flag: True only for the "Vendi24" seller
    enhanced_df['inTrentino_source'] = enhanced_df['Seller'].apply(lambda name: True if name  == "Vendi24" else False)
    
    # [NEW ATTRIBUTE] Share of distinct transactions that include the product
    num_transactions = len(orders['Transaction id'].unique())
    normalizedFreq_func =  lambda sku: len(orders.loc[orders['sku'] == sku, 'Transaction id'].unique()) / num_transactions
    enhanced_df['Frequency'] = enhanced_df['indaco_sku'].apply(lambda sku: np.round(normalizedFreq_func(sku), 4))
   
    # Minor normalization
    #enhanced_df['indaco_sku'] = enhanced_df['indaco_sku'].str.capitalize()
    enhanced_df['productId'] = enhanced_df['productId'].astype(str) 
    enhanced_df['Vendor'] = enhanced_df['Vendor'].fillna(value = "")
    
    # Reorder columns 
    relevant_columns = ['Vendor', 'Title', 'Product Type', 'Type id', 'refrigerated', 'weight [grams]', 'warehouses', 
                        'Seller', 'production_areas', 'inTrentino_source', 'Frequency', 'indaco_sku','biologic',
                        'isforadult','vegan','biodinamic','gluten_free'] #,'productId'
    enhanced_df = enhanced_df[relevant_columns]

    return enhanced_df

def enhanceCustomerProfiles(userProfiles, platfromProducts, platfromCategories):
    """Replace raw ids/codes in the user-profile DataFrame with readable names.

    Parameters
    ----------
    userProfiles : pd.DataFrame
        Modified in place (and returned). Needs the columns
        'production_areas' (lists of codes, NaN allowed), 'categories'
        (lists of category-id strings), 'unique_products' and
        'shopping_baskets' (lists / dicts of product ids).
    platfromProducts : pd.DataFrame
        Must contain '_id' and 'sku'.
    platfromCategories : pd.DataFrame
        Must contain '_id' (ObjectId) and 'categoryName'.

    Returns
    -------
    The same userProfiles DataFrame, with ids/codes replaced by names/skus.
    """
    file_path = path.join('_library', 'INDACO_collectionCodes.json')
    with open(file_path) as json_file:
        collectionTypes = load(json_file)
        productionArea_codes = collectionTypes['production_areas']

        # (A) production areas: translate codes; NaN rows are left untouched
        # (dropna + index-aligned assignment)
        userProfiles['production_areas'] = userProfiles['production_areas'].dropna().apply(
            lambda productionAreasCodes: [productionArea_codes[code] for code in productionAreasCodes])

    # (B) categories: ObjectId -> category name (sorted for stable output).
    # set_index(...).to_dict() builds the lookup without mutating a slice of
    # the caller's frame (the previous in-place .index assignment triggered
    # pandas' SettingWithCopyWarning).
    category_mapping = platfromCategories.set_index('_id')['categoryName'].to_dict()

    userProfiles['categories'] = userProfiles['categories'].apply(
        lambda categoryIds: sorted([category_mapping[ObjectId(categoryId)] for categoryId in categoryIds]))

    # (C) unique products: product id (stringified) -> sku
    product_mapping = dict(zip(platfromProducts['_id'].map(str), platfromProducts['sku']))
    userProfiles['unique_products'] = userProfiles['unique_products'].apply(
        lambda productIds: sorted([product_mapping[str(productId)] for productId in productIds]))

    # (D) shopping baskets: replace the product ids by skus inside each basket
    userProfiles['shopping_baskets'] = userProfiles['shopping_baskets'].apply(
        lambda shopping_baskets: {trasactionId: sorted([product_mapping[str(productId)] for productId in shopping_basket]) 
                                    for trasactionId, shopping_basket in shopping_baskets.items()})

    return userProfiles

def map_indacoSKUs(indaco_products, inTrentino_products, product_identifier):
    """Attach the legacy ("old") SKUs to the INDACO products.

    An INDACO sku matches an old sku when the old sku is a case-insensitive
    substring of the INDACO one. Products with no (or multiple) matches fall
    back to the first dash-separated token of their INDACO sku; no-match
    products are reported as "new".

    Parameters
    ----------
    indaco_products : pd.DataFrame
        Needs 'indaco_sku', 'Title', 'Product Type'; modified in place
        (a 'SKU' column is inserted at position 1) and also returned.
    inTrentino_products : pd.DataFrame
        Legacy catalogue with 'SKU' and 'Title' columns.
    product_identifier : str
        Column whose values are returned as `product_names`.

    Returns
    -------
    (indaco_products, product_names, sku_mapping) where sku_mapping maps the
    resolved SKU to the original INDACO sku.
    """
    print("\n" + 120 * "-") 
    print("-" * 39, "Mapping the OLD SKUs with the INDACO SKUs", "-" * 38)
    print(120 * "-", "\n")

    old_SKUs = inTrentino_products['SKU'].map(str.lower).values

    # For every INDACO sku, collect all the old skus contained in it
    mapping_func = lambda new_sku: [old_sku for old_sku in old_SKUs if old_sku in new_sku]
    mapped_SKUs = indaco_products['indaco_sku'].map(str.lower).apply(mapping_func)
    indaco_products.insert(loc = 1, column = 'SKU', value = mapped_SKUs)
    
    # Products with no old-sku match are new
    idk_new_products = indaco_products[indaco_products['SKU'].apply(len) == 0].index 
    
    # Resolve each SKU: a unique match keeps the old sku; otherwise (0 or >1
    # matches) fall back to the first token of the INDACO sku
    indaco_products['SKU'] = indaco_products.apply(
        func = lambda df_row: df_row['SKU'][0].upper() if len(df_row['SKU']) == 1 \
            else df_row['indaco_sku'].split('-')[0].upper(), 
        axis = 1)
    
    # Visualize new products
    new_products  = indaco_products.loc[idk_new_products, ['Title', 'Product Type', 'SKU']]\
        .sort_values(by = ['Product Type', 'Title'])

    print("-" * 90, "\n" + "-" * 90)
    print("-" * 25, f'[INFO] {len(new_products)} new products have been found', "-" * 25)
    print("-" * 90, "\n" + "-" * 90)
    # BUGFIX: use enumerate for the running number — iterrows() yields the
    # index *labels*, which are arbitrary after filtering and sorting
    print("-->", "\n--> ".join([f"({pos + 1}) [{item['Product Type']}] {item['Title']} ({item['SKU']})"
                                        for pos, (_, item) in enumerate(new_products.iterrows())]))
    print("-" * 90, "\n" + "-" * 90)
    print("-" * 90, "\n" + "-" * 90, "\n")
    
    # New products whose title also exists in the legacy catalogue: their
    # SKUs should have matched but did not
    unmatchedProducts = np.intersect1d(new_products['Title'].str.capitalize().values, 
                                      inTrentino_products['Title'].str.capitalize().values)
    print(f"\nMISMATCHED SKUs ({len(unmatchedProducts)}):\n-->", '\n--> '.join(unmatchedProducts))
       
    # Product names
    product_names = indaco_products[product_identifier].tolist()
    
    # SKU -> INDACO sku lookup
    sku_mapping = {item['SKU']: item['indaco_sku'] for item in indaco_products[['SKU', 'indaco_sku']].to_dict(orient = "records")}
    
    return indaco_products, product_names, sku_mapping

def simplified_SKUs(indaco_products, product_identifier):
    """Derive a short 'SKU' column from the full INDACO sku.

    The simplified SKU is the first dash-separated token, upper-cased;
    when two products collide, the first two tokens are concatenated
    (without the dash) to disambiguate. The frame is modified in place
    and also returned.

    Returns (indaco_products, product_names, sku_mapping) where
    sku_mapping maps the simplified SKU back to the full INDACO sku.
    """
    print("\n" + 120 * "-") 
    print("-" * 39, "Simplify the SKU", "-" * 38)
    print(120 * "-", "\n")

    # Shorten every sku to its first dash-separated token
    indaco_products['SKU'] = indaco_products['indaco_sku'].apply(
        lambda full_sku: full_sku.split('-')[0].upper())

    # Disambiguate collisions by joining the first two tokens instead
    clash_index = indaco_products.index[indaco_products['SKU'].duplicated(keep = False)]
    indaco_products.loc[clash_index, 'SKU'] = indaco_products.loc[clash_index, 'indaco_sku'].apply(
        lambda full_sku: ''.join(full_sku.split('-')[:2]).upper())

    product_names = indaco_products[product_identifier].tolist()

    pairs = indaco_products[['SKU', 'indaco_sku']].to_dict(orient = "records")
    sku_mapping = {row['SKU']: row['indaco_sku'] for row in pairs}

    return indaco_products, product_names, sku_mapping

def find_bestExplaination(bundle_explainations):
    """Pick a single representative explanation for a recommendation bundle.

    Strategy: the most frequent explanation wins; ties are broken by the
    lowest mean rank position (i.e. the most generic explanation, appearing
    earliest on average in the recommendation list).

    Parameters
    ----------
    bundle_explainations : array-like of str
        One explanation per recommended item, in rank order.

    Returns
    -------
    str — the chosen explanation, or "" for an empty input.
    """
    # Normalize to ndarray: the tie-break below relies on elementwise ==,
    # which a plain list would not provide
    bundle_explainations = np.asarray(bundle_explainations)

    if len(bundle_explainations) == 0:
        return ""
    
    unique_explainations, counter = np.unique(bundle_explainations, return_counts = True)
    
    # Find the most frequent explanation(s)
    most_frequent_explainations = unique_explainations[np.argwhere(counter == np.max(counter)).reshape(-1)]

    # Approach A: a single winner by frequency
    if len(most_frequent_explainations) == 1:
        most_frequent_explaination = most_frequent_explainations[0] 
    else:
        # Approach B: among the tied candidates, pick the one with the lowest
        # mean rank position (i.e., the most generic explanation)
        explainations_meanRankPos = {explaination: np.mean(np.where(bundle_explainations == explaination))
                                    for explaination in most_frequent_explainations}
        explainations_meanRankPos = dict(sorted(explainations_meanRankPos.items(), key = lambda item: item[1], reverse = False))
        most_frequent_explaination = list(explainations_meanRankPos.keys())[0]
        
    return most_frequent_explaination


def prepareData_for_saving(recommendations_byUser, sku_mapping, verbose):
    """Re-shape per-user recommendations into a per-product structure ready
    to be written to MongoDB.

    recommendations_byUser maps user_id -> {product_sku -> linked products}.
    Two sentinel user ids are handled specially:
      * -2: the non-personalized catalogue; its entries define the reference
            list 'all_linkedProducts' for each product.
      * -1: the generic user; stored under the key 'generic_user'.
    Every other user is stored as an index permutation of
    'all_linkedProducts' plus the bundle's best explanation.

    NOTE: sorting the dict first is essential — the -2 sentinel must be
    processed before any other user so that 'all_linkedProducts' already
    exists when the permutations are computed (assumes the user ids are
    mutually comparable, e.g. all ints at this stage).
    """
    recommendations_byUser = dict(sorted(recommendations_byUser.items()))
    
    db_recommendations_byProduct = defaultdict(dict)
    for user_id, recommendations_byProduct in recommendations_byUser.items():
            
        for product_sku, linked_products in recommendations_byProduct.items():
            
            if verbose:
                print("user_id:", user_id)
                print(f"PRODUCT: {product_sku} --> {len(linked_products)} linked products")
            
            if user_id == -2:
                # Reference (non-personalized) list: deduplicated and sorted
                allProducts = [sku_mapping[sku_product] for sku_product in linked_products]
                db_recommendations_byProduct[product_sku]['all_linkedProducts'] = sorted(list(set(allProducts)))
                db_recommendations_byProduct[product_sku]['linkedProducts_permutations'] = dict()
            
            else:
                
                # Retrieve the linked products (filled by the -2 pass above)
                all_linkedProducts = db_recommendations_byProduct[product_sku]['all_linkedProducts']
                # Retrieve the personalized order of the product (i.e., skus)
                personalized_skuOrder = [sku_mapping[product['item_sku']] for product in linked_products]
                
                # Retrieve the explanation of the bundle
                bundle_explainations = np.array([product['explaination'] for product in linked_products])
                personalized_explaination = find_bestExplaination(bundle_explainations)
                
                if verbose:
                    print(f"\nSTANDARD ({len(all_linkedProducts)}):", dumps(all_linkedProducts, indent = 4))
                    print(f"\nPERSONALIZED ({len(personalized_skuOrder)}) [user: {user_id}]:", 
                        dumps(personalized_skuOrder, indent = 4), "\n")
    
                if user_id == -1:
                    user_id = 'generic_user'
                
                # Encode the personalized order as indices into all_linkedProducts
                user_permutation = [all_linkedProducts.index(sku) for sku in personalized_skuOrder]
                db_recommendations_byProduct[product_sku]['linkedProducts_permutations'][user_id] = {
                    'product_permutation':user_permutation, 
                    'explaination': personalized_explaination}     

    return db_recommendations_byProduct
 
def write_recommendations_to_mongodb(db_e,recommendations_byUser, sku_mapping, dropExistingCollection = True, verbose = False):
    """Persist the per-product recommendations to the
    'productbasedrecommendations' MongoDB collection.

    Parameters
    ----------
    db_e : DbService-like object
        Supplies the encryption key and the user-id decryption lookup.
    recommendations_byUser : dict
        user_id -> {product_sku -> linked products}; re-shaped through
        prepareData_for_saving before writing.
    sku_mapping : dict
        Internal sku -> INDACO sku.
    dropExistingCollection : bool
        When True (default), drop the target collection before writing.
    verbose : bool
        Forwarded to prepareData_for_saving for debug prints.
    """
    print("\n" + 90 * "-")
    print("-" * 35,f"Writing to MongoDB", "-" * 35)
    print(90 * "-")
    
    # Change the data representation and prepare data to be written
    db_recommendations_byProduct = prepareData_for_saving(recommendations_byUser, sku_mapping, verbose)
            
    # Connect to the db, reusing the caller's encryption key
    db = DbService("mongodb")
    db.ENCRYPTING_KEY = db_e.ENCRYPTING_KEY
    # Decrypt user ids (encrypted id -> stored id)
    decryptedUsers = db_e.get_decryptedUsers(recommendations_byUser.keys())
    
    # Collection name in which the recommendations are going to be saved
    recommendation_collectionName = 'productbasedrecommendations'
    
    if dropExistingCollection:
        db.drop_existingTable(recommendation_collectionName)
        print(f'Overwriting the existing collection "{recommendation_collectionName}"...\n')
    else:
        print(f'Writing the recommendations to the new collection "{recommendation_collectionName}"...\n')
    
    # Write the items to the mongoDB
    for reference_sku, product_info in db_recommendations_byProduct.items():

        # 0) Retrieve the object id of the reference item; skip products
        # that cannot be resolved in the database
        referenceProduct_indacoId = db.get_dBproduct(reference_sku, as_dict = True)
        if not referenceProduct_indacoId:
            continue
        referenceProduct_indacoId = referenceProduct_indacoId['_id']

        # 1) Retrieve the object ids of all the linked products
        # (unresolvable skus are silently skipped)
        linked_ids = []
        for item_sku in product_info['all_linkedProducts']:
            product_indacoId = db.get_dBproduct(item_sku, as_dict = True)
    
            if product_indacoId:
                product_indacoId = product_indacoId['_id']
                linked_ids.append(product_indacoId)
                
        # 2) Map the decrypted user ids (ids without a decryption entry,
        # e.g. 'generic_user', are kept as-is)
        userPermutations = dict()
        for user_id, user_permutation in product_info['linkedProducts_permutations'].items():

            # Decrypt the user id
            if user_id in decryptedUsers.keys():
                decrypted_userId = decryptedUsers[user_id]
            else:
                decrypted_userId = user_id 
            userPermutations[decrypted_userId] = user_permutation
        
        # 3) Write the item to the mongoDB
        new_dbItem = {'product_id': referenceProduct_indacoId, 'all_linkedProducts' : linked_ids, 'linkedProducts_permutations': userPermutations}
        db.write_newDbItem(recommendation_collectionName, new_dbItem)

    print(f"\nFinished, linked products added to {len(db_recommendations_byProduct)} products.\n")
   

def write_recommendations_to_mongodb_legacy(recommendations_byProduct, sku_mapping, verbose = False):
    """[LEGACY] Write the linked products and one bundle explanation directly
    onto each product document.

    Parameters
    ----------
    recommendations_byProduct : dict
        Reference product name -> list of recommendation dicts, each with
        'item_sku', 'explaination' and optionally 'similar_products'.
    sku_mapping : dict
        Internal sku/name -> INDACO sku stored in the database.
    verbose : bool
        Print each written item (reference, linked skus, explanation, ids).
    """

    # Re-key by INDACO sku and collect the linked products + explanations
    db_recommendations_byProduct = dict()
    for reference_name, recommendations in recommendations_byProduct.items():
        indaco_sku = sku_mapping[reference_name]
        
        db_recommendations_byProduct[indaco_sku] = defaultdict(list)
        for recommendation in recommendations:
            recom_indaco_sku = sku_mapping[recommendation['item_sku']]
            
            # Retrieve the similar products (optional field)
            if "similar_products" in recommendation.keys():
                similar_products = [sku_mapping[sku_product] for sku_product in recommendation["similar_products"].keys()]
            else:
                similar_products = []
            
            enveloped_recom = {'sku': recom_indaco_sku,  'similar_products': similar_products}
            
            # Append the recommendations
            db_recommendations_byProduct[indaco_sku]['linked_products'].append(enveloped_recom)
            db_recommendations_byProduct[indaco_sku]['explainations'].append(recommendation['explaination'])
        
        # Reduce the per-recommendation explanations to one bundle explanation
        bundle_explainations = np.array(db_recommendations_byProduct[indaco_sku].pop('explainations'))
        personalized_explaination = find_bestExplaination(bundle_explainations)
        db_recommendations_byProduct[indaco_sku]['explaination'] = personalized_explaination

    # Connect to the db
    db = DbService("mongodb")
    
    # Write to the mongoDB
    print("[LEGACY APPROACH] Writing linked products...\n")
    for reference_sku, recommendations in db_recommendations_byProduct.items():
        
        linked_products = recommendations['linked_products']
        explaination = recommendations['explaination']

        # Resolve the database ids of the linked products and their similar
        # products; skus missing from the database are reported and skipped
        linked_ids = []
        for idk, linked_product in enumerate(linked_products):
            
            item_sku = linked_product['sku']
            try: 
                product_indacoId = db.get_dBproduct(item_sku, as_dict = True)['_id']
            except Exception:
                print(f"{item_sku} not found in the database!")
                continue
            
            # Similar products are best-effort: on any failure keep an empty list
            try:
                similar_product_ids = [db.get_dBproduct(item_sku, as_dict = True)['_id'] 
                                       for item_sku in linked_product['similar_products']]
            except Exception:
                similar_product_ids = []
                print(f"[SIMILAR PRODUCTS] {item_sku} not found in the database!")
                print("-->", product_indacoId)
            
            db_product = {'rank': idk + 1, 'productId': product_indacoId, 'similarProducts': similar_product_ids}
            linked_ids.append(db_product)
            
        # Save the linked products (IDs)
        db.update_attributeDbItem(object = reference_sku, attribute_name = "linkedProducts", 
                                  attribute_value = linked_ids)
        
        # Save the explaination
        db.update_attributeDbItem(object = reference_sku, attribute_name = "recomExplanation", 
                                  attribute_value = explaination) 
        if verbose:
            print("\nREFERENCE:", reference_sku)
            # BUGFIX: print each entry's OWN similar products — the old code
            # read the stale loop variable `linked_product` (last item of the
            # preceding loop) for every line
            print(f"--> LINKED SKUs ({len(linked_products)}):-->"
                  , '\n--> '.join([item['sku'] + ': ' + ','.join(item['similar_products'])  
                                  for item in linked_products]))
            print("\n--> EXPLAINATION:", explaination)
            print("\n--> Linked ids:", linked_ids)
            
    if not verbose: 
        print(f"Finished, linked products added to {len(db_recommendations_byProduct)} products.")