# NOTE(review): the lines below are residue from a web file-manager dump and are
# not Python code; they are commented out so the module can actually be imported.
# Your IP : 216.73.217.13
# Current Path : /home/deltalab/PMS/recommendations/user_profiling/_library/
# Upload File :
# Current File : //home/deltalab/PMS/recommendations/user_profiling/_library/mongodb_utils.py

import numpy as np
import pandas as pd
import pymongo
from os import path
from _library.io_toolkit import load_collectionTypes

def connect_to_mongodb(hostname, port, user, password, db_name, verbose = False):
    """Authenticate against a MongoDB server and return the 'pms' database handle.

    Parameters:
        hostname, port: server address.
        user, password: credentials for SCRAM-SHA-256 authentication.
        db_name: database used as the authentication source only.
        verbose: when True, print the collection names of the returned database.

    NOTE(review): `db_name` is only used as `authSource`; the returned handle is
    always the 'pms' database — confirm this asymmetry is intended.
    """
    separator = 120 * "-"
    print("\n" + separator)
    print("-" * 42, f"Connecting to MongoDB ({hostname})", "-" * 41)
    print(separator, "\n")

    # Open the client connection with SCRAM-SHA-256 authentication
    client = pymongo.MongoClient(
        hostname,
        port,
        username=user,
        password=password,
        authSource=db_name,
        authMechanism='SCRAM-SHA-256',
    )

    # Grab the application database
    pms_database = client['pms']

    if verbose:
        names = pms_database.list_collection_names()
        print(f"COLLECTIONS ({len(names)}):", ', '.join(names), "\n")

    return pms_database

def extractCategories(category_collection, language = 'it-IT'):
    """Flatten the raw category collection into a DataFrame.

    For every category document the label matching `language` is selected,
    the parent id is resolved to the parent's localized name, and a missing
    'googleId' is encoded as -1.

    Returns a DataFrame with columns
    ['categoryName', 'googleId', 'isLeaf', '_id', 'parent'].
    """
    category_info = dict()
    id_to_name = dict()

    for document in category_collection:
        # Pick the label localized for the requested language
        # (an IndexError is raised when no label matches the language code)
        localized_labels = [entry['label'] for entry in document['name']
                            if entry['code'] == language]
        category_name = localized_labels[0]

        id_to_name[document['_id']] = category_name

        raw_google_id = document['googleId']
        category_info[category_name] = {
            'googleId': int(raw_google_id) if raw_google_id is not None else -1,
            'isLeaf': document['isLeaf'],
            '_id': document['_id'],
            'parentId': document['parentId'],
        }

    # Replace the parent id with the parent's category name (None for roots)
    for metadata in category_info.values():
        parent_id = metadata.pop('parentId')
        metadata['parent'] = id_to_name[parent_id] if parent_id else None

    frame = pd.DataFrame.from_dict(category_info, orient = 'index')
    frame = frame.reset_index().rename(columns = {'index': 'categoryName'})

    return frame

def _first_attribute_value(attributes, attribute_name, default=0):
    """Return the value of the first attribute entry named `attribute_name`.

    `attributes` is the raw MongoDB attribute list: each entry is a dict with
    an 'attribute' sub-document carrying the 'name' and a top-level 'value'.
    When no entry matches, `default` is returned. A matching entry whose
    'value' is None is returned as None — callers decide how to map it.
    """
    for att in attributes:
        if att['attribute']['name'] == attribute_name:
            return att['value']
    return default


def extractProducts(product_collection, consider_delatedProducts = False, consider_unavailableProducts = False, verbose = False):
    """Turn the raw MongoDB product collection into a flat DataFrame.

    Parameters
    ----------
    product_collection : iterable of dict
        Documents from the products collection (e.g. a pymongo cursor/list).
    consider_delatedProducts : bool
        Keep products flagged 'deleted' when True. (Parameter name kept
        as-is, including the historical typo, for backward compatibility.)
    consider_unavailableProducts : bool
        Keep products with 'sellBelowZero' == False when True.
    verbose : bool
        Print the resulting column names.

    Returns
    -------
    pd.DataFrame restricted to the columns listed in `selected_columns`.

    Fix over previous revision: the five identical attribute-extraction
    apply/apply pairs (biologic, vegan, biodinamic, gluten_free, isforadult)
    are replaced by one helper and a data-driven loop; behavior is unchanged.
    """
    print("\n" + 90 * "-")
    print("-" * 29, "Extracting the INDACO products", "-" * 29)
    print(90 * "-", "\n")

    # Get all products
    all_products = pd.DataFrame(product_collection)

    # Improve readability of some attributes
    all_products['title'] = all_products['title'].str.strip()
    all_products['brand'] = all_products['brand'].str.strip()
    all_products.rename(columns = {'weight': 'weight [grams]'}, inplace=True)

    # [FLAG: deleted] Optionally drop products that have been deleted
    if not consider_delatedProducts:
        delated_products = all_products.loc[all_products['deleted'] == True, 'title']
        all_products = all_products.drop(index = delated_products.index)

        print(f'\nDelated products ({len(delated_products)}):', ', '.join(delated_products.values))

    # [FLAG: sellBelowZero] Optionally drop products that are not available
    if not consider_unavailableProducts:
        unavailable_products = all_products.loc[all_products['sellBelowZero'] == False, 'title']
        all_products = all_products.drop(index = unavailable_products.index).reset_index(drop = True)
        print(f'Unavailable products ({len(unavailable_products)}):', ', '.join(unavailable_products.values), '\n')

    if verbose:
        print(f"ATTRIBUTES ({len(all_products.columns)}):")
        print('-->', '\n--> '.join(all_products.columns), "\n")

    # [ATTRIBUTES] Production area: a missing attribute or a None value is
    # normalised to the sentinel -1 (rendered as "" further below)
    all_products['production_areas'] = all_products['attributes'].apply(
        lambda attributes: _first_attribute_value(attributes, "indaco_general_productionarea", default=None))
    all_products['production_areas'] = all_products['production_areas'].apply(
        lambda value: -1 if value is None else value)

    # [ATTRIBUTES] Boolean-like product flags, all extracted the same way:
    # the attribute value when present, 0 otherwise
    flag_attributes = {
        'biologic': "indaco_generalfood_biological",
        'vegan': "indaco_generalfood_vegan",
        'biodinamic': "indaco_generalfood_biodinamic",
        'gluten_free': "indaco_generalfood_glutenfree",
        'isforadult': "indaco_general_isforadult",
    }
    for column, attribute_name in flag_attributes.items():
        # `name=attribute_name` binds the loop variable eagerly, avoiding the
        # late-binding closure pitfall
        all_products[column] = all_products['attributes'].apply(
            lambda attributes, name=attribute_name: _first_attribute_value(attributes, name))

    # [Warehouse] Collect the warehouse ids a product is stocked in
    all_products['warehouse_id'] = all_products['inventoryLevels'].apply(
        lambda items: [item['warehouseId'] for item in items if 'warehouseId' in item])

    # Convert the production area codes into names (mapping loaded from a
    # project-local JSON file via the _library helper)
    file_path = path.join('_library', 'INDACO_collectionCodes.json')
    _, collectionTypes = load_collectionTypes(file_path, verbose = False)
    productionArea_codes = collectionTypes['production_areas']

    # Known code -> capitalized name; -1 (unset) -> ""; anything else -> "Unknown (...)"
    all_products['production_areas'] = all_products['production_areas'].apply(
        lambda area_code:
            productionArea_codes[str(area_code)].capitalize()
            if str(area_code) in productionArea_codes else f"Unknown (code:{area_code})"
            if area_code != -1 else ""
    )

    # Filter and keep only the columns of interest
    selected_columns = ['title', 'sku', 'brand', 'refrigerated', 'weight [grams]',
                        '_id', 'categoryId', 'partnerId', 'warehouse_id', 'production_areas', 'biologic', 'vegan',
                        'biodinamic', 'gluten_free', 'isforadult']

    return all_products[selected_columns]

def enhanced_products_df(products, categories, sellers, warehouses, orders):
    """Merge the product catalogue with sellers, categories, warehouses and
    order history into one enriched, renamed and reordered DataFrame.

    Parameters (all pandas DataFrames):
        products   -- product table with 'partnerId', 'categoryId', 'title',
                      'sku', 'brand', 'warehouse_id', '_id', ... columns
        categories -- category table with '_id', 'categoryName', 'googleId',
                      'isLeaf' columns
        sellers    -- must expose '_id' (join key); presumably also
                      'companyName' (renamed to 'Seller' below) -- TODO confirm
        warehouses -- must expose '_id' and 'name'
        orders     -- must expose 'Transaction id' and 'sku'

    Returns a DataFrame restricted to `relevant_columns`.
    """
    # Inner join on seller id: products with no matching seller are dropped
    # here and are NOT reported below (the length check compares against the
    # post-merge `products`). Produces '_id_x' (product) / '_id_y' (seller).
    products = products.merge(sellers, left_on = "partnerId", right_on = "_id")
    # Left join keeps products even when 'categoryId' has no match; the
    # categories' own '_id' column is added unsuffixed here
    enhanced_df = products.merge(categories, how = 'left', left_on = "categoryId", right_on = "_id")
    
    # Retrieve the warehouse names
    warehouse_names = {item['_id']: item['name'] for item in warehouses.to_dict(orient = 'records')}
    # NOTE: the lambda parameter shadows the `warehouses` DataFrame argument
    enhanced_df['warehouses'] = enhanced_df['warehouse_id'].apply(
        lambda warehouses: [warehouse_names[warehouse_id] for warehouse_id in warehouses])
    
    # Fill the categories 
    # 1) Manual mapping: hard-coded fallback categories for known products
    #    whose 'categoryId' is missing (keys are lower-cased titles)
    manual_catMapping = {
        'affogato di sabbionara': 'Formaggio',
        'casolét val di sole': 'Formaggio',
        'cuor di fassa': 'Formaggio',
        'fontal di cavalese': 'Formaggio',
        'mezzano trentino di alta montagna': 'Formaggio',
        'primiero fresco': 'Formaggio',
        'puzzone di moena dop': 'Formaggio',
        'trentingrana 1kg': 'Formaggio',
        'trentingrana 250gr': 'Formaggio'
    }
    nanCond = enhanced_df['categoryName'].isnull()
    enhanced_df.loc[nanCond, 'categoryName'] = enhanced_df.loc[nanCond, 'title'].str.lower().apply(
        lambda product_name: manual_catMapping[product_name] if product_name in manual_catMapping.keys() else np.nan)
    
    # 2) Fill with the default name (empty string) for anything still unmapped
    emptyCategory_value = ''
    enhanced_df['categoryName'] = enhanced_df['categoryName'].fillna(value = emptyCategory_value)

    # 3) Fill the google id with the sentinel -1 when missing
    enhanced_df['googleId'] = enhanced_df['googleId'].fillna(value = -1)
    
    # Select only useful columns
    # NOTE(review): '_id_x'/'_id_y' come from the products/sellers merge; the
    # categories' plain '_id' column survives, so after this rename there may
    # be two columns labelled '_id' (both renamed to 'productId' below).
    # Harmless for the final column selection, but worth confirming.
    enhanced_df.drop(columns = ['_id_y', 'isLeaf'], inplace = True)
    enhanced_df.rename(columns = {'_id_x': '_id'}, inplace = True)
    
    # Visualize potential missing products (rows lost in the category merge)
    if len(enhanced_df) != len(products): 
        discarted_products = np.setdiff1d(products["title"].to_numpy(),
                                          enhanced_df["title"].to_numpy())
        
        print(f'{len(discarted_products)} products have been discarted due to an issue with the "categoryId"')
        print("-" * 75)
        print("-->", "\n--> ".join(sorted(discarted_products)))
        
    # Warn about products that ended up with the default (empty) category
    item_w_unknownCategory = sorted(enhanced_df.loc[enhanced_df['categoryName'] == emptyCategory_value, 'title'].tolist())
    if len(item_w_unknownCategory) > 0:
        print("-" * 70, "\n" + "-" * 70)
        print(f'[WARNING] The attribute "categoryId" has not been set to {len(item_w_unknownCategory)} products. \n'\
            f'It has been filled with a category equal to "{emptyCategory_value}"')
        print("-" * 70, "\n" + "-" * 70)
        print("-->", "\n--> ".join(['(' + str(idk + 1) + ') '+ item 
                                           for idk, item in enumerate(item_w_unknownCategory)]))
    
    # Mapping the column names to their presentation/export labels
    enhanced_df.rename(
        columns = {
            'title': 'Title',
            'sku': 'indaco_sku',
            'brand' : 'Vendor',
            'categoryName': 'Product Type',
            'googleId': 'Type id',
            'companyName': 'Seller',
            '_id': 'productId'}, 
        inplace = True)
    
    # [NEW ATTRIBUTE] inTrentino flag: True only for the "Vendi24" seller
    enhanced_df['inTrentino_source'] = enhanced_df['Seller'].apply(lambda name: True if name  == "Vendi24" else False)
    
    # [NEW ATTRIBUTE] Item frequency within transactions/orders: share of
    # distinct transactions containing the sku, rounded to 4 decimals
    num_transactions = len(orders['Transaction id'].unique())
    normalizedFreq_func =  lambda sku: len(orders.loc[orders['sku'] == sku, 'Transaction id'].unique()) / num_transactions
    enhanced_df['Frequency'] = enhanced_df['indaco_sku'].apply(lambda sku: np.round(normalizedFreq_func(sku), 4))
   
    # Minor normalization
    #enhanced_df['indaco_sku'] = enhanced_df['indaco_sku'].str.capitalize()
    enhanced_df['productId'] = enhanced_df['productId'].astype(str) 
    enhanced_df['Vendor'] = enhanced_df['Vendor'].fillna(value = "")
    
    # Reorder columns 
    relevant_columns = ['Vendor', 'Title', 'Product Type', 'Type id', 'refrigerated', 'weight [grams]', 'warehouses', 
                        'Seller', 'production_areas', 'inTrentino_source', 'Frequency', 'indaco_sku','biologic',
                        'isforadult','vegan','biodinamic','gluten_free'] #,'productId'
    enhanced_df = enhanced_df[relevant_columns]

    return enhanced_df

def simplified_SKUs(indaco_products, product_identifier):
    """Add a shortened 'SKU' column derived from 'indaco_sku'.

    The short SKU is the first dash-separated token, upper-cased; when two or
    more products collapse onto the same short SKU, the first two tokens are
    concatenated instead. Mutates `indaco_products` in place.

    Returns (indaco_products, product_names, sku_mapping) where `sku_mapping`
    maps each short SKU back to its original 'indaco_sku'.
    """
    separator_line = 120 * "-"
    print("\n" + separator_line)
    print("-" * 39, "Simplify the SKU", "-" * 38)
    print(separator_line, "\n")

    def _shorten(full_sku, token_count=1):
        # Join the first `token_count` dash-separated tokens, upper-cased
        return ''.join(full_sku.split('-')[:token_count]).upper()

    # First pass: one token per SKU
    indaco_products['SKU'] = indaco_products['indaco_sku'].apply(_shorten)

    # Disambiguate clashing short SKUs by including the second token as well
    clashing_rows = indaco_products.index[indaco_products['SKU'].duplicated(keep = False)]
    indaco_products.loc[clashing_rows, 'SKU'] = indaco_products.loc[clashing_rows, 'indaco_sku'].apply(
        lambda full_sku: _shorten(full_sku, token_count = 2))

    # Product names, in row order
    product_names = list(indaco_products[product_identifier])

    # Short SKU -> original SKU lookup
    sku_mapping = dict(zip(indaco_products['SKU'], indaco_products['indaco_sku']))

    return indaco_products, product_names, sku_mapping