import json
from collections import defaultdict
from os import path,walk

import numpy as np
import pandas as pd
from tabulate import tabulate

from _library.data_utils import mongodb_utils, remoteConnection_utils
from _library.toolkit import preProcessing_collectionColumns
from components.DbService import DbService


def load_orders(order_file_name = 'OLDinTrentino_Orders_Set20_Giu22_enhanced_anonymized.xlsx'):
    orders_df = remoteConnection_utils.read_remote_dataframe(file_name = order_file_name)
    return orders_df

def load_inTrentino_products_OLD(excluded_products):
    product_identifier = "Title"
    
    # Location
    folder_path = path.join('indaco', 'inTrentino', 'data', 'ontology')
    file_name = 'prodotti_inTrentino.xlsx'
    
    # Load data
    inTrentino_products = remoteConnection_utils.read_generic_remote_file(folder_path, file_name)
    
    # Drop excluded products (case-insensitive match on the product identifier)
    for item in excluded_products:
        cond = inTrentino_products[product_identifier].str.lower() == item.lower()
        obs_to_exclude = inTrentino_products.loc[cond, product_identifier]

        if len(obs_to_exclude) > 0:
            print(f"EXCLUDED: {' | '.join(obs_to_exclude.unique())}")
            inTrentino_products.drop(index = obs_to_exclude.index, inplace = True)
    print("-" * 60, "\n")
    
    if product_identifier in inTrentino_products.columns:
        # Sort products
        inTrentino_products = inTrentino_products.sort_values(by = product_identifier)
        
        # Retrieve names
        product_names = inTrentino_products[product_identifier].tolist()
    else:
        product_names = []
    
    return inTrentino_products, product_names
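
# A minimal usage sketch for load_inTrentino_products_OLD (hypothetical product
# name; assumes the remote connection is already configured):
#
#     products_df, product_names = load_inTrentino_products_OLD(
#         excluded_products = ['Prodotto di test'])   # matching is case-insensitive
#     print(len(product_names), "products loaded")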

def get_localFolderFiles(prefix = None):
    """Map file names to their full paths under the local '_tmp' folder,
    optionally keeping only files whose name starts with `prefix`."""
    file_names = dict()
    
    # Build the base path
    dirname = path.dirname(__file__)
    files_path = path.join(dirname, '../../../recomsys-datapreparation-batch/_tmp')
    for (dir_path, dir_names, fileNames) in walk(files_path):
        for file_name in fileNames:
            print(file_name)
            if prefix is not None:
                if file_name.startswith(prefix):
                    file_names[file_name] = path.join(dir_path, file_name)
            else:
                file_names[file_name] = path.join(dir_path, file_name)

    print(f"Files ({len(file_names)}):")
    print('-' * 40, "\n")

    return file_names
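
# A minimal usage sketch (hypothetical prefix; assumes the sibling
# 'recomsys-datapreparation-batch/_tmp' folder exists next to this repo):
#
#     local_files = get_localFolderFiles(prefix = 'coPurchases_by')
#     for name, full_path in local_files.items():
#         print(name, '->', full_path)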

def read_generic_local_file(fileName):
    """Load a JSON or Excel file from the local '_tmp' folder; other
    extensions are not supported and return None."""
    file_extension = fileName.split(".")[-1]
    dirname = path.dirname(__file__)
    files_path = path.join(dirname, '../../../recomsys-datapreparation-batch/_tmp')
    with open(path.join(files_path, fileName), "rb") as file:
        if 'json' in file_extension:
            print('--> JSON file')
            loaded_file = json.load(file)
        elif 'xls' in file_extension:
            print('--> Excel file')
            loaded_file = pd.read_excel(file, sheet_name = None)
                        
            if len(loaded_file.keys()) > 1:
                print(f'--> Multiple sheets ({len(loaded_file.keys())}):', ' | '.join(loaded_file.keys()))
            else:
                sheet_name = list(loaded_file.keys())[0]
                loaded_file = loaded_file[sheet_name]
                print("--> Single sheet:", sheet_name)
        else:
            print(file_extension, "is not supported!")
            return None
                    
        print(f"--> OK: The file ({fileName}) has been loaded correctly.\n")
        return loaded_file
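
# A minimal usage sketch (hypothetical file names; the files must exist in the
# local '_tmp' folder):
#
#     config = read_generic_local_file('settings.json')            # -> dict
#     sheets = read_generic_local_file('coPurchases_byTitle.xlsx')
#     # a multi-sheet workbook comes back as {sheet_name: DataFrame};
#     # a single-sheet workbook comes back as a plain DataFrame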

def load_coPurchases(version_name = '', product_identifier = 'Title', load_remotely = True):
    
    # Folder path
    if load_remotely:
        # Remote folder path
        folder_path = path.join('indaco', 'inTrentino', 'data', 'co-purchases')
        print("FOLDER_PATH", folder_path)

    # Build up the file name
    file_name = "coPurchases_by" + product_identifier.replace(' ', '')
    if version_name:
        file_name = version_name + '_' + file_name

    # Find the actual names
    if load_remotely:
        actual_fileNames = remoteConnection_utils.get_folderFiles(folder_path, prefix = file_name)
    else:
        print("Upload local")
        actual_fileNames = get_localFolderFiles(prefix = file_name)

    if len(actual_fileNames) == 0:
        raise FileNotFoundError(f"ISSUE: File not found! {file_name}...")
    elif len(actual_fileNames) == 1:
        actual_fileName = list(actual_fileNames.keys())[0]
    else:
        # Multiple candidates: keep the one whose full path sorts highest
        sorted_fileNames = list(dict(sorted(actual_fileNames.items(), key = lambda item: item[1], reverse = True)).keys())
        actual_fileName = sorted_fileNames[0]
    
    if load_remotely:
        # Read the multiple sheets
        sheets = remoteConnection_utils.read_generic_remote_file(folder_path = folder_path, file_name = actual_fileName)
    else:
        sheets = read_generic_local_file(actual_fileName)

    # Select only the relevant sheets
    association_rules_df = sheets['Association rules']
    enhanced_association_rules_df = sheets['Ass. rules with collections']
    
    return association_rules_df, enhanced_association_rules_df
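
# A minimal usage sketch (hypothetical version name; assumes the workbook
# exposes the 'Association rules' and 'Ass. rules with collections' sheets):
#
#     rules_df, enhanced_rules_df = load_coPurchases(
#         version_name = 'Set20_Giu22', product_identifier = 'Title',
#         load_remotely = True)
#     print(rules_df.head())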

def generate_platformData(orders_df, col_names, inTrentino_flag = True, excluded_products = []):
       
    transaction_identifier = "Transaction id"
    product_identifier = "Title"

    # Actual col names to select
    actual_col_names = []
    for col_name in col_names:
        if col_name in list(orders_df.columns):
            actual_col_names.append(col_name)
        else:
            print("-" * 40)
            print(f'COLUMN "{col_name}" not found!')
            print("-" * 40)
    orders_df = orders_df.sort_values(by = transaction_identifier, ascending = False)
    
    # Normalize strings
    normalize_string = lambda string_name: string_name.capitalize() if isinstance(string_name, str) else string_name
    simplified_orders = orders_df[actual_col_names].applymap(normalize_string)
    
    # Retrieve the unique products
    platform_products = simplified_orders.drop_duplicates().reset_index(drop = True)

    # Compute their frequencies
    num_baskets = len(orders_df[transaction_identifier].unique())
    item_frequencies = simplified_orders[product_identifier].value_counts() / num_baskets
    item_frequencies = item_frequencies.rename('Frequency').round(decimals = 4).to_frame()
    
    # Add the inTrentino flag
    platform_products['inTrentino_source'] = inTrentino_flag

    # Merge the two views
    platform_products = platform_products.merge(item_frequencies, 
                                                left_on = product_identifier, 
                                                right_index = True)
    platform_products = platform_products.sort_values(by = product_identifier, ascending = True)
    platform_products = platform_products.reset_index(drop = True)
    
    # Drop excluded products (case-insensitive match on the product identifier)
    for item in excluded_products:
        cond = platform_products[product_identifier].str.lower() == item.lower()
        obs_to_exclude = platform_products.loc[cond, product_identifier]

        if len(obs_to_exclude) > 0:
            print("-" * 40)
            print(f"EXCLUDED: {' | '.join(obs_to_exclude.unique())}")
            platform_products.drop(index = obs_to_exclude.index, inplace = True)
    print("-" * 40, "\n")
    
    # Delete duplicate items and artefacts
    if 'SKU' in platform_products.columns:
        platform_products = platform_products.dropna(axis = 0, subset = ['SKU'])
        platform_products.reset_index(drop = True, inplace = True)
    
    # Fill the empty link columns with empty lists
    linked_cols = [col for col in platform_products.columns if 'linked' in col.lower()]
    platform_products = preProcessing_collectionColumns(platform_products, linked_cols)

    return platform_products
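
# A minimal usage sketch with toy data (hypothetical values; real orders come
# from load_orders(), and preProcessing_collectionColumns is assumed to be a
# no-op for an empty column list):
#
#     toy_orders = pd.DataFrame({
#         'Transaction id': [1, 1, 2],
#         'Title': ['mela', 'pera', 'mela'],
#         'SKU': ['A1', 'B2', 'A1']})
#     products = generate_platformData(
#         toy_orders, col_names = ['Title', 'SKU'], excluded_products = [])
#     # 'Mela' appears in 2 of 2 baskets -> Frequency 1.0; 'Pera' in 1 of 2 -> 0.5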

def load_groupedCollectionNames():
    
    # Load the collection names
    grouped_collections = remoteConnection_utils.read_generic_remote_file(
        folder_path = path.join('indaco', 'inTrentino', 'data', 'ontology'), 
        file_name = "ProductCollections.json")
    
    # Visualize the collection groups
    print(f'Collection groups ({len(grouped_collections.keys())}):', 
          ', '.join(grouped_collections.keys()))
    
    return grouped_collections

def load_collectionTypes(filepath, verbose = True):
    rawCollectionTypes = dict()
    
    with open(filepath) as json_file:
        rawCollectionTypes = json.load(json_file)
    
    # Keep and extract the collection names
    collectionTypes = defaultdict(list)
    for colType, collections in rawCollectionTypes.items(): 
        
        if verbose:
            print("\n" + "-" * 48)
            print("-" * 15, colType.upper(), "-" * 15)
            print("-" * 48)
            
        for collection_name in collections.values():
            collectionTypes[colType].append(collection_name)
            
            if verbose:
                print('-->', collection_name)
                
        if verbose:
            print("-" * 48)
            print("-" * 48 + "\n")
            
    return collectionTypes, rawCollectionTypes
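
# A minimal usage sketch (hypothetical file content; the JSON is expected to
# map each collection type to a {key: collection_name} mapping):
#
#     # collection_types.json:
#     # {"territory": {"c1": "Val di Non", "c2": "Valsugana"},
#     #  "product":   {"c3": "Mele"}}
#     collectionTypes, raw = load_collectionTypes('collection_types.json', verbose = False)
#     # collectionTypes -> {'territory': ['Val di Non', 'Valsugana'], 'product': ['Mele']}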


def load_indacoData(db_type, visualize_aggregated_territories = True):

    # -------- Connecting to the database ---------------
    db = DbService(db_type)
    
    # Get the categories
    categories = db.get_productTypes()
    
    # Get the sellers
    sellers = db.get_sellers()
    
    # Get the warehouse
    warehouses = db.get_warehouses()
    
    # Get orders 
    orders = db.get_orders() 
     
    # Get the products
    indacoProducts_df = db.get_products(consider_delatedProducts = False)
    # Keep only the products of the target channel (hard-coded channel id)
    indacoProducts_df = indacoProducts_df[indacoProducts_df['channel'] == "62ed13d02477d328814c66ed"]
    # --------------------------------------------
    
    # Enhance product df
    if db_type == "mongodb":
        indacoProducts_df = mongodb_utils.enhanced_products_df(indacoProducts_df, categories, sellers, warehouses, orders)
    else:
        indacoProducts_df = db.enhanced_products_df(indacoProducts_df, categories, sellers, warehouses, orders)
    
    # Get user profiles    
    userProfiles = db.get_customerProfiles()
    
    # Visualize production_areas
    if visualize_aggregated_territories:
        aggregatedProductionAreas = indacoProducts_df[indacoProducts_df['production_areas'] != ''].copy()
        aggregatedProductionAreas.columns = list(map(str.upper,aggregatedProductionAreas.columns))
        aggregatedProductionAreas = aggregatedProductionAreas.groupby(
            by = ['PRODUCTION_AREAS','VENDOR','PRODUCT TYPE'], 
            as_index = False).count()
        aggregatedProductionAreas = aggregatedProductionAreas[['PRODUCTION_AREAS','VENDOR','PRODUCT TYPE', 'FREQUENCY']]
        aggregatedProductionAreas.rename(columns = {'FREQUENCY': 'ITEMS'},inplace = True)
        aggregatedProductionAreas = aggregatedProductionAreas.sort_values(
            by = ['PRODUCTION_AREAS','VENDOR','PRODUCT TYPE'],
            ascending = True) 
        
        print("\n" + 70 * "-")
        print("-" * 20, f'Aggregated production areas', "-" * 21)
        print(70 * "-", "\n")
        
        print(61 * "-")
        print(61 * "-")
        print(tabulate(aggregatedProductionAreas, headers = aggregatedProductionAreas.columns, tablefmt = 'pretty'))
        print(61 * "-")
        print(61 * "-", "\n")
        
        total_items = aggregatedProductionAreas['ITEMS'].sum()
        print('\t\t' + 30 * "-")
        print('\t\t' + 30 * "-")
        print(f"\t\tTOTAL ITEMS: {total_items}/{len(indacoProducts_df)} "\
            f"({np.round((total_items/len(indacoProducts_df))*100, 2)} %)")
        print('\t\t' + 30 * "-")
        print('\t\t' + 30 * "-", "\n")
        
        withoutArea = indacoProducts_df.loc[indacoProducts_df['production_areas'] == '', 'Title'].unique()
        print(f"ITEMS WITHOUT A PRODUCTION AREA ({len(withoutArea)}):", "\n" + "-" * 50)
        print('\n'.join(withoutArea))
    
    # Add the category for each product in the orders
    findProdType = lambda df, indaco_sku: df.loc[df['indaco_sku'].str.upper() == indaco_sku.upper(), 'Product Type']
    orders.insert(loc = 2, column = 'Product Type', value = orders['sku'].apply(
        lambda sku: findProdType(indacoProducts_df, sku).unique()))
        
    # Deal with ordered products that no longer exist
    orders['Product Type'] = orders['Product Type'].apply(lambda category: category[0] if len(category) != 0 else np.nan)
    orders = orders.dropna(subset = ['Product Type'])
    
    return db, indacoProducts_df, orders, categories, userProfiles
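
# A minimal end-to-end usage sketch (assumes a configured DbService backend;
# "mongodb" is the only db_type handled specially above):
#
#     db, products_df, orders, categories, userProfiles = load_indacoData(
#         db_type = "mongodb", visualize_aggregated_territories = False)
#     print(len(products_df), "products |", len(orders), "order lines")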