# /home/deltalab/PMS/recommendations/recommender-system-batch/_library/data_utils/data_loader.py
import json
from collections import defaultdict
from os import path, walk
import numpy as np
import pandas as pd
from tabulate import tabulate
from _library.data_utils import mongodb_utils, remoteConnection_utils
from _library.toolkit import preProcessing_collectionColumns
from components.DbService import DbService
def load_orders(order_file_name = 'OLDinTrentino_Orders_Set20_Giu22_enhanced_anonymized.xlsx'):
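    """Load the historical orders dataframe from the remote file share."""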
orders_df = remoteConnection_utils.read_remote_dataframe(file_name = order_file_name)
return orders_df
def load_inTrentino_products_OLD(excluded_products):
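    """Load the legacy inTrentino product catalogue from the remote ontology
    folder, dropping every product listed in `excluded_products` (matched
    case-insensitively on the "Title" column). Returns the filtered
    dataframe together with the sorted list of product names."""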
product_identifier = "Title"
# Location
folder_path = path.join('indaco', 'inTrentino', 'data', 'ontology')
file_name = 'prodotti_inTrentino.xlsx'
# Load data
inTrentino_products = remoteConnection_utils.read_generic_remote_file(folder_path, file_name)
# Drop excluded products
for item in excluded_products:
cond = inTrentino_products[product_identifier].str.lower() == item.lower()
obs_to_exclude = inTrentino_products.loc[cond, product_identifier]
if len(obs_to_exclude) > 0:
print(f"EXCLUDED: {' | '.join(obs_to_exclude.unique())}")
inTrentino_products.drop(index = obs_to_exclude.index, inplace = True)
print("-" * 60, "\n")
if product_identifier in inTrentino_products.columns:
# Sort products
inTrentino_products = inTrentino_products.sort_values(by = product_identifier)
# Retrieve names
product_names = inTrentino_products[product_identifier].tolist()
else:
product_names = []
return inTrentino_products, product_names
def get_localFolderFiles(prefix = None):
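    """Index the files found in the local `_tmp` staging folder of
    recomsys-datapreparation-batch, mapping each file name to its full path.
    When `prefix` is given, only files whose name starts with it are kept."""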
file_names = dict()
# Build the basic path
dirname = path.dirname(__file__)
files_path = path.join(dirname, '../../../recomsys-datapreparation-batch/_tmp')
    for (dir_path, dir_names, fileNames) in walk(files_path):
        for file_name in fileNames:
            print(file_name)
            if prefix is None or file_name.startswith(prefix):
                file_names[file_name] = path.join(dir_path, file_name)
    print(f"Files ({len(file_names)}):")
    print('-' * 40, "\n")
return file_names
def read_generic_local_file(fileName):
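    """Read a JSON or Excel file from the local `_tmp` staging folder.

    Excel files are loaded with all of their sheets; when a single sheet is
    present, its dataframe is returned directly. Unsupported extensions
    yield None."""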
file_extension = fileName.split(".")[-1]
dirname = path.dirname(__file__)
files_path = path.join(dirname, '../../../recomsys-datapreparation-batch/_tmp')
    with open(path.join(files_path, fileName), "rb") as file:
if 'json' in file_extension:
print('--> JSON file')
loaded_file = json.load(file)
elif 'xls' in file_extension:
print('--> Excel file')
loaded_file = pd.read_excel(file, sheet_name = None)
            if len(loaded_file) > 1:
                print(f'--> Multiple sheets ({len(loaded_file)}):', ' | '.join(loaded_file.keys()))
            else:
                sheet_name = list(loaded_file.keys())[0]
                loaded_file = loaded_file[sheet_name]
                print("--> Single sheet:", sheet_name)
else:
print(file_extension, "is not supported!")
return None
print(f"--> OK: The file ({fileName}) has been loaded correctly.\n")
return loaded_file
def load_coPurchases(version_name = '', product_identifier = 'Title', load_remotely = True):
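    """Load the co-purchase association rules, either from the remote
    'co-purchases' folder or from the local staging folder. When several
    versions match, the file whose path sorts last is used (assumed to be
    the most recent one). Returns the 'Association rules' and the
    'Ass. rules with collections' sheets as dataframes."""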
# Folder path
    if load_remotely:
        # Remote folder path
        folder_path = path.join('indaco', 'inTrentino', 'data', 'co-purchases')
        print("FOLDER_PATH", folder_path)
    # Build up the file name prefix, e.g. "coPurchases_byTitle" for files like "coPurchases_byTitle_Set20_Giu22.xlsx"
    file_name = "coPurchases_by" + product_identifier.replace(' ', '')
    if version_name:
        file_name = version_name + '_' + file_name
    # Find the actual file names
    if load_remotely:
        actual_fileNames = remoteConnection_utils.get_folderFiles(folder_path, prefix = file_name)
    else:
        print("Loading from the local folder")
        actual_fileNames = get_localFolderFiles(prefix = file_name)
    if len(actual_fileNames) == 0:
        raise FileNotFoundError(f"ISSUE: File not found! {file_name}...")
    elif len(actual_fileNames) == 1:
        actual_fileName = list(actual_fileNames.keys())[0]
    else:
        # Several versions match: take the file whose path sorts last (assumed to be the most recent)
        sorted_fileNames = sorted(actual_fileNames, key = actual_fileNames.get, reverse = True)
        actual_fileName = sorted_fileNames[0]
    if load_remotely:
# Read the multiple sheets
sheets = remoteConnection_utils.read_generic_remote_file(folder_path = folder_path, file_name = actual_fileName)
else:
sheets = read_generic_local_file(actual_fileName)
# Select only the relevant sheets
association_rules_df = sheets['Association rules']
enhanced_association_rules_df = sheets['Ass. rules with collections']
return association_rules_df, enhanced_association_rules_df
def generate_platformData(orders_df, col_names, inTrentino_flag = True, excluded_products = None):
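    """Build the platform product view from the orders dataframe: unique
    products with their per-basket purchase frequency, an inTrentino source
    flag, and pre-processed 'linked' collection columns."""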
    # Guard against the shared mutable default
    if excluded_products is None:
        excluded_products = []
    transaction_identifier = "Transaction id"
product_identifier = "Title"
# Actual col names to select
actual_col_names = []
    for col_name in col_names:
        if col_name in orders_df.columns:
            actual_col_names.append(col_name)
        else:
            print("-" * 40)
            print(f'COLUMN "{col_name}" not found!')
            print("-" * 40)
    orders_df = orders_df.sort_values(by = transaction_identifier, ascending = False)
    # Normalize strings
    normalize_string = lambda string_name: string_name.capitalize() if isinstance(string_name, str) else string_name
    simplified_orders = orders_df[actual_col_names].applymap(normalize_string)
    # Retrieve the unique products
    platform_products = simplified_orders.drop_duplicates().reset_index(drop = True)
    # Compute their frequencies
    num_baskets = len(orders_df[transaction_identifier].unique())
    item_frequencies = simplified_orders[product_identifier].value_counts() / num_baskets
    item_frequencies = item_frequencies.rename('Frequency').round(decimals = 4).to_frame()
    # Add the inTrentino flag
    platform_products['inTrentino_source'] = inTrentino_flag
    # Merge the two views
    platform_products = platform_products.merge(item_frequencies,
                                                left_on = product_identifier,
                                                right_index = True)
    platform_products = platform_products.sort_values(by = product_identifier, ascending = True)
    platform_products = platform_products.reset_index(drop = True)
    # Drop the explicitly excluded products
    for item in excluded_products:
        cond = platform_products[product_identifier].str.lower() == item.lower()
        obs_to_exclude = platform_products.loc[cond, product_identifier]
        if len(obs_to_exclude) > 0:
            print("-" * 40)
            print(f"EXCLUDED: {' | '.join(obs_to_exclude.unique())}")
            platform_products.drop(index = obs_to_exclude.index, inplace = True)
            print("-" * 40, "\n")
    # Delete duplicate items and artefacts
    if 'SKU' in platform_products.columns:
        platform_products = platform_products.dropna(axis = 0, subset = ['SKU'])
        platform_products.reset_index(drop = True, inplace = True)
    # Fill the empty links with empty lists
    linked_cols = [col for col in platform_products.columns if 'linked' in col.lower()]
    platform_products = preProcessing_collectionColumns(platform_products, linked_cols)
    return platform_products
def load_groupedCollectionNames():
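    """Load the grouped collection names from the remote ProductCollections.json."""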
# Load the collection names
grouped_collections = remoteConnection_utils.read_generic_remote_file(
folder_path = path.join('indaco', 'inTrentino', 'data', 'ontology'),
file_name = "ProductCollections.json")
# Visualize the collection groups
    print(f'Collection groups ({len(grouped_collections)}):',
          ', '.join(grouped_collections.keys()))
return grouped_collections
def load_collectionTypes(filepath, verbose = True):
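    """Load the raw collection types from a local JSON file and extract, for
    each type, the list of its collection names."""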
rawCollectionTypes = dict()
with open(filepath) as json_file:
rawCollectionTypes = json.load(json_file)
# Keep and extract the collection names
collectionTypes = defaultdict(list)
for colType, collections in rawCollectionTypes.items():
if verbose:
print("\n" + "-" * 48)
print("-" * 15, colType.upper(), "-" * 15)
print("-" * 48)
for collection_name in collections.values():
collectionTypes[colType].append(collection_name)
if verbose:
print('-->', collection_name)
if verbose:
print("-" * 48)
print("-" * 48 + "\n")
return collectionTypes, rawCollectionTypes
def load_indacoData(db_type, visualize_aggregated_territories = True):
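    """Connect to the product database and load the products (restricted to a
    single channel), orders, categories and customer profiles, enhancing the
    product dataframe with category, seller, warehouse and order information.
    When `visualize_aggregated_territories` is True, a summary of the
    production areas is also printed."""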
# -------- Connecting to the database ---------------
db = DbService(db_type)
# Get the categories
categories = db.get_productTypes()
# Get the sellers
sellers = db.get_sellers()
# Get the warehouse
warehouses = db.get_warehouses()
# Get orders
orders = db.get_orders()
# Get the products
indacoProducts_df = db.get_products(consider_delatedProducts = False)
    # Keep only the products belonging to a specific sales channel (hard-coded channel id)
    indacoProducts_df = indacoProducts_df[indacoProducts_df['channel'] == "62ed13d02477d328814c66ed"]
# --------------------------------------------
# Enhance product df
    if db_type == "mongodb":
indacoProducts_df = mongodb_utils.enhanced_products_df(indacoProducts_df, categories, sellers, warehouses, orders)
else:
indacoProducts_df = db.enhanced_products_df(indacoProducts_df, categories, sellers, warehouses, orders)
# Get user profiles
userProfiles = db.get_customerProfiles()
# Visualize production_areas
if visualize_aggregated_territories:
aggregatedProductionAreas = indacoProducts_df[indacoProducts_df['production_areas'] != ''].copy()
aggregatedProductionAreas.columns = list(map(str.upper,aggregatedProductionAreas.columns))
aggregatedProductionAreas = aggregatedProductionAreas.groupby(
by = ['PRODUCTION_AREAS','VENDOR','PRODUCT TYPE'],
as_index = False).count()
aggregatedProductionAreas = aggregatedProductionAreas[['PRODUCTION_AREAS','VENDOR','PRODUCT TYPE', 'FREQUENCY']]
aggregatedProductionAreas.rename(columns = {'FREQUENCY': 'ITEMS'},inplace = True)
aggregatedProductionAreas = aggregatedProductionAreas.sort_values(
by = ['PRODUCTION_AREAS','VENDOR','PRODUCT TYPE'],
ascending = True)
print("\n" + 70 * "-")
print("-" * 20, f'Aggregated production areas', "-" * 21)
print(70 * "-", "\n")
print(61 * "-")
print(61 * "-")
print(tabulate(aggregatedProductionAreas, headers = aggregatedProductionAreas.columns, tablefmt = 'pretty'))
print(61 * "-")
print(61 * "-", "\n")
total_items = aggregatedProductionAreas['ITEMS'].sum()
print('\t\t' + 30 * "-")
print('\t\t' + 30 * "-")
print(f"\t\tTOTAL ITEMS: {total_items}/{len(indacoProducts_df)} "\
f"({np.round((total_items/len(indacoProducts_df))*100, 2)} %)")
print('\t\t' + 30 * "-")
print('\t\t' + 30 * "-", "\n")
withoutArea = indacoProducts_df.loc[indacoProducts_df['production_areas'] == '', 'Title'].unique()
print(f"ITEMS WITHOUT A PRODUCTION AREA ({len(withoutArea)}):", "\n" + "-" * 50)
print('\n'.join(withoutArea))
# Add the category for each product in the orders
findProdType = lambda df, indaco_sku: df.loc[df['indaco_sku'].str.upper() == indaco_sku.upper(), 'Product Type']
orders.insert(loc = 2, column = 'Product Type', value = orders['sku'].apply(
lambda sku: findProdType(indacoProducts_df, sku).unique()))
    # Deal with ordered products that no longer exist in the catalogue
    orders['Product Type'] = orders['Product Type'].apply(lambda category: category[0] if len(category) != 0 else np.nan)
    orders = orders.dropna(subset = ['Product Type'])
    return db, indacoProducts_df, orders, categories, userProfiles
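
# Minimal usage sketch (not executed on import): assumes a reachable database
# for DbService and the remote share used by remoteConnection_utils; the
# column names below are the ones this module relies on, while the actual
# files and data available may differ.
#
#   db, products_df, orders, categories, profiles = load_indacoData("mongodb")
#   rules_df, enhanced_rules_df = load_coPurchases(product_identifier = 'Title')
#   orders_df = load_orders()
#   platform_df = generate_platformData(orders_df, col_names = ['Title', 'SKU'])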