# | Current Path : /home/deltalab/PMS/recommendations/recommender-system-batch/_library/ |
# | Current File : //home/deltalab/PMS/recommendations/recommender-system-batch/_library/launcher_utils.py |
import json
from datetime import date
from os import path
from string import ascii_uppercase
import numpy as np
from pandas import DataFrame, ExcelWriter, concat
from tabulate import tabulate
from collections import defaultdict
from _library.data_utils import io_toolkit
from _library.recom_utils.collectionBased_utils import (
build_bundles, minorChanges_recomAttributes)
from _library.toolkit import jaccard_similarity
def visualize_intro(app_settings):
    """Print a start-up banner with the app version and every configuration parameter.

    Args:
        app_settings: configuration dict; must contain an 'app_version' key.
            Dict values are pretty-printed as JSON, list values joined with ' | '.

    Returns:
        None (console output only).
    """
    print("\n" + 120 * "-")
    print(120 * "-")
    print("-" * 43, f"Hybrid recommender system (v{app_settings['app_version']})", "-" * 43)
    print(120 * "-")
    print(120 * "-" + "\n" )
    print("\n" + 80 * "-")
    print(80 * "-")
    print("-" * 17, f"The setting file has been read ({len(app_settings.keys())} params)", "-" * 20)
    print(80 * "-")
    print(80 * "-" + "\n" )
    for idk, (key, value) in enumerate(app_settings.items()):
        info = type(value).__name__
        if isinstance(value, dict):
            info += ':' + str(len(value.keys()))
            value = json.dumps(value, indent = 8)
        elif isinstance(value, list):
            info += ':' + str(len(value))
            # Cast items to str so lists of non-string values do not break the join
            value = ' | '.join(map(str, value))
        print('-' * 90)
        # Wrap around the alphabet so more than 26 parameters cannot raise IndexError
        print(f" |{ascii_uppercase[idk % len(ascii_uppercase)]}| {key.upper()} ({info}) --> {value}")
        print('-' * 90)
def generate_rsEngine_names(app_settings):
    """Build the ordered list of recommender-engine names.

    Starts from app_settings['rs_priority']; when 'add_categoryBased_variants'
    is set, a '<name>_cat' variant is appended right after the last base method
    that supports a category-based version.

    Args:
        app_settings: configuration dict with 'rs_priority' (list of engine
            names) and 'add_categoryBased_variants' (bool) keys.

    Returns:
        A new list of engine names; app_settings is left unmodified.
    """
    # Work on a copy: the original implementation extended
    # app_settings['rs_priority'] in place, mutating the caller's settings.
    rs_engines = list(app_settings['rs_priority'])
    if app_settings['add_categoryBased_variants']:
        # Retrieve the methods available
        methods_with_catVersion = ["enhanced_assRules", "collectionBased", "simple_assRules"]
        methods_with_catVersion = [item for item in methods_with_catVersion if item in rs_engines]
        if len(methods_with_catVersion) > 0:
            # Generate the names for the categoryBased versions
            catVersion_names = [engine_name + '_cat' for engine_name in rs_engines
                                if engine_name in methods_with_catVersion]
            # Insert the new names right after the last matching base method
            entryPoint = max(rs_engines.index(item) for item in methods_with_catVersion) + 1
            for idk, engineName_variant in enumerate(catVersion_names):
                rs_engines.insert(entryPoint + idk, engineName_variant)
    # Visualize the engines
    print("\n" + "-" * 30)
    print(f"Recommender engines ({len(rs_engines)}):")
    print("-" * 30)
    for idk, engine_name in enumerate(rs_engines):
        print(f"--> ({idk + 1}) {engine_name}")
    print("-" * 30, "\n")
    return rs_engines
def visualize_recommendations(recommendations):
    """Pretty-print each reference product together with its recommendations.

    Args:
        recommendations: dict mapping a product name to a list of
            recommendation dicts (attribute name -> value).

    Returns:
        None (console output only).
    """
    # Renamed the loop variable: it previously shadowed the 'recommendations'
    # parameter, which made the body confusing to read.
    for idk, (item_name, item_recoms) in enumerate(recommendations.items()):
        print("\n" + "-" * 60)
        print(f"({idk + 1}) REFERENCE PRODUCT: {item_name}")
        print(f"--> Recommendations: {len(item_recoms)}\n")
        print( "-" * 60)
        if len(item_recoms) == 0:
            print("There are no items that can be recommended! Sorry\n")
        else:
            for recommendation in item_recoms:
                for attribute, value in recommendation.items():
                    if isinstance(value, (list, set)):
                        # Cast to str so non-string members do not break the join
                        value = ' | '.join(map(str, value))
                    if isinstance(value, str):
                        value = value.upper()
                    print(f"--> {attribute} [{type(value)}] --> {value}")
        print("-" * 60)
def merge_order_dfs(indacoOrders_df, inTrentinoOrders_df, sku_mapping):
    """Append the inTrentino orders to the Indaco orders in a single dataframe.

    The inTrentino rows are normalized to the Indaco column names, tagged with
    customer_id -1, and their transaction ids are made negative so they can
    never collide with the Indaco ones.

    Args:
        indacoOrders_df: Indaco orders dataframe.
        inTrentinoOrders_df: inTrentino orders dataframe with columns
            'Transaction id', 'SKU', 'Order Month', 'Title', 'Product Type', 'Quantity'.
        sku_mapping: dict mapping a simplified (upper-case) SKU to the Indaco SKU.

    Returns:
        A new dataframe with both sources, sorted by 'Transaction id' ascending.
    """
    # Keep only the relevant columns and discard rows without a SKU
    keep_cols = ['Transaction id', 'SKU', 'Order Month', 'Title', 'Product Type', 'Quantity']
    attachable = (inTrentinoOrders_df[keep_cols]
                  .dropna(subset = ['SKU'])
                  .copy()
                  .reset_index(drop = True))
    # Normalize SKUs and tag these rows with a placeholder customer
    attachable['SKU'] = attachable['SKU'].str.upper()
    attachable['customer_id'] = -1
    #attachable['Order Month'] = attachable['Order Month'].apply(lambda ts: ts.to_timestamp())
    attachable = attachable.rename(
        columns = {
            'Title': 'product_name',
            'Quantity': 'quantity',
            'Order Month' : 'timestamp'})
    # Resolve the Indaco SKU when a mapping exists; empty string otherwise
    attachable['indaco_sku'] = attachable['SKU'].map(
        lambda simplified_sku: sku_mapping.get(simplified_sku, ""))
    # Shift then negate the ids so they cannot overlap the Indaco transaction ids
    attachable['Transaction id'] = -(attachable['Transaction id'] + 1)
    # Merge the two sources and restore a stable ordering
    merged_df = concat([indacoOrders_df, attachable], axis = 0)
    return merged_df.sort_values(by = ['Transaction id'], ascending = True).reset_index(drop = True)
def rearrange_recommendations(recommendations_byProduct, platfrom_products, app_settings, rs_priorities, user_profile, product_identifier):
    """Sort, de-duplicate and trim the recommendations produced for each product.

    For every product SKU the recommendations are ordered either by the user's
    profile (preferred categories/brands, already-bought products) or, when no
    profile is available, by a generic criterion (engine priority, type
    similarity, name dissimilarity via jaccard_similarity). Duplicates are then
    removed (through build_bundles for lists with more than 4 items), the
    attributes are normalized/reordered and the list is optionally capped at
    app_settings['max_recommendations'].

    Args:
        recommendations_byProduct: dict mapping product SKU -> list of
            recommendation dicts; updated in place and also returned.
        platfrom_products: products dataframe (parameter name keeps the
            original 'platfrom' typo since callers pass it positionally/by name).
        app_settings: application configuration dict.
        rs_priorities: ordered list of recommender-engine names, best first.
        user_profile: user's preferences; an empty dict selects generic sorting.
        product_identifier: column of platfrom_products holding the product SKU.

    Returns:
        The same recommendations_byProduct dict, mutated in place.
    """
    print("\n" + 90 * "-")
    print("-" * 27,"Sort and re-arrange recommendations", "-" * 26)
    print(90 * "-")
    print(f"[ORDERS] Recommender engines ({len(rs_priorities)}):", "\n" + "-" * 30)
    for idk, engine_name in enumerate(rs_priorities):
        print(f"--> ({idk + 1}) {engine_name}")
    print("-" * 30, "\n")
    # Sorting function: user-based when a profile is available, generic otherwise
    userBased = len(user_profile.keys()) > 0
    # --> (A) Profile-based
    if userBased:
        print("\nSORTING MODE: User-based\n")
        normalizeNames = lambda items: list(map(str.lower, items))
        if 'brands' not in user_profile.keys():
            user_profile['brands'] = []
        # Low is better (ascending)
        sortingProfileBased_func = lambda recom : (
            rs_priorities.index(recom['rs_source']),
            recom['item_type'].lower() not in normalizeNames(user_profile['categories']), # PREFERRED VALUE: false (0)
            recom['item_vendor'].lower() not in normalizeNames(user_profile['brands']), # PREFERRED VALUE: false (0)
            recom['item_sku'].lower() in normalizeNames(user_profile['unique_products'])) # PREFERRED VALUE: false (0)
        #recom['###'].lower() in normalizeNames(user_profile['bio_percentage'] if user_profile['bio_percentage'] is not -1)
        # maybe also add the bio (organic) preference here
    # --> (B) Generic approach: (1) rs method (2) type dissimilarity and (3) name dissimilarity
    else:
        print("\nSORTING MODE: Generic sorting\n")
        # Low is better (ascending)
        genericSorting = lambda recom, referenceName, referenceType: (
            rs_priorities.index(recom['rs_source']),
            1 - jaccard_similarity(recom['item_type'], referenceType),
            jaccard_similarity(recom['item_name'], referenceName))
    for product_sku in recommendations_byProduct.keys():
        # Find the product type and name of the reference product
        cond = platfrom_products[product_identifier] == product_sku
        product_type = platfrom_products.loc[cond, 'Product Type'].values[0]
        product_name = platfrom_products.loc[cond, 'Title'].values[0]
        # Set the sorting function according to the modality: Generic or user-based.
        # Only one of the two lambdas exists, but the unused branch is never
        # evaluated, so this is safe.
        sorting_func = lambda recom: sortingProfileBased_func(recom) if userBased else genericSorting(recom, product_name, product_type)
        # Pre-sort recommendations (according to 'rs_priority' and 'rs_source')
        recommendations_byProduct[product_sku] = sorted(recommendations_byProduct[product_sku], key = sorting_func)
        #print(f"[{product_sku}] ORIGINAL ({len(recommendations_byProduct[product_sku])}):", json.dumps(recommendations_byProduct[product_sku], indent = 4))
        # Filter recommended items: (a) drop similar products || (b) self.drop_similar_categories
        rs_methods = [recom['rs_source'] for recom in recommendations_byProduct[product_sku]]
        if len(recommendations_byProduct[product_sku]) > 4 and 'randomProducts' not in rs_methods:
            bundles = build_bundles(
                reference_products = None,
                recommendations_byCollectionType = {'all_items': recommendations_byProduct[product_sku]},
                products_df = platfrom_products,
                product_identifier = product_identifier,
                drop_similar_categories = app_settings['drop_similar_categories'],
                merge_collection_type = False,
                output_recom_with_one_linktype = app_settings['output_recom_with_one_linktype'],
                verbose = False)
            recommendations_byProduct[product_sku] = bundles['all_items']
        else:
            # Drop duplicate items, keeping the first (best-ranked) occurrence
            recommendations = []
            for recom in recommendations_byProduct[product_sku]:
                previous_items = [item['item_sku'] for item in recommendations]
                if recom['item_sku'] not in previous_items:
                    recommendations.append(recom)
            recommendations_byProduct[product_sku] = recommendations
        # Sort recommendations after bundles generation
        recommendations_byProduct[product_sku] = sorted(recommendations_byProduct[product_sku], key = sorting_func)
        #print(f"AFTER BUILD ({len(recommendations_byProduct[product_sku])}):", json.dumps(recommendations_byProduct[product_sku], indent = 4))
        # Minor changes in the attributes of the recommendations
        attribute_order = ['rank', 'item_sku', "indaco_sku", 'item_name', 'item_frequency', 'item_type', 'item_vendor',
                           "similar_products", 'linked_production_areas', 'linked_regions', 'linked_recipes',
                           'linked_experiences','explaination', "seller", 'inTrentino_source', "merged_source", 'rs_source']
        recommendations_byProduct[product_sku] = minorChanges_recomAttributes(
            recommendations = recommendations_byProduct[product_sku],
            attribute_order = attribute_order)
        #print(f"MINOR CHANGES ({len(recommendations_byProduct[product_sku])}):", json.dumps(recommendations_byProduct[product_sku], indent = 4))
        # Optionally cap the number of recommendations per product
        if app_settings['max_recommendations'] != -1:
            partial_recommendations = recommendations_byProduct[product_sku][:app_settings['max_recommendations']]
            recommendations_byProduct[product_sku] = partial_recommendations
        #print(f"OUTPUT ({len(recommendations_byProduct[product_sku])}):", json.dumps(recommendations_byProduct[product_sku], indent = 4))
        # Warn (but do not fail) when a product ends up with fewer than 4 recommendations
        if len(recommendations_byProduct[product_sku]) < 4:
            print("SMALL BUNDLE:", product_sku)
            #raise Exception()
    return recommendations_byProduct
def compute_methodsCoverage(recommendations_byProduct, info_recommendations):
    """Build a DataFrame summarising item coverage and runtime per RS method.

    Args:
        recommendations_byProduct: dict mapping product SKU -> list of
            recommendations (only the list lengths are used here).
        info_recommendations: dict mapping RS method name -> info dict with
            'duration' (assumed np.timedelta64 — confirm against producers)
            and 'products' (list of covered SKUs); an empty dict marks a
            skipped method.

    Returns:
        DataFrame indexed by 'Overview' plus one row per method, with duration,
        coverage, percentage and cumulative-coverage columns.
    """
    info_df = DataFrame(index = ['Overview'] + list(info_recommendations.keys()),
                        columns = ['Duration', 'Item coverage (percentage)',
                                   'Item coverage', '[AVG] recommended items for product'])
    # [Overview] Compute general items coverage
    items_coverage = np.array(list(map(len, recommendations_byProduct.values())))
    total_items = len(recommendations_byProduct.keys())
    items_covered = np.nonzero(items_coverage)[0].size
    items_covered_percentage = round((items_covered / len(items_coverage)) * 100, 1)
    # Sum the per-method durations; guard the empty case (np.sum([]) returns a
    # float and np.timedelta64(float, unit) raises)
    durations = [info['duration'] for info in info_recommendations.values() if 'duration' in info.keys()]
    total_duration = np.sum(durations) if durations else np.timedelta64(0, 's')
    # Display in seconds up to one minute, in minutes afterwards
    total_duration = np.timedelta64(total_duration, 's' if total_duration.astype(int) <= 60 else 'm')
    info_df.loc['Overview', 'Duration'] = total_duration
    info_df.loc['Overview', 'Item coverage'] = items_covered
    info_df.loc['Overview', 'Item coverage (percentage)'] = items_covered_percentage
    info_df.loc['Overview', '[AVG] recommended items for product'] = np.round(np.mean(items_coverage), 0)
    # [Overview] Visualize general items coverage
    print("\n" + 90 * "-")
    print("-" * 24,f"[Overview] ITEM COVERAGE: {items_covered}/{len(items_coverage)} "\
          f"({items_covered_percentage} %)", "-" * 24)
    print('-' * 35, "DURATION: ~", total_duration, '-' * 35)
    print(90 * "-")
    # [Details] Visualize items coverage for each RS method
    coveredItems_counter = set()
    for idk, (method_name, info) in enumerate(info_recommendations.items()):
        # The RS method has been skipped
        if len(info.keys()) == 0:
            continue
        # Compute the method coverage
        products_covered = info['products']
        coveredItems_counter.update(products_covered)
        products_covered_percentage = round((len(products_covered)/total_items) * 100, 1)
        products_covered_comulativePercentage = round((len(coveredItems_counter) / total_items) * 100, 1)
        # Add the coverage information
        info_df.loc[method_name, 'Duration'] = info['duration']
        info_df.loc[method_name, 'Item coverage'] = len(products_covered)
        info_df.loc[method_name, 'Item coverage (percentage)'] = products_covered_percentage
        # Add the cumulative coverage
        info_df.loc[method_name, 'Item coverage (cumulative)'] = len(coveredItems_counter)
        info_df.loc[method_name, 'Item coverage (cumulative percentage)'] = products_covered_comulativePercentage
        # Visualize the outcomes
        print("\n" + 60 * "-")
        print(f"RS METHOD {idk + 1}: {method_name} --> item coverage: "\
              f"{products_covered_percentage} % ({len(products_covered)})")
        print("\t\t\tDURATION:", info['duration'])
        print(60 * "-")
        if len(products_covered) > 0:
            print("-->", "\n--> ".join(products_covered[:5]))
            if len(products_covered) > 5:
                print(f"--> ...")
            print(40 * "-")
        print(60 * "-" + "\n")
    # Re-order the columns once, after all rows are filled (the original version
    # repeated this reindex on every loop iteration)
    info_df = info_df.reindex(
        columns = [
            '[AVG] recommended items for product', 'Duration',
            'Item coverage', 'Item coverage (cumulative)',
            'Item coverage (percentage)', 'Item coverage (cumulative percentage)']
    )
    return info_df
def generate_params_df(app_settings):
    """Flatten the application settings into a one-column DataFrame ('Value').

    Lists become comma-joined strings; nested dicts become one row per
    sub-key labelled '<param>:<sub-param>'; everything else is kept as-is.

    Args:
        app_settings: configuration dict to flatten.

    Returns:
        DataFrame indexed by parameter name with a single 'Value' column.
    """
    flattened = {}
    for name, value in app_settings.items():
        if isinstance(value, list):
            flattened[name] = ', '.join(value)
        elif isinstance(value, dict):
            # One row per nested key, labelled "<param>:<sub-param>"
            for nested_key, nested_value in value.items():
                flattened[name + ":" + nested_key] = nested_value
        else:
            flattened[name] = value
    return DataFrame.from_dict(flattened, orient = 'index', columns = ['Value'])
def visualize_orders(orders_df, last_k_orders = -1):
    """Print the orders table, optionally restricted to the last K transactions.

    Args:
        orders_df: orders dataframe; must contain 'Transaction id', 'indaco_sku',
            'Product Type', 'product_name' and 'quantity' columns.
        last_k_orders: when different from -1, show only the K highest
            transaction ids.

    Returns:
        None (console output only).
    """
    print("\n" + 100 * "-")
    print("-" * 38, f"Transaction ({len(orders_df['Transaction id'].unique())})", "-" * 37)
    print(100 * "-", "\n")
    if last_k_orders != -1:
        # Keep only the K most recent transaction ids
        recent_ids = sorted(orders_df['Transaction id'].unique(), reverse = True)[:last_k_orders]
        orders_df = orders_df[orders_df['Transaction id'].isin(recent_ids)]
        print("\n" + 150 * "-")
        print("-" * 67, f"LAST {last_k_orders} orders", "-" * 68)
        print(150 * "-")
    # Hide the internal SKU column and render a stable, readable ordering
    to_show = orders_df.drop(columns = ['indaco_sku']).sort_values(
        by = ['Transaction id', 'Product Type' ,'product_name', 'quantity'],
        ascending = [True, True, True, False]).reset_index(drop = True)
    print(tabulate(to_show, headers = to_show.columns, tablefmt = "pretty"))
def visualize_products(products_df, all_columns = False):
    """Print the products catalogue, fully or with a curated column subset.

    Args:
        products_df: products dataframe with at least the curated columns below.
        all_columns: when True, show every column instead of the default subset.

    Returns:
        None (console output only).
    """
    default_cols = ['Title', 'SKU', 'Product Type', 'Vendor', 'Seller',
                    'production_areas','inTrentino_source', 'Frequency']
    cols_to_visualize = products_df.columns if all_columns else default_cols
    print("\n" + 120 * "-")
    print("-" * 52, "INDACO products", "-" * 51)
    print(120 * "-", "\n")
    ordered_view = products_df[cols_to_visualize].sort_values(by = ['Title', "Product Type"]).reset_index(drop = True)
    print(f"ALL COLUMNS ({len(products_df.columns)}):", ' | '.join(products_df.columns), "\n")
    print(tabulate(ordered_view, headers = cols_to_visualize, tablefmt = "pretty"))
    print("\n")
def visualize_user_profile(user_id, user_profile):
    """Print every attribute of a user profile and return the profile unchanged.

    Lists/sets are shown sorted on one line; defaultdicts are expanded one key
    per line; every other value is printed verbatim.

    Args:
        user_id: identifier shown in the header banner.
        user_profile: dict of attribute name -> value.

    Returns:
        The same user_profile dict, untouched.
    """
    print("\n" + "-" * 140 + "\n" + "-" * 64, f'USER ID: {user_id}', "-" * 63 + "\n" + "-" * 140, "\n")
    chart_divider = ' | '
    for name, values in user_profile.items():
        # CASE A: flat collection -> sorted, single line
        if isinstance(values, (set, list)):
            print(name.upper(), f"({len(values)}):", chart_divider.join(sorted(map(str, values))))
        # CASE B: dictionary of lists -> one indented line per key
        elif isinstance(values, defaultdict):
            # NOTE(review): the leading 't' in the per-key label looks like it was
            # meant to be '\t'; kept verbatim to preserve the original output.
            per_key_lines = [f't{key} ({len(vals)} items) --> ' + chart_divider.join(sorted(vals))
                             for key, vals in values.items()]
            print(name.upper(), f"({len(values)}):\n\t", '\n\t '.join(per_key_lines))
        # GENERAL CASE: scalar attribute
        else:
            print(str(name).upper() + ":", values)
    print("-" * 80)
    return user_profile
def save_recommendations(recommendations_byProduct, info_df, app_settings,
                         base_saving_folder = 'recommendations',
                         user_id = None,
                         base_file_name = 'raccomandazioni_inTrentino'):
    """Persist the recommendations as JSON plus execution info and params as Excel.

    Files go under <base_saving_folder>/v<app_version>[/user_<id>|generic_user];
    file names get a month-year suffix and, when 'drop_similar_categories' is
    enabled, a '_tipiProdottiAggregati' marker.

    Args:
        recommendations_byProduct: dict of product SKU -> recommendations to dump.
        info_df: execution-info DataFrame written to the 'Execution info' sheet.
        app_settings: configuration dict ('app_version', 'drop_similar_categories').
        base_saving_folder: root output directory.
        user_id: optional user id; -1 selects the 'generic_user' subfolder.
        base_file_name: prefix of the recommendations JSON file.

    Returns:
        The folder path the files were written to.
    """
    # Generate the dataframe for the application setting
    app_setting_df = generate_params_df(app_settings)
    # Add version subfolder, e.g. recommendations/v1.2
    saving_folder = path.join(base_saving_folder, f"v{app_settings['app_version']}")
    # NOTE(review): truthiness check — user_id == 0 would skip the subfolder
    # entirely; confirm 0 is never a valid user id.
    if user_id:
        subfolder = f'user_{user_id}' if user_id != -1 else 'generic_user'
        saving_folder = path.join(saving_folder, subfolder)
    # [file name] Extra info: lower-case month+year suffix, e.g. '_jan2024'
    recom_file_name = base_file_name
    recom_file_name += '_' + date.today().strftime("%b%Y").lower()
    if app_settings['drop_similar_categories']:
        recom_file_name += '_tipiProdottiAggregati'
    # [file name] Extension
    recom_file_name += '.json'
    # Save the recommendations for each product as a JSON file
    io_toolkit.save_recommendations(recommendations_byProduct, saving_folder, recom_file_name)
    # Save the further information (execution stats + flattened settings)
    extra_info_file_name = 'info'
    if app_settings['drop_similar_categories']:
        extra_info_file_name += '_tipiProdottiAggregati'
    extra_info_file_name += '.xlsx'
    # NOTE(review): pandas ExcelWriter documents modes 'w' and 'a' only; 'w+'
    # may be rejected by recent pandas versions — confirm against the pinned
    # dependency before upgrading.
    with ExcelWriter(path.join(saving_folder, extra_info_file_name), mode = 'w+') as excelWriter:
        info_df.to_excel(excelWriter, sheet_name = 'Execution info')
        # NOTE(review): 'Appplication' has a triple 'p'; left untouched because
        # the sheet name is runtime output that consumers may rely on.
        app_setting_df.to_excel(excelWriter, sheet_name = 'Appplication params')
    return saving_folder