# Current Path : /home/deltalab/PMS/recommendations/recommender-system-batch/_library/recom_utils/
# Current File : /home/deltalab/PMS/recommendations/recommender-system-batch/_library/recom_utils/assRules_utils.py
import numpy as np
from _library.toolkit import buildUp_explaination
def preProcessing_coPurchases(association_rules_df, name_mapping = None):
    """Prepare a co-purchase association-rules dataframe for downstream use.

    Optionally renames columns through ``name_mapping`` (a column is renamed
    when an original name occurs in it, matched case-insensitively; the
    replacement itself is case-sensitive, mirroring ``str.replace``).
    Every non-numeric column is then converted from a '|'-delimited string
    into a list of whitespace-stripped item names, with NaN cells replaced
    by empty lists.

    NOTE: the input dataframe is modified in place and also returned.
    """
    separator = '|'
    if name_mapping:
        # Build the concrete column-rename map from the requested substitutions
        rename_map = {}
        for old_name, new_name in name_mapping.items():
            for column in association_rules_df.columns:
                if old_name.lower() in column.lower():
                    rename_map[column] = column.replace(old_name, new_name)
        if rename_map:
            association_rules_df.rename(columns = rename_map, inplace = True)
    for column in association_rules_df.select_dtypes(exclude = np.number).columns:
        # Split the delimited string into a list of items
        cleaned = association_rules_df[column].str.split(separator)
        # Trim whitespace around every item of each list
        cleaned = cleaned.apply(
            lambda items: [entry.strip() for entry in items] if isinstance(items, list) else items)
        # NaN (a float) marks a missing basket --> replace with an empty list
        cleaned = cleaned.apply(
            lambda items: list() if isinstance(items, float) else items)
        association_rules_df[column] = cleaned
    return association_rules_df
def find_recommendations(association_rules, products_df, product_identifier, reference_items,
                         excluded_products = None, excluded_link_types = None,
                         force_perfect_match = False, drop_scores = True, filter_source_platform = True, verbose = True):
    """Turn association rules into product recommendations for the given reference items.

    Parameters
    ----------
    association_rules : pandas.DataFrame
        Pre-processed rules with list-valued 'antecedents'/'consequents' columns
        plus 'support', 'confidence' and 'lift' score columns
        (see preProcessing_coPurchases). NOTE: the 'antecedents' column is
        upper-cased in place.
    products_df : pandas.DataFrame
        Product catalogue used to enrich the recommendations; must contain the
        ``product_identifier`` column and a 'Frequency' column. NOTE: the
        identifier column is upper-cased in place.
    product_identifier : str
        Catalogue column matched against the recommended item names
        (e.g. 'Title' or 'Product Type').
    reference_items : list[dict]
        Items in the basket; each dict needs 'item_name' and 'vendor', and
        'inTrentino_source' when ``filter_source_platform`` is enabled.
    excluded_products, excluded_link_types : list or None
        Item names / linked-collection types to drop from the output
        (``None`` means no exclusions).
    force_perfect_match : bool
        Keep only rules whose antecedents equal exactly the reference items.
    drop_scores : bool
        Drop the 'confidence'/'support'/'lift' columns from the output.
    filter_source_platform : bool
        Keep only items from the reference items' (unique) source platform.
    verbose : bool
        Print the full recommendation dataframe instead of a compact listing.

    Returns
    -------
    list[dict]
        One record per recommended item, ranked by confidence/lift/support,
        with empty-list attributes removed. Empty when no rules are given.
    """
    # Guard against the shared-mutable-default pitfall: normalize None --> []
    if excluded_products is None:
        excluded_products = []
    if excluded_link_types is None:
        excluded_link_types = []
    # Normalize lists of items by upper-casing so comparisons are case-insensitive
    normalize_string = lambda item_list: list(map(str.upper, item_list))
    reference_item_names = normalize_string([item['item_name'] for item in reference_items])
    if len(association_rules) == 0:
        print(f"[{', '.join(reference_item_names)}] ERROR! No association rules have been found!")
        return list()
    association_rules['antecedents'] = association_rules['antecedents'].map(normalize_string)
    # PART A: Filter the association rules concerning these reference items
    intersect_cond = lambda item_list: np.intersect1d(item_list, reference_item_names)
    check_intersect_cond = lambda item_list: True if len(intersect_cond(item_list)) >= len(reference_item_names) else False
    # Find the minimum number of matching items. When no rule matches, min()
    # over the empty selection yields NaN and the size condition below is
    # always False, producing an empty result set.
    test_cond = association_rules['antecedents'].apply(check_intersect_cond)
    min_matching_items = association_rules.loc[test_cond, 'antecedents'].apply(len).min()
    smallest_matching_basket_cond = lambda item_list: len(item_list) == min_matching_items
    # Define the final filtering condition
    match_cond = lambda item_list: check_intersect_cond(item_list) and smallest_matching_basket_cond(item_list)
    # Force the matching baskets to include only the reference items
    if force_perfect_match:
        same_item_cond = lambda item_list: np.array_equal(sorted(item_list), sorted(reference_item_names))
        match_cond = lambda item_list: check_intersect_cond(item_list) \
                                       and smallest_matching_basket_cond(item_list) \
                                       and same_item_cond(item_list)
    # Filtering the association rules
    filtering_cond = association_rules['antecedents'].apply(match_cond)
    basic_item_ass_rules_df = association_rules[filtering_cond]
    # Generate rows for each consequent item (A --> B & C) --> (A --> B | A --> C)
    item_ass_rules_df = basic_item_ass_rules_df.explode('consequents').reset_index(drop = True)
    item_ass_rules_df = item_ass_rules_df.drop_duplicates(subset = 'consequents')
    # Transform the dataset (i.e., rename)
    recommendations = item_ass_rules_df.rename(columns = {'consequents': 'item_name'})
    recommendations['item_name'] = recommendations['item_name'].str.upper()
    # Remove excluded products
    cond = ~recommendations['item_name'].isin(excluded_products)
    recommendations = recommendations[cond]
    # Map 'Common - <collection>' columns to their bare collection names
    assRule_collectionNames = {assRule_col: assRule_col.split('-')[1].strip()
                               for assRule_col in recommendations.columns
                               if 'common' in assRule_col.lower()}
    # Remove items of excluded link types
    if len(assRule_collectionNames.keys()) > 0:
        for colName in assRule_collectionNames.keys():
            recommendations[colName] = recommendations[colName].apply(
                lambda items: list(map(str.capitalize, items)))
        if len(excluded_link_types) > 0:
            excluded_col_link = [full_colName for excluded_col in excluded_link_types
                                 for full_colName, collection_name in assRule_collectionNames.items()
                                 if excluded_col.lower() in collection_name.lower().split(' ')]
            remaining_links = np.setdiff1d(list(assRule_collectionNames.keys()), excluded_col_link)
            # Drop columns concerning excluded links
            recommendations = recommendations.drop(columns = excluded_col_link)
            # Keep observations having linked collections in the remaining linked collection columns
            # NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1
            # (DataFrame.map is the replacement) — kept for compatibility here
            dim_linkedCollections = recommendations[remaining_links].applymap(len) > 0
            cond = dim_linkedCollections.any(axis = 1)
            recommendations = recommendations[cond]
            dropped_items = len(cond) - len(cond[cond == True])
            if dropped_items > 0:
                print(f"\nDropping {dropped_items} items concerning the excluded link "\
                      f"(i.e., {', '.join(excluded_col_link)})\n")
    # Retrieve further information about the recommended products
    # (NOTE: upper-cases the identifier column of the caller's products_df in place)
    products_df[product_identifier] = products_df[product_identifier].str.upper()
    recommendations = recommendations.merge(products_df, how = 'left', left_on = 'item_name', right_on = product_identifier)
    recommendations = recommendations.rename(
        columns = {
            'Title': 'product_name',
            'Product Type': 'item_type',
            'Vendor': 'item_vendor',
            'SKU': 'item_sku',
            'production_areas' : 'linked_production_area'}
    )
    # Keep only the most frequent product per item name (relevant when the
    # identifier used is the product type, which can match several products)
    recommendations = recommendations.sort_values(by = 'Frequency', ascending = False)
    recommendations = recommendations.drop_duplicates(subset = 'item_name').reset_index(drop = True)
    # Drop unnecessary columns
    if len(recommendations) > 0:
        cols_to_delete = ['Type id', 'Frequency', "indaco_sku", 'refrigerated', 'weight [grams]', 'warehouses']
        if product_identifier == 'Title':
            cols_to_delete.append('product_name')
        elif product_identifier == 'Product Type':
            cols_to_delete.extend(['item_sku', 'product_name', 'item_vendor', 'inTrentino_source', 'linked_production_area'])
        actual_cols_to_delate = [col for col in recommendations.columns if col in cols_to_delete]
        recommendations = recommendations.drop(columns = actual_cols_to_delate)
    if filter_source_platform and product_identifier != 'Product Type':
        reference_sourcePlatforms = np.unique([item['inTrentino_source'] for item in reference_items])
        # Filter recommended items if there is a unique source platform among the reference products
        if len(reference_sourcePlatforms) == 1:
            reference_sourcePlatform = reference_sourcePlatforms[0]
            original_recommendation = recommendations.copy()
            cond = recommendations['inTrentino_source'] == reference_sourcePlatform
            recommendations = recommendations[cond]
            unknown_products = original_recommendation[original_recommendation['inTrentino_source'].isna()]
            if len(unknown_products) > 0:
                print(f"\n\t[INFO] {len(unknown_products)} products have not been found. So sad :/")
                print('\t-->', '\n\t--> '.join(unknown_products['item_name'].values))
            drop_items = original_recommendation.drop(index = recommendations.index)
            if len(drop_items) - len(unknown_products) > 0:
                print(f"\n\t[INFO] Dropped {len(original_recommendation) - len(recommendations)} "\
                      "associated items which come from the other platform.")
                print('\t-->', '\n\t--> '.join(drop_items['item_name'].values))
    # Sort dataframe by rule quality, best rules first
    recommendations = recommendations.sort_values(by = ['confidence', 'lift', 'support'], ascending = False).reset_index(drop = True)
    # Create the rank metadata column (1-based)
    recommendations.insert(0, 'rank', recommendations.index + 1)
    # Explanation statements (templates are in Italian, matching the output language)
    linked_collection_types = {
        'Common - production_areas' : ' e sono tipici del territorio',
    }
    recommendations['explaination'] = recommendations.apply(
        func = lambda df_row: buildUp_explaination(linked_collection_types, df_row),
        axis = 1)
    unnecessary_columns = [col_name for col_name in recommendations.columns
                           if ('antecedent' in col_name.lower()) or ('consequent' in col_name.lower())]
    if drop_scores:
        unnecessary_columns.extend(['confidence', 'support', 'lift'])
    recommendations.drop(columns = unnecessary_columns, inplace = True)
    # Remove artefacts from the column names
    cleaned_columns = []
    for col_name in recommendations.columns:
        if col_name == 'inTrentino_source':
            new_colName = col_name
        elif 'common' in col_name.lower():
            if 'linked' in col_name.lower():
                new_colName = col_name.replace('Common - ', '').replace(' ', '_').lower()
            else:
                new_colName = col_name.replace('Common -', 'Linked').replace(' ', '_').lower()
        else:
            new_colName = col_name.lower()
        cleaned_columns.append(new_colName)
    recommendations.columns = cleaned_columns
    # Visualize outcome
    item_stringfy = [item['item_name'] + (' [by ' + item['vendor'].upper()+ ']'
                                          if isinstance(item['vendor'], str) else ' ')
                     for item in reference_items]
    if len(recommendations) > 0:
        print("\n" + "-" * 110)
        print(f"[Assrules] REFERENCE ITEMS ({len(reference_items)}):", ' | '.join(item_stringfy), "-->",
              len(recommendations), "association rules")
        if not force_perfect_match and len(basic_item_ass_rules_df) > 0:
            print(f"Min matching antecedents items: {min_matching_items} "\
                  f"(e.g., {' | '.join(basic_item_ass_rules_df.iloc[0].loc['antecedents'])})")
        print("-" * 110)
        if verbose:
            print(recommendations)
        else:
            print("-->", '\n--> '.join(recommendations.apply(
                func = lambda df_row: f"({int(df_row.name) + 1}) [{df_row['item_type']}] {df_row['item_name']} " +
                                      (f"--> inTrentino: {str(df_row['inTrentino_source']).upper()}" if 'inTrentino_source' in df_row.index else ''),
                axis = 1)))
    # Return a list of record dictionaries
    recommendations = recommendations.to_dict(orient = 'records')
    # Remove empty-list attributes from each record
    for idk, recommendation in enumerate(recommendations.copy()):
        for key, attribute in recommendation.copy().items():
            if isinstance(attribute, list) and (len(attribute) == 0):
                recommendations[idk].pop(key)
    return recommendations
def find_products_byCategory(all_products, item_type, unique_product_identifier, reference_vendors,
                             reference_sourcePlatforms, collection_items = None):
    """Retrieve the catalogue products linked to a given product category.

    Parameters
    ----------
    all_products : pandas.DataFrame
        Product catalogue; must contain 'Product Type', 'Vendor', 'Title',
        'SKU', 'Frequency', 'Type id', 'production_areas' and (when platform
        filtering is requested) 'inTrentino_source'.
    item_type : str
        Category matched against the 'Product Type' column.
    unique_product_identifier : str
        Unused here; kept for interface compatibility with callers.
    reference_vendors : list or None
        Preferred vendors; if they yield zero products for the category, the
        vendor condition is dropped and all vendors are considered again.
    reference_sourcePlatforms : sequence or None
        Source platforms of the reference items (may be a numpy array);
        filtering applies only when exactly one platform is present.
    collection_items : list or None
        Optional whitelist of product titles.

    Returns
    -------
    (list[dict], list[str])
        The linked products (sorted by descending 'Frequency', deduplicated
        by Title+SKU, renamed to item_* keys) and the names of the filtering
        conditions actually applied.
    """
    # Build up the conditions
    # Condition (1): Type-based condition
    typeBased_cond = all_products['Product Type'] == item_type
    filtering_method = typeBased_cond
    filtering_condition_names = ['type_based']
    # Condition (2): Vendor-based condition
    # NOTE: 'is not None' (not '!= None') — the latter is an elementwise
    # comparison for array-likes and its truth value is ambiguous.
    if reference_vendors is not None:
        vendorBased_cond = all_products['Vendor'].isin(reference_vendors)
        filtering_method = typeBased_cond & vendorBased_cond
        filtering_condition_names.append('vendor_based')
    # Intermediate step (TEST: Cond 1 & cond 2)
    linked_items = all_products.loc[filtering_method, :]
    # In case the vendor has zero products for this category
    # --> Retrieve products also from other vendors
    if (len(linked_items) == 0) and (reference_vendors is not None):
        filtering_condition_names.remove('vendor_based')
        filtering_method = typeBased_cond
    # Condition (3): Item-based condition
    if collection_items is not None:
        itemBased_cond = all_products['Title'].isin(collection_items)
        filtering_method = filtering_method & itemBased_cond
        filtering_condition_names.append('collectionItem_based')
    # Condition (4): Source platform condition — the caller may pass a numpy
    # array (np.unique output), so identity comparison with None is required
    if reference_sourcePlatforms is not None:
        if len(reference_sourcePlatforms) == 1:
            reference_sourcePlatform = reference_sourcePlatforms[0]
            sourcePlatform_cond = all_products['inTrentino_source'] == reference_sourcePlatform
            filtering_method = filtering_method & sourcePlatform_cond
            filtering_condition_names.append('sourcePlatform_based')
    # Filter the products
    linked_items = all_products.loc[filtering_method, :]
    # Sort the recommendations according to the historical data
    linked_items = linked_items.sort_values(by = 'Frequency', ascending = False)
    # Save the list of linked items
    linked_items = linked_items.drop_duplicates(subset = ['Title', 'SKU'])
    linked_items = linked_items.drop(columns = ['Type id', 'Frequency', 'Product Type'])
    linked_items = linked_items.rename(columns = {
        'Title': 'item_name',
        'SKU': 'item_sku',
        'Vendor': 'item_vendor',
        'production_areas': 'linked_production_area'})
    linked_items = linked_items.to_dict(orient = 'records')
    return linked_items, filtering_condition_names
def addItems_byCategory(recommendations, reference_items, all_products, unique_product_identifier,
                        filter_source_platform = False, single_item = False):
    """Expand category-level recommendations into concrete linked products.

    For every recommendation dict (which represents a product category),
    look up matching products via find_products_byCategory. Recommendations
    with no linked product are dropped from the list in place. With
    ``single_item`` the best product's attributes are merged into the
    recommendation; otherwise all products go under 'category_items'.

    Returns the (mutated) recommendations list.
    """
    reference_vendors = [item['vendor'] for item in reference_items]
    if filter_source_platform:
        reference_sourcePlatforms = np.unique([item['inTrentino_source'] for item in reference_items])
    else:
        reference_sourcePlatforms = None
    # Track the last applied filtering conditions for the summary print below;
    # stays None when there are no recommendations to process (fixes an
    # UnboundLocalError on empty input)
    filtering_methods = None
    # Iterate over a shallow copy so items can be removed from the original list
    for recommendation in recommendations.copy():
        # Remove artefacts: a category entry carries no single item/product name
        recommendation.pop('item_name', None)
        recommendation.pop('product_name', None)
        linked_items, filtering_methods = find_products_byCategory(all_products, recommendation['item_type'],
                                                                   unique_product_identifier, reference_vendors,
                                                                   reference_sourcePlatforms)
        if len(linked_items) == 0:
            recommendations.remove(recommendation)
            print(f"[INFO] Dropping '{recommendation['item_type']}' due to a lack of linked items.")
            continue
        if single_item:
            # Merge the top-ranked product's attributes into the recommendation
            linked_item = linked_items[0]
            recommendation.update(linked_item)
        else:
            recommendation['category_items'] = linked_items
    if filtering_methods is not None:
        print("\n[INFO] Filtering approach: Products have been filtered according to:",
              ' | '.join(filtering_methods))
    if single_item:
        print('[INFO] The list of products have been shrinked to only one product')
    return recommendations