# Current Path : /home/deltalab/PMS/recommendations/recommender-system-batch/_library/
# Current File : /home/deltalab/PMS/recommendations/recommender-system-batch/_library/toolkit.py
import numpy as np
import pandas as pd
# sys.path.append(join(abspath(__file__), '..', '..'))
# ---------------------------
# STRUCTURE: {
# 'vendor': '',
# 'item_name': '',
# 'item_type': '',
# 'type_id': ,
# 'Frequency': ,
# 'inTrentino_source': }
# ----------------------------
def extract_referenceProduct(product, product_identifier):
    """Turn a catalogue row (a pd.Series) into a normalized reference-item dict.

    Parameters
    ----------
    product : pd.Series with at least a 'Title' entry; typically also
        'Product Type' and the chosen identifier column.
    product_identifier : name of the index entry holding the item's name
        (e.g. 'SKU'); it is renamed to 'item_name' in the output.

    Returns
    -------
    dict with snake_cased, lower-cased keys ('inTrentino_source' is kept
    as-is), e.g. {'product_name': ..., 'item_type': ..., 'item_name': ...}.
    """
    # Work on a copy so the caller's Series is not mutated as a side effect.
    product = product.copy()
    # Normalize the title
    product['Title'] = product['Title'].capitalize()
    if product_identifier in product.index:
        product['item_name'] = product[product_identifier]
        product.drop(index = product_identifier, inplace = True)
    else:
        # Best-effort: keep going, but flag the unexpected identifier.
        print(f"What am I supposed to do? --> product_identifier: {product_identifier}")
    # Rename the columns to the target schema keys
    product = product.rename(index = {'Product Type': 'item_type', 'Title': 'product_name'})
    product.index = [col.replace(' ', '_') for col in product.index]
    # Lower-case every key except the camel-cased source flag
    product.index = [col.lower() if col != 'inTrentino_source' else col
                     for col in product.index]
    # Turn the item into a dictionary
    return product.to_dict()
def buildUp_explaination(linked_collection_types, recommendation, evidence_coPurchases = True):
    """Assemble the Italian explanation string for a recommendation.

    Parameters
    ----------
    linked_collection_types : dict mapping a collection-type key to its
        explanation fragment (e.g. 'Linked regions' -> ' e provengono da ...').
    recommendation : dict of recommendation fields; the collection values may
        be a list of names, a plain string, or NaN/None (skipped).
    evidence_coPurchases : when True, prepend the co-purchase evidence clause.

    Returns
    -------
    str : the assembled explanation.
    """
    # Base string
    explaination_string = 'Prodotti che'
    if evidence_coPurchases:
        explaination_string += ' sono tipicamente acquistati insieme'
    for collection_type, type_explaination in linked_collection_types.items():
        # NOTE(review): matches recommendation keys that are substrings of the
        # collection-type key — presumably to tolerate key variants; confirm.
        matching_keys = [key for key in recommendation.keys() if key in collection_type]
        if not matching_keys:
            continue
        linked_collections = recommendation[matching_keys[0]]
        # BUGFIX: pd.isnull(list) returns an element-wise array whose truth
        # value is ambiguous — only null-check scalar values.
        if not isinstance(linked_collections, list) and pd.isnull(linked_collections):
            continue
        # Generate a string that includes the collection references
        if len(linked_collections) > 0:
            explaination_string += type_explaination
            if isinstance(linked_collections, list):
                collection_names = ', '.join([collection.upper() for collection in linked_collections])
            else:
                collection_names = linked_collections
            explaination_string += ' ' + collection_names
    return explaination_string
def add_recommendationSource(recommendations, source_name):
    """Tag each recommendation dict with the system that produced it.

    Mutates the dicts in place (adds an 'rs_source' key) and returns the
    same list object for chaining.
    """
    if not recommendations:
        return recommendations
    for entry in recommendations:
        entry['rs_source'] = source_name
    return recommendations
def jaccard_similarity(stringA, stringB, verbose = False):
    """Return the Jaccard similarity of the word sets of two strings.

    Words are obtained by splitting on single spaces; duplicates collapse
    because intersect1d/union1d operate on unique values. The result is
    |A ∩ B| / |A ∪ B| in [0, 1].
    """
    tokens_a = np.array(stringA.split(' '))
    tokens_b = np.array(stringB.split(' '))
    # Shared and combined unique-word sets
    common = np.intersect1d(tokens_a, tokens_b)
    combined = np.union1d(tokens_a, tokens_b)
    # Ratio of set sizes (union is never empty: split(' ') yields at least [''])
    similarity = len(common) / len(combined)
    if verbose:
        print("\nString A:", tokens_a)
        print("String B:", tokens_b)
        print("Intersection:", common)
        print("Union", combined)
        print("Jaccard similarity", similarity)
    return similarity
def extract_baskets_of_orders(orders_df, product_identifier):
    """Count the unique baskets (sets of products per transaction) in an order log.

    Parameters
    ----------
    orders_df : DataFrame with one row per ordered item, containing at least
        the `product_identifier` column and 'Transaction id'.
    product_identifier : column whose values identify a product (upper-cased
        before grouping).

    Returns
    -------
    DataFrame with columns ['Items', 'Frequency', 'Basket dimension'],
    sorted by Frequency then basket size, both descending.
    """
    # Pre-processing: drop rows without a product and keep only the two
    # relevant columns; .copy() avoids chained-assignment warnings below.
    products_df = orders_df.dropna(subset = [product_identifier])
    products_df = products_df[[product_identifier, 'Transaction id']].copy()
    products_df[product_identifier] = products_df[product_identifier].str.upper()
    # Group the single ordered items into one set per transaction
    grouped_products_df = products_df.groupby(by = 'Transaction id', as_index = False).agg(lambda x: set(x))
    # BUGFIX: sort before joining — set iteration order varies between runs
    # (hash randomization), so unsorted joins are not reproducible.
    basket_keys = grouped_products_df[product_identifier].map(lambda items: '|'.join(sorted(items)))
    # Count identical baskets; rename_axis/reset_index(name=...) is stable
    # across pandas versions (the old 'index'-column rename is not).
    baskets_df = basket_keys.value_counts().rename_axis('Items').reset_index(name = 'Frequency')
    baskets_df['Items'] = baskets_df['Items'].str.split('|')
    baskets_df['Basket dimension'] = baskets_df['Items'].apply(len)
    baskets_df.sort_values(by = ['Frequency', 'Basket dimension'], inplace = True, ascending = False)
    return baskets_df.reset_index(drop = True)
def _build_fake_product(columns, *, sku, title, product_type, type_id,
                        in_trentino, frequency):
    """Assemble one fake-product dict restricted to the columns present in `columns`."""
    candidate = {
        'Vendor': 'Mars S.p.A.',
        'SKU': sku,
        'Title': title,
        'Product Type': product_type,
        'Type id': type_id,
        'inTrentino_source': in_trentino,
        'Frequency': frequency,
    }
    # Keep only the fields the catalogue actually has (insertion order preserved)
    return {key: value for key, value in candidate.items() if key in columns}
def generate_fake_products(products_df):
    """Append four synthetic products to a product catalogue (for testing demos).

    Two 'Martian' products get a random Frequency drawn from the observed
    range; two 'terrestrial' ones simulate brand-new products (Frequency 0,
    Type id 0, inTrentino_source True). Only columns present in
    `products_df` are populated.

    Returns a new DataFrame; the input is not modified.
    """
    columns = products_df.columns
    fake_products = []
    # Generate fake products with a plausible purchase history
    for idk, product_name in enumerate(['Pomodori marziani', 'Insalata marziana']):
        if 'Frequency' in columns:
            # Random frequency inside the catalogue's observed range
            frequency = np.random.default_rng().uniform(
                low = products_df['Frequency'].min(),
                high = products_df['Frequency'].max())
        else:
            frequency = None
        fake_products.append(_build_fake_product(
            columns, sku = f'Mrsverd-0{idk +1}', title = product_name,
            product_type = 'Frutta e verdura', type_id = 430,
            in_trentino = False, frequency = frequency))
    # Simulate new products: never purchased yet
    for idk, product_name in enumerate(['Pomodori terrestri', 'Insalata terrestri']):
        fake_products.append(_build_fake_product(
            columns, sku = f'Mrsearth-0{idk +1}', title = product_name,
            product_type = 'Prodotti terrestri', type_id = 0,
            in_trentino = True, frequency = 0))
    # Append the new products to the original set
    fake_products_df = pd.DataFrame([pd.Series(fake_product) for fake_product in fake_products])
    return pd.concat([products_df, fake_products_df], axis = 0, ignore_index = True)
def preProcessing_collectionColumns(orders_df, cols_to_split = ['Linked regions', 'Linked experiences', 'Linked recipes'],
delimiter = '|'):
actual_colsToSplit = [col for col in cols_to_split if col in list(orders_df.columns)]
for col_name in actual_colsToSplit:
# Split the strings into items
orders_df[col_name] = orders_df[col_name].str.split(delimiter)
# Fill NaN values with empty list
orders_df[col_name] = orders_df[col_name].apply(
lambda item: list() if isinstance(item, float) else item)
# Remove potential whitespaces
orders_df[col_name] = orders_df[col_name].apply(
lambda list: [item.strip() for item in list])
# Normalize item names
orders_df[col_name] = orders_df[col_name].apply(
lambda items: [item.capitalize() for item in items])
return orders_df