# File: /home/deltalab/PMS/recommendations/recommender-system-batch/components/semiRandom_RS.py

from collections import defaultdict

import numpy as np
import pandas as pd
from tabulate import tabulate

from _library.toolkit import jaccard_similarity


class semiRandom_RS:
    """Semi-random recommender built around a reference product.

    Candidates are filtered (same source platform, same vendor, related
    category subtree), weighted by title dissimilarity and category
    closeness, and finally sampled with a fixed random state so the
    output is deterministic for a given catalogue.

    Parameters
    ----------
    products : pd.DataFrame
        Catalogue; columns used here: 'SKU', 'Title', 'Vendor',
        'Product Type', 'Seller', 'Frequency', 'inTrentino_source'.
    categories : pd.DataFrame
        Category tree with columns 'categoryName' and 'parent'.
    filter_source_platform : bool
        If True, restrict candidates to the reference item's platform.
    """

    def __init__(self, products, categories, filter_source_platform):
        self.products = products.copy()
        self.categories = categories

        self.filter_source_platform = filter_source_platform
        self.rs_codeName = 'randomProducts'

        # Debug-logging toggle; all diagnostic prints are gated on this.
        self.verbose = False

    def getParentCategories(self, products, category_name, minItems = 8):
        """Walk the category tree upwards from `category_name` and return the
        smallest ordered set of related category names whose attached leaf
        products cover at least `minItems` items.

        When no ancestor category matches any product at all, fall back to
        ranking every available product type by its Jaccard similarity with
        `category_name`.

        Returns
        -------
        np.ndarray of category-name strings, closest ancestry first.
        """
        allParentCategories = defaultdict(list)

        # Visit the parents until the root category. Comparisons are
        # case-insensitive throughout (BUGFIX: the original compared the
        # upper-cased column against the raw parent name, so the walk
        # stopped at the first non-uppercase parent). The `visited` set
        # guards against cycles in a malformed category tree.
        visited = set()
        category_to_visit = category_name.upper()
        while category_to_visit is not None and category_to_visit not in visited:
            visited.add(category_to_visit)
            cond = self.categories['categoryName'].str.upper() == category_to_visit
            parent_category = self.categories.loc[cond, 'parent']

            if len(parent_category) == 0:
                # Reached the root (or an unknown category name).
                category_to_visit = None
            else:
                parent_category = parent_category.values[0]
                if self.verbose:
                    print(category_to_visit, "-->", parent_category)

                # Collect the parent's direct children (its leaves).
                cond = self.categories['parent'].str.upper() == parent_category.upper()
                parent_leaves = self.categories.loc[cond, 'categoryName'].values

                allParentCategories[parent_category].extend(parent_leaves)

                category_to_visit = parent_category.upper()

        if self.verbose:
            print("\nCATEGORY:", category_name, f"(TARGET: {minItems} items)")
            print(f"--> ALL PARENTS ({len(allParentCategories)}):",
                  ' | '.join(allParentCategories.keys()))

        # Keep only the ancestor categories that actually have products.
        # Keys are the ancestry level (0 = closest parent) so the ordering
        # survives the sorted() round-trip below.
        allConnectedCategories = defaultdict(dict)

        def getItems(df, type_name):
            # Case-insensitive match on the product type.
            return df.loc[df['Product Type'].str.upper() == type_name.upper(), :]

        for level, (parent_type, parent_leaves) in enumerate(allParentCategories.items()):

            parent_items = getItems(products, parent_type)

            if len(parent_items) > 0:
                # The parent itself matches products. The stored count is 0 on
                # purpose: parents never contribute to the minItems budget.
                allConnectedCategories[level][parent_type] = 0
                if self.verbose:
                    print(f"{parent_type} [root]: {len(parent_items)} items")
            else:
                # Parent has no direct products: fall back to its leaves.
                for leaf_type in parent_leaves:
                    leaf_items = getItems(products, leaf_type)

                    if len(leaf_items) > 0:
                        allConnectedCategories[level][leaf_type] = len(leaf_items)
                        if self.verbose:
                            # BUGFIX: was len(leaf_type), the name's length.
                            print(f"{leaf_type} [leaf]: {len(leaf_items)} items")

        # Sort by ancestry level, closest first.
        allConnectedCategories = dict(sorted(allConnectedCategories.items(),
                                             key = lambda item: item[0]))

        if self.verbose:
            print("\nallConnectedCategories:", allConnectedCategories)
            filteredParents = [type_name for parent_level in allConnectedCategories.values()
                               for type_name in parent_level.keys()]
            print(f"--> FILTERED PARENTS ({len(filteredParents)}):",
                  list(allConnectedCategories.values()))

        if not allConnectedCategories:

            # No ancestor category matches any product: rank every available
            # product type by its Jaccard similarity with the reference type.
            allTypes = list(products['Product Type'].str.upper().unique())

            rankedTypes = {type_name: jaccard_similarity(type_name, category_name)
                           for type_name in allTypes}

            # Highest similarity first; name (desc) breaks ties so the
            # ordering is deterministic.
            rankedTypes = dict(sorted(rankedTypes.items(),
                                      key = lambda dictItem: (dictItem[1], dictItem[0]),
                                      reverse = True))

            if self.verbose:
                print("--> all ranked categories", rankedTypes)

            return np.array(list(rankedTypes.keys()))

        # Accumulate category names level by level until enough leaf products
        # are attached (parents contribute 0 items, see above).
        connectedCategories = []
        attachedItems_counter = 0
        for level_categories in allConnectedCategories.values():

            category_names = list(level_categories.keys())
            connectedCategories.extend(category_names)

            attachedItems_counter += sum(level_categories.values())

            if attachedItems_counter >= minItems:
                break

        if self.verbose:
            print(f"--> MIN FILTERED PARENTS ({len(connectedCategories)}):",
                  ' | '.join(connectedCategories))

        return np.array(connectedCategories)

    def generate_filtered_productsDF(self, product_sku, previous_recommendedSKUs, num_randomItems):
        """Build the candidate pool for `product_sku`.

        Excludes the reference product and already-recommended SKUs,
        optionally restricts to the same source platform, then prefers
        same-vendor products, topping up from related categories when the
        vendor alone cannot supply `num_randomItems`.

        Returns
        -------
        pd.DataFrame with the extra columns 'sameVendor',
        'reference_dissimilarity' and (on category-based paths)
        'type_weights'.
        """
        products_df = self.products

        # Reference product attributes.
        item_info = products_df.loc[products_df['SKU'] == product_sku].iloc[0]

        item_vendor = item_info['Vendor']
        item_platformSource = item_info['inTrentino_source']
        item_type = item_info['Product Type']
        product_name = item_info['Title']

        if self.verbose:
            print("\n\nREFERENCE")
            print(item_info)
            print(f"\n[{item_type}, {item_vendor}] {product_sku}")
            print(f"--> Already recommended ({len(previous_recommendedSKUs)}):", previous_recommendedSKUs)

        # [Filter products] Exclude the reference product
        item_idx = products_df.loc[products_df['SKU'] == product_sku, :].index
        products_df = products_df.drop(index = item_idx)

        # [Filter products] Remove the items already considered
        products_df = products_df.drop(index = products_df[products_df['SKU'].isin(previous_recommendedSKUs)].index)

        # [Filter products] Filter the products according to the source platform
        if self.filter_source_platform:
            products_df = products_df[products_df['inTrentino_source'] == item_platformSource].reset_index(drop = True)

            if self.verbose:
                print("--> (A) Filtering the source platform")
                print(products_df)

        def normalize(item, max_val, min_val):
            # Min-max normalization; the epsilon keeps every weight strictly
            # positive (required by DataFrame.sample) and avoids /0.
            return (item - min_val + 0.001) / (max_val - min_val + 0.001)

        def add_type_weights(df, ranked_categories):
            # Closest category gets the highest raw weight, then min-max
            # normalize. Bounds are hoisted out of the per-row lambda.
            df['type_weights'] = df['Product Type'].apply(
                lambda type_name: len(ranked_categories) - np.argwhere(ranked_categories == type_name).item())
            w_max = df['type_weights'].dropna().max()
            w_min = df['type_weights'].dropna().min()
            df['type_weights'] = df['type_weights'].apply(
                lambda weight: normalize(weight, w_max, w_min))
            return df

        # [TRY] Select, if possible, the products of the original product vendor
        # --------------------------------------------------------------------------
        filtered_products_df = products_df[products_df['Vendor'] == item_vendor].reset_index(drop = True)

        if len(filtered_products_df) > 0:

            # Number of items that should be filled
            num_missing_items = num_randomItems - len(filtered_products_df)

            if num_missing_items > 0:

                # BUGFIX: the original dropped rows by the RESET positional
                # index of the vendor slice, which removed arbitrary rows (or
                # raised KeyError when the labels were already gone). Exclude
                # the vendor's products by value instead, and take the extra
                # items from that pool so vendor rows cannot be duplicated.
                other_vendors_df = products_df[products_df['Vendor'] != item_vendor]

                # Get the parent categories
                connectedCategories = self.getParentCategories(
                    products = other_vendors_df,
                    category_name = item_type,
                    minItems = num_missing_items)
                extra_df = other_vendors_df[other_vendors_df['Product Type'].isin(connectedCategories)].copy()

                # Generate the weights
                extra_df = add_type_weights(extra_df, connectedCategories)

                if self.verbose:
                    print("LIST EXTRA PRODUCTS \n", extra_df)

                # Get the best-ranked extra items
                extra_df = extra_df.sort_values(by = ['type_weights', 'Frequency', 'Title'], ascending = False)
                extra_items = extra_df.iloc[:num_missing_items, :]

                # Merge the extra items with the vendor items
                filtered_products_df = pd.concat([filtered_products_df, extra_items]).reset_index(drop = True)

                if self.verbose:
                    print(f"--> (B.2) Filtering using some items from the same "\
                        f"vendor with extra items ({len(extra_items)}, {', '.join(connectedCategories)})")
                    print(filtered_products_df)

            else:
                if self.verbose:
                    print(f"--> (B.1) Filtering using {len(filtered_products_df)} items from the same vendor")
                    print(products_df)

            products_df = filtered_products_df
        # --------------------------------------------------------------------------
        else:

            # No same-vendor product: fall back to the related categories.
            connectedCategories = self.getParentCategories(
                products = products_df.copy(),
                category_name = item_type,
                minItems = num_randomItems)

            # .copy() avoids pandas' SettingWithCopyWarning on the column
            # assignments below (the boolean-mask result may be a view).
            products_df = products_df[products_df['Product Type'].isin(connectedCategories)].copy()

            # Generate the weights
            products_df = add_type_weights(products_df, connectedCategories)
            products_df = products_df.sort_values(by = 'type_weights', ascending = False)

            if self.verbose:
                print(f"--> (B.3) Filtering using the connected categories ({', '.join(connectedCategories)})")
                print(products_df)

        # Whether each candidate shares the reference product's vendor
        products_df['sameVendor'] = products_df['Vendor'] == item_vendor

        # Title dissimilarity w.r.t. the reference product (1 - Jaccard)
        dissimilarity_func = lambda item_name: 1 - jaccard_similarity(item_name.upper(), product_name.upper())
        products_df['reference_dissimilarity'] = products_df['Title'].apply(dissimilarity_func)

        if self.verbose:
            print(f"\nFiltered products ({len(products_df)}):")
            print(tabulate(products_df, headers = products_df.columns, tablefmt = 'pretty'))
            print("-" * 40)

        return products_df

    def retrieveProducts_randomly(self, products_df, num_randomItems):
        """Sample up to `num_randomItems` rows from `products_df`, weighted
        by title dissimilarity (averaged with 'type_weights' when present);
        products of other vendors are penalized. Deterministic thanks to the
        fixed random_state.
        """
        if len(products_df) > 1:

            # Rescale the dissimilarity into a strictly positive range.
            min_dissimilarity = products_df['reference_dissimilarity'].min()
            max_dissimilarity = products_df['reference_dissimilarity'].max()

            normalize = lambda item: (item - min_dissimilarity + 0.001) / (max_dissimilarity - min_dissimilarity + 0.001)

            products_df['reference_dissimilarity'] = products_df['reference_dissimilarity'].apply(normalize)
            if 'type_weights' in products_df.columns:

                # Average dissimilarity and category weight, ignoring NaNs.
                products_df['weight'] = products_df.apply(
                    func = lambda df_row: np.nanmean([
                        df_row['reference_dissimilarity'],
                        df_row['type_weights']]),
                    axis = 1)
            else:
                products_df['weight'] = products_df['reference_dissimilarity']

            if products_df['weight'].isnull().values.all():
                # Degenerate case: fall back to a decreasing positional weight.
                products_df['weight'] = [1 / (position + 1) for position in range(len(products_df))]
            else:
                # Penalize products of other vendors.
                products_df.loc[~products_df['sameVendor'], 'weight'] /= 2

            # Never ask for more rows than the dataframe holds.
            n_to_sample = min(num_randomItems, len(products_df))

            random_items = products_df.sample(
                n = n_to_sample,
                weights = products_df['weight'].values,
                random_state = 101).reset_index(drop = True)

            # Same-vendor items first, then by weight / popularity / title.
            random_items = random_items.sort_values(
                by = ['sameVendor', 'weight', 'Frequency', 'Title'],
                ascending = False).reset_index(drop = True)
        else:
            random_items = products_df

        if self.verbose:
            print(f"\nRandom products ({len(random_items)}):")
            print(tabulate(random_items, headers = random_items.columns, tablefmt = 'pretty'))
            print("-" * 40)

        return random_items

    def generate_connectedProducts(self, product_sku, previous_recommendedSKUs, num_randomItems):
        """Generate up to `num_randomItems` recommendation records for
        `product_sku`, skipping `previous_recommendedSKUs`.

        Returns
        -------
        list[dict] with keys: item_vendor, item_sku, product_name,
        item_type, seller, inTrentino_source, rank, item_name,
        explaination, rs_source.
        """
        if self.verbose:
            print("\n" + "-" * 60)
            print("-" * 15, f"PRODUCT: {product_sku}", "-" * 15)
            print("-" * 60)

        explaination_string = "Ti potrebbe interessare"

        # Filter the products df according to some criteria
        filtered_products_df = self.generate_filtered_productsDF(product_sku, previous_recommendedSKUs, num_randomItems)

        # Retrieve some products randomly
        random_items = self.retrieveProducts_randomly(filtered_products_df, num_randomItems)
        random_items = random_items[['Vendor', 'SKU', 'Title', 'Product Type', 'Seller', 'inTrentino_source']].copy()

        # Add the necessary attributes.
        # BUGFIX: ranks are positional; the original used the DataFrame index,
        # which does not start at 0 on the single-candidate path.
        random_items['rank'] = [len(previous_recommendedSKUs) + position + 1
                                for position in range(len(random_items))]
        random_items['item_name'] = random_items['Title']
        # NOTE: 'explaination' (sic) is kept as-is — downstream consumers read this key.
        random_items['explaination'] = explaination_string
        random_items['rs_source'] = self.rs_codeName

        # Rename the columns as usual
        random_items = random_items.rename(
            columns = {
                'SKU': 'item_sku',
                'Title': 'product_name',
                'Product Type': 'item_type',
                'Vendor': 'item_vendor',
                'Seller': 'seller'
                }
            )

        # Turn the products into a list of records
        return random_items.to_dict(orient = 'records')