Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
devStorm
committed on
Commit
•
25e32a2
1
Parent(s):
d123e86
feat: 🎨 color metadata
Browse files
src/benchmarks/get_semistruct.py
CHANGED
@@ -8,7 +8,6 @@ def get_semistructured_data(name, root='data/', download_processed=True, **kwarg
|
|
8 |
categories = ['Sports_and_Outdoors']
|
9 |
kb = AmazonSemiStruct(root=data_root,
|
10 |
categories=categories,
|
11 |
-
meta_link_types=['brand', 'category'],
|
12 |
download_processed=download_processed,
|
13 |
**kwargs
|
14 |
)
|
|
|
8 |
categories = ['Sports_and_Outdoors']
|
9 |
kb = AmazonSemiStruct(root=data_root,
|
10 |
categories=categories,
|
|
|
11 |
download_processed=download_processed,
|
12 |
**kwargs
|
13 |
)
|
src/benchmarks/semistruct/amazon.py
CHANGED
@@ -6,6 +6,7 @@ import json
|
|
6 |
import torch
|
7 |
import pandas as pd
|
8 |
import numpy as np
|
|
|
9 |
from tqdm import tqdm
|
10 |
from huggingface_hub import hf_hub_download
|
11 |
import zipfile
|
@@ -51,19 +52,20 @@ class AmazonSemiStruct(SemiStructureKB):
|
|
51 |
sub_category = 'data/amazon/stats/category_list.json'
|
52 |
SUB_CATEGORIES = set(json.load(open(sub_category, 'r')))
|
53 |
link_columns = ['also_buy', 'also_view']
|
54 |
-
review_columns = ['reviewerID', 'summary', 'reviewText', 'vote', 'overall', 'verified', 'reviewTime']
|
55 |
qa_columns = ['questionType', 'answerType', 'question', 'answer', 'answerTime']
|
56 |
meta_columns = ['asin', 'title', 'global_category', 'category', 'price', 'brand', 'feature',
|
57 |
'rank', 'details', 'description']
|
58 |
candidate_types = ['product']
|
59 |
node_attr_dict = {'product': ['title', 'dimensions', 'weight', 'description', 'features', 'reviews', 'Q&A'],
|
60 |
'brand': ['brand_name'],
|
61 |
-
'category': ['category_name']
|
|
|
62 |
|
63 |
def __init__(self,
|
64 |
root,
|
65 |
categories: list,
|
66 |
-
meta_link_types=['category'],
|
67 |
max_entries=25,
|
68 |
download_processed=True,
|
69 |
**kwargs):
|
@@ -117,10 +119,6 @@ class AmazonSemiStruct(SemiStructureKB):
|
|
117 |
def __getitem__(self, idx):
|
118 |
idx = int(idx)
|
119 |
node_info = self.node_info[idx]
|
120 |
-
# try:
|
121 |
-
# dimensions, weight = node.details.dictionary.product_dimensions.split(' ; ')
|
122 |
-
# node_info['dimensions'], node_info['weight'] = dimensions, weight
|
123 |
-
# except: pass
|
124 |
node = Node()
|
125 |
register_node(node, node_info)
|
126 |
return node
|
@@ -173,6 +171,8 @@ class AmazonSemiStruct(SemiStructureKB):
|
|
173 |
return f'brand name: {self[idx].brand_name}'
|
174 |
if self.node_type_dict[int(self.node_types[idx])] == 'category':
|
175 |
return f'category name: {self[idx].category_name}'
|
|
|
|
|
176 |
|
177 |
node = self[idx]
|
178 |
doc = f'- product: {node.title}\n'
|
@@ -370,9 +370,9 @@ class AmazonSemiStruct(SemiStructureKB):
|
|
370 |
n_e_types, n_n_types = len(edge_type_dict), len(node_type_dict)
|
371 |
for i, link_type in enumerate(meta_link_types):
|
372 |
if link_type == 'brand':
|
373 |
-
values = np.array([
|
374 |
indices = np.array([idx for idx, node_info_i in enumerate(node_info.values()) if link_type in node_info_i.keys()])
|
375 |
-
elif link_type
|
376 |
value_list = []
|
377 |
indice_list = []
|
378 |
for idx, node_info_i in enumerate(node_info.values()):
|
@@ -381,9 +381,6 @@ class AmazonSemiStruct(SemiStructureKB):
|
|
381 |
indice_list.extend([idx for _ in range(len(node_info_i[link_type]))])
|
382 |
values = np.array(value_list)
|
383 |
indices = np.array(indice_list)
|
384 |
-
print(f'{link_type=}, {len(values)=}, {len(indices)=}')
|
385 |
-
# print(values[:50])
|
386 |
-
print(indices[:50])
|
387 |
else:
|
388 |
raise Exception(f'Invalid meta link type {link_type}')
|
389 |
|
@@ -391,13 +388,15 @@ class AmazonSemiStruct(SemiStructureKB):
|
|
391 |
node_type_dict[n_n_types + i] = link_type
|
392 |
edge_type_dict[n_e_types + i] = "has_" + link_type
|
393 |
unique = np.unique(values)
|
394 |
-
for j, unique_j in enumerate(unique):
|
395 |
node_info[cur_n_nodes + j] = {link_type + '_name': unique_j}
|
396 |
ids = indices[np.array(values == unique_j)]
|
397 |
edge_index[0].extend(list(ids))
|
398 |
edge_index[1].extend([cur_n_nodes + j for _ in range(len(ids))])
|
399 |
edge_types.extend([i + n_e_types for _ in range(len(ids))])
|
400 |
node_types.extend([n_n_types + i for _ in range(len(unique))])
|
|
|
|
|
401 |
edge_index = torch.LongTensor(edge_index)
|
402 |
edge_types = torch.LongTensor(edge_types)
|
403 |
node_types = torch.LongTensor(node_types)
|
@@ -431,6 +430,72 @@ class AmazonSemiStruct(SemiStructureKB):
|
|
431 |
node_info[idx]['review'] = []
|
432 |
node_info[idx]['qa'] = []
|
433 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
434 |
for i in tqdm(range(len(df_meta))):
|
435 |
df_meta_i = df_meta.iloc[i]
|
436 |
asin = df_meta_i['asin']
|
@@ -450,7 +515,7 @@ class AmazonSemiStruct(SemiStructureKB):
|
|
450 |
node_info[idx]['category'] = category_list
|
451 |
else:
|
452 |
node_info[idx][column] = clean_data(df_meta_i[column])
|
453 |
-
|
454 |
for name, df in zip(['review', 'qa'], [df_review, df_qa]):
|
455 |
for i in tqdm(range(len(df))):
|
456 |
df_i = df.iloc[i]
|
@@ -459,6 +524,7 @@ class AmazonSemiStruct(SemiStructureKB):
|
|
459 |
node_info[idx][name].append(
|
460 |
df_row_to_dict(df_i, colunm_names=self.review_columns \
|
461 |
if name == 'review' else self.qa_columns))
|
|
|
462 |
return node_info
|
463 |
|
464 |
def create_raw_product_graph(self, df, columns):
|
|
|
6 |
import torch
|
7 |
import pandas as pd
|
8 |
import numpy as np
|
9 |
+
from collections import Counter
|
10 |
from tqdm import tqdm
|
11 |
from huggingface_hub import hf_hub_download
|
12 |
import zipfile
|
|
|
52 |
sub_category = 'data/amazon/stats/category_list.json'
|
53 |
SUB_CATEGORIES = set(json.load(open(sub_category, 'r')))
|
54 |
link_columns = ['also_buy', 'also_view']
|
55 |
+
review_columns = ['reviewerID', 'summary', 'style', 'reviewText', 'vote', 'overall', 'verified', 'reviewTime']
|
56 |
qa_columns = ['questionType', 'answerType', 'question', 'answer', 'answerTime']
|
57 |
meta_columns = ['asin', 'title', 'global_category', 'category', 'price', 'brand', 'feature',
|
58 |
'rank', 'details', 'description']
|
59 |
candidate_types = ['product']
|
60 |
node_attr_dict = {'product': ['title', 'dimensions', 'weight', 'description', 'features', 'reviews', 'Q&A'],
|
61 |
'brand': ['brand_name'],
|
62 |
+
'category': ['category_name'],
|
63 |
+
'color': ['color_name']}
|
64 |
|
65 |
def __init__(self,
|
66 |
root,
|
67 |
categories: list,
|
68 |
+
meta_link_types=['brand', 'category', 'color'],
|
69 |
max_entries=25,
|
70 |
download_processed=True,
|
71 |
**kwargs):
|
|
|
119 |
def __getitem__(self, idx):
|
120 |
idx = int(idx)
|
121 |
node_info = self.node_info[idx]
|
|
|
|
|
|
|
|
|
122 |
node = Node()
|
123 |
register_node(node, node_info)
|
124 |
return node
|
|
|
171 |
return f'brand name: {self[idx].brand_name}'
|
172 |
if self.node_type_dict[int(self.node_types[idx])] == 'category':
|
173 |
return f'category name: {self[idx].category_name}'
|
174 |
+
if self.node_type_dict[int(self.node_types[idx])] == 'color':
|
175 |
+
return f'color name: {self[idx].color_name}'
|
176 |
|
177 |
node = self[idx]
|
178 |
doc = f'- product: {node.title}\n'
|
|
|
370 |
n_e_types, n_n_types = len(edge_type_dict), len(node_type_dict)
|
371 |
for i, link_type in enumerate(meta_link_types):
|
372 |
if link_type == 'brand':
|
373 |
+
values = np.array([node_info_i[link_type] for node_info_i in node_info.values() if link_type in node_info_i.keys()])
|
374 |
indices = np.array([idx for idx, node_info_i in enumerate(node_info.values()) if link_type in node_info_i.keys()])
|
375 |
+
elif link_type in ['category', 'color']:
|
376 |
value_list = []
|
377 |
indice_list = []
|
378 |
for idx, node_info_i in enumerate(node_info.values()):
|
|
|
381 |
indice_list.extend([idx for _ in range(len(node_info_i[link_type]))])
|
382 |
values = np.array(value_list)
|
383 |
indices = np.array(indice_list)
|
|
|
|
|
|
|
384 |
else:
|
385 |
raise Exception(f'Invalid meta link type {link_type}')
|
386 |
|
|
|
388 |
node_type_dict[n_n_types + i] = link_type
|
389 |
edge_type_dict[n_e_types + i] = "has_" + link_type
|
390 |
unique = np.unique(values)
|
391 |
+
for j, unique_j in tqdm(enumerate(unique)):
|
392 |
node_info[cur_n_nodes + j] = {link_type + '_name': unique_j}
|
393 |
ids = indices[np.array(values == unique_j)]
|
394 |
edge_index[0].extend(list(ids))
|
395 |
edge_index[1].extend([cur_n_nodes + j for _ in range(len(ids))])
|
396 |
edge_types.extend([i + n_e_types for _ in range(len(ids))])
|
397 |
node_types.extend([n_n_types + i for _ in range(len(unique))])
|
398 |
+
print(f'finished adding {link_type}')
|
399 |
+
|
400 |
edge_index = torch.LongTensor(edge_index)
|
401 |
edge_types = torch.LongTensor(edge_types)
|
402 |
node_types = torch.LongTensor(node_types)
|
|
|
430 |
node_info[idx]['review'] = []
|
431 |
node_info[idx]['qa'] = []
|
432 |
|
433 |
+
###################### Assign color ########################
|
434 |
+
def assign_colors(df_review, lower_limit=20):
|
435 |
+
# asign to color
|
436 |
+
df_review = df_review[['asin', 'style']]
|
437 |
+
df_review = df_review.dropna(subset=['style'])
|
438 |
+
raw_color_dict = {}
|
439 |
+
for idx, row in tqdm(df_review.iterrows()):
|
440 |
+
asin, style = row['asin'], row['style']
|
441 |
+
for key in style.keys():
|
442 |
+
if 'color' in key.lower():
|
443 |
+
try:
|
444 |
+
raw_color_dict[asin]
|
445 |
+
except:
|
446 |
+
raw_color_dict[asin] = []
|
447 |
+
raw_color_dict[asin].append(
|
448 |
+
style[key].strip().lower() if isinstance(style[key], str) else style[key][0].strip())
|
449 |
+
|
450 |
+
all_color_values = []
|
451 |
+
for asin in raw_color_dict.keys():
|
452 |
+
raw_color_dict[asin] = list(set(raw_color_dict[asin]))
|
453 |
+
all_color_values.extend(raw_color_dict[asin])
|
454 |
+
|
455 |
+
print('number of all colors', len(all_color_values))
|
456 |
+
color_counter = Counter(all_color_values)
|
457 |
+
print('number of unique colors', len(color_counter))
|
458 |
+
color_counter = {k: v for k, v in sorted(color_counter.items(), key=lambda item: item[1], reverse=True)}
|
459 |
+
selected_colors = []
|
460 |
+
for color, number in color_counter.items():
|
461 |
+
if number > lower_limit and len(color) > 2 and len(color.split(' ')) < 5 and color.isnumeric() is False:
|
462 |
+
selected_colors.append(color)
|
463 |
+
print('number of selected colors', len(selected_colors))
|
464 |
+
|
465 |
+
filtered_color_dict = {}
|
466 |
+
total_color_connections = 0
|
467 |
+
for asin in raw_color_dict.keys():
|
468 |
+
filtered_color_dict[asin] = []
|
469 |
+
for value in raw_color_dict[asin]:
|
470 |
+
if value in selected_colors:
|
471 |
+
filtered_color_dict[asin].append(value)
|
472 |
+
total_color_connections += len(filtered_color_dict[asin])
|
473 |
+
print('number of linked products', len(filtered_color_dict))
|
474 |
+
print('number of total connections', total_color_connections)
|
475 |
+
return filtered_color_dict
|
476 |
+
|
477 |
+
filtered_color_dict_path = os.path.join('data/amazon/intermediate',
|
478 |
+
'filtered_color_dict.pkl')
|
479 |
+
if os.path.exists(filtered_color_dict_path):
|
480 |
+
with open(filtered_color_dict_path, 'rb') as f:
|
481 |
+
filtered_color_dict = pickle.load(f)
|
482 |
+
else:
|
483 |
+
filtered_color_dict = assign_colors(df_review)
|
484 |
+
with open(filtered_color_dict_path, 'wb') as f:
|
485 |
+
pickle.dump(filtered_color_dict, f)
|
486 |
+
|
487 |
+
for i in tqdm(range(len(df_meta))):
|
488 |
+
df_meta_i = df_meta.iloc[i]
|
489 |
+
asin = df_meta_i['asin']
|
490 |
+
idx = self.asin2id[asin]
|
491 |
+
try:
|
492 |
+
color = filtered_color_dict[asin]
|
493 |
+
if len(color):
|
494 |
+
node_info[idx]['color'] = color
|
495 |
+
except: pass
|
496 |
+
print('loaded color')
|
497 |
+
####################################################################
|
498 |
+
|
499 |
for i in tqdm(range(len(df_meta))):
|
500 |
df_meta_i = df_meta.iloc[i]
|
501 |
asin = df_meta_i['asin']
|
|
|
515 |
node_info[idx]['category'] = category_list
|
516 |
else:
|
517 |
node_info[idx][column] = clean_data(df_meta_i[column])
|
518 |
+
|
519 |
for name, df in zip(['review', 'qa'], [df_review, df_qa]):
|
520 |
for i in tqdm(range(len(df))):
|
521 |
df_i = df.iloc[i]
|
|
|
524 |
node_info[idx][name].append(
|
525 |
df_row_to_dict(df_i, colunm_names=self.review_columns \
|
526 |
if name == 'review' else self.qa_columns))
|
527 |
+
import pdb; pdb.set_trace()
|
528 |
return node_info
|
529 |
|
530 |
def create_raw_product_graph(self, df, columns):
|