"""Build sentence embeddings and filter option lists for the anime records.

Run as a script, this reads every ``.json`` record in ``./anime``, encodes a
generated description and each review text with the sentence-transformer
model, and writes the records plus the collected filter options to
``./embeddings/data.json``.
"""

import json
import os

from sentence_transformers import SentenceTransformer

# Loaded at import time so `model` stays available as a module-level name.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Values that must never be offered as filter options.
_EXCLUDED_FILTER_VALUES = frozenset({
    'Boys Love',
    'Erotica',
    'Girls Love',
    'Hentai',
    'Ecchi',
    'Gore',
    'Crossdressing',
    'Magical Sex Shift',
    'Rx - Hentai',
    'R+ - Mild Nudity',
})


def update_filters(filters, data):
    """Add the filterable values of one record to the running filter sets.

    Args:
        filters: Mapping of filter name (e.g. ``'genres'``, ``'rating'``) to
            the set of values collected so far. The sets are updated in place.
        data: One record. For every filter name it may hold a single value
            (always the case for ``'rating'``) or an iterable of values.

    Returns:
        The same ``filters`` mapping, for call chaining.

    Missing or empty fields are skipped, and values listed in
    ``_EXCLUDED_FILTER_VALUES`` are never added.
    """
    for key, collected in filters.items():
        raw = data.get(key)
        if not raw:
            # Nothing to add; also avoids iterating over None.
            continue
        # 'rating' is a scalar, and a plain string must not be split into
        # characters; everything else is treated as an iterable of values.
        if key == 'rating' or isinstance(raw, str):
            candidates = [raw]
        else:
            candidates = raw
        for candidate in candidates:
            if candidate and candidate not in _EXCLUDED_FILTER_VALUES:
                collected.add(candidate)
    return filters


def clean_filters(filters):
    """Convert each filter collection to a list that includes ``'ALL'``.

    Args:
        filters: Mapping of filter name to an iterable of collected values.
            Each value is replaced in place by a list.

    Returns:
        The same ``filters`` mapping, now JSON-serialisable.
    """
    for key in filters:
        # Copy into a fresh set so lists (e.g. from a previous call) work too.
        options = set(filters[key])
        options.add('ALL')
        filters[key] = list(options)
    return filters


def _objective_text(data):
    """Return the factual description of one record that gets embedded."""
    # NOTE(review): `source` is joined like the list-valued fields; if the
    # records store it as a single string it will come out space-separated
    # per character. Confirm against the data before changing.
    sentences = [
        f"This anime has {data['episodes']} Episodes",
        f"This anime premiered on {data['premiered']}",
        f"This anime was broadcasted on: {data['broadcast']}",
        f"This anime was produced by {' '.join(data['producers'])}",
        f"This anime was licensed by Licensors: {' '.join(data['licensors'])}",
        f"The studios in charge of this anime was {' '.join(data['studios'])}",
        f"The source of this anime was {' '.join(data['source'])}",
        f"The genres of this anime are {' '.join(data['genres'])}",
        f"The themes of this anime are {' '.join(data['themes'])}",
        f"The demographic of this anime is {data['demographic']}",
        f"The duration of this anime is {data['duration']}",
        f"The rating of this anime is {data['rating']}",
        f"The description of this anime is {data['description']}",
    ]
    return ' ' + ' | '.join(sentences)


def _embed_record(data):
    """Return a copy of ``data`` extended with its embedding vectors.

    ``objective_embedding`` is a one-element list holding the vector of the
    generated description; ``subjective_embeddings`` holds one vector per
    review text.
    """
    record = data.copy()
    record['objective_embedding'] = [
        model.encode(_objective_text(data)).tolist()
    ]
    record['subjective_embeddings'] = [
        model.encode(review['text']).tolist()
        for review in record.get('reviews') or []
    ]
    return record


def main():
    """Embed every record in ``./anime`` and write ``./embeddings/data.json``."""
    print('Embedding Started')

    filters = {
        'genres': set(),
        'themes': set(),
        'rating': set(),
    }
    embeddings = {}

    for filename in os.listdir('./anime'):
        # Strip only the trailing extension and ignore stray non-JSON entries.
        stem, extension = os.path.splitext(filename)
        if extension.lower() != '.json':
            continue

        with open(f"./anime/{filename}", 'r', encoding='utf-8') as file:
            data = json.load(file)
        if not data:
            continue

        update_filters(filters, data)
        data['image'] = f"./images/{stem}.jpg"
        embeddings[stem] = _embed_record(data)

    clean_filters(filters)

    # Create the output directory up front so the run cannot fail on a
    # missing folder after all the encoding work is done.
    os.makedirs('./embeddings', exist_ok=True)
    with open('./embeddings/data.json', 'w', encoding='utf-8') as f:
        json.dump({'embeddings': embeddings, 'filters': filters}, f)

    print('Embedding Complete')


if __name__ == '__main__':
    main()