|
import os |
|
import json |
|
|
|
from sentence_transformers import SentenceTransformer |
|
|
|
# Sentence-embedding model shared by the whole script. It is instantiated once
# at module load and used further down to encode both the metadata summary and
# each review text.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
|
|
|
# Labels that are never added to any filter set. Kept as a frozenset so it is
# built once and membership checks are O(1).
_EXCLUDED_FILTER_VALUES = frozenset({
    'Boys Love', 'Erotica', 'Girls Love', 'Hentai', 'Ecchi', 'Gore',
    'Crossdressing', 'Magical Sex Shift', 'Rx - Hentai', 'R+ - Mild Nudity',
})


def update_filters(filters, data):
    """Add the filterable values of one record to the running filter sets.

    Args:
        filters: Mapping of field name to the set of values collected so far.
            The sets are updated in place.
        data: Mapping describing a single record. The ``'rating'`` field is
            read as one scalar value; every other field named in ``filters``
            is read as an iterable of values.

    Returns:
        The same ``filters`` mapping that was passed in.
    """
    for key, collected in filters.items():
        current = data.get(key)
        if not current:
            # Missing, None or empty field: nothing to collect. Without this
            # guard a None value would be iterated and raise TypeError.
            continue

        # 'rating' holds a single value, the other fields hold several.
        candidates = (current,) if key == 'rating' else current

        collected.update(
            item
            for item in candidates
            if item and item not in _EXCLUDED_FILTER_VALUES
        )
    return filters
|
|
|
def clean_filters(filters):
    """Finalize the filter sets so they can be serialized.

    Every set in ``filters`` gets the catch-all entry ``'ALL'`` added to it
    (the set itself is mutated) and is then replaced in the mapping by a list
    with the same members. The mapping is modified in place and returned.
    """
    for field in filters:
        options = filters[field]
        options.add('ALL')
        # Only the value of an existing key is rebound, so the mapping can be
        # updated safely while it is being iterated.
        filters[field] = [*options]
    return filters
|
|
|
if __name__ == '__main__':
    # Script entry point: read every record under ./anime, collect the filter
    # values, encode the metadata summary and the reviews with `model`, and
    # write everything to ./embeddings/data.json.
    print('Embedding Started')

    # Create the output directory up front so a missing folder fails (or is
    # fixed) before the encoding work is done, not after it.
    os.makedirs('./embeddings', exist_ok=True)

    filters = {
        'genres': set(),
        'themes': set(),
        'rating': set()
    }

    embeddings = {}
    # os.listdir() returns entries in arbitrary order; sorting keeps the key
    # order of the output stable between runs.
    for filename in sorted(os.listdir('./anime')):
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(f"./anime/{filename}", 'r', encoding='utf-8') as file:
            data = json.load(file)

        if not data:
            continue

        filters = update_filters(filters, data)

        # Strip only a trailing '.json'; str.replace would also remove the
        # substring from the middle of a name.
        if filename.endswith('.json'):
            name = filename[:-len('.json')]
        else:
            name = filename

        data['image'] = f"./images/{name}.jpg"

        # NOTE(review): this text is what gets embedded, so its exact wording
        # and whitespace affect the resulting vectors. Keep it unchanged unless
        # all embeddings are regenerated.
        text = f'''
This anime has {data['episodes']} Episodes |
This anime premiered on {data['premiered']} |
This anime was broadcasted on: {data['broadcast']} |
This anime was produced by {' '.join(data['producers'])} |
This anime was licensed by Licensors: {' '.join(data['licensors'])} |
The studios in charge of this anime was {' '.join(data['studios'])} |
The source of this anime was {' '.join(data['source'])} |
The genres of this anime are {' '.join(data['genres'])} |
The themes of this anime are {' '.join(data['themes'])} |
The demographic of this anime is {data['demographic']} |
The duration of this anime is {data['duration']} |
The rating of this anime is {data['rating']} |
The description of this anime is {data['description']}'''

        # Shallow copy: the stored record carries every original field
        # (including 'image' and 'reviews') plus the two embedding fields.
        record = data.copy()
        embeddings[name] = record

        # The metadata summary yields a single vector, stored as a
        # one-element list.
        record['objective_embedding'] = [model.encode(text).tolist()]

        # One vector per review, in the order the reviews appear.
        record['subjective_embeddings'] = [
            model.encode(review['text']).tolist()
            for review in record['reviews']
        ]

    filters = clean_filters(filters)

    with open('./embeddings/data.json', 'w', encoding='utf-8') as f:
        json.dump({'embeddings': embeddings, 'filters': filters}, f)

    print('Embedding Complete')
|
|