miscjose's picture
Uploading local repo
ac6138f
raw
history blame
2.37 kB
import os
import json
from sentence_transformers import SentenceTransformer
# Sentence-embedding model used by the script below for every encode() call.
# Constructed at module level, so it is instantiated on import as well as when
# the file is run directly.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
def update_filters(filters, data):
    """Collect the filter values found in one record into ``filters``.

    For every key already present in ``filters`` the matching field of
    ``data`` is read and its non-empty values are added to that key's set.

    Args:
        filters: Dict mapping a field name (e.g. ``'genres'``, ``'rating'``)
            to the set of values seen so far. Mutated in place.
        data: One record. ``'rating'`` is stored as a single value; other
            fields are expected to be iterables of values, although a plain
            string is accepted and treated as one value.

    Returns:
        The same ``filters`` dict, for call chaining.
    """
    for key, seen in filters.items():
        raw = data.get(key)
        # Missing, None or empty fields contribute nothing. The original fell
        # through to iterating the value here, which raised TypeError for a
        # record whose rating was None.
        if not raw:
            continue
        if key == 'rating' or isinstance(raw, str):
            # Scalars are added whole; iterating a string would add its
            # individual characters instead of the value itself.
            seen.add(raw)
        else:
            seen.update(item for item in raw if item)
    return filters
def clean_filters(filters):
    """Finalize the collected filter options so they can be written as JSON.

    Adds the catch-all ``'ALL'`` option to every filter and replaces each
    collection with a list, since ``json.dump`` cannot encode sets.

    Args:
        filters: Dict mapping a filter name to an iterable (normally a set)
            of hashable option values. Updated in place.

    Returns:
        The same ``filters`` dict; every value is now a list ordered by the
        string form of its items.
    """
    for key in filters:
        # Rebuilding the set de-duplicates and also accepts an already-cleaned
        # list, so a second call is harmless (lists have no .add()).
        options = set(filters[key])
        options.add('ALL')
        # Set iteration order for strings changes between interpreter runs
        # (hash randomization); sorting keeps the output reproducible, and
        # key=str tolerates mixed value types.
        filters[key] = sorted(options, key=str)
    return filters
if __name__ == '__main__':
    # Reads the per-title JSON records in ./anime, embeds each record's
    # metadata and reviews with `model`, and writes the records, their
    # embeddings and the collected filter options to ./embeddings/data.json.
    filters = {
        'genres': set(),
        'themes': set(),
        'rating': set(),
    }
    embeddings = {}

    # NOTE(review): the original counter broke out of the loop on the 100th
    # non-empty record *before* processing it, so at most 99 records are
    # embedded. That cap is preserved here; confirm whether it is a leftover
    # debugging limit.
    max_records = 99
    processed = 0

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # capped subset and the output order reproducible.
    for filename in sorted(os.listdir('./anime')):
        # Only the extension is stripped (str.replace removed every '.json'
        # occurrence), and non-JSON entries are skipped instead of being
        # handed to json.load.
        name, extension = os.path.splitext(filename)
        if extension != '.json':
            continue

        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(f"./anime/{filename}", 'r', encoding='utf-8') as file:
            data = json.load(file)
        if not data:
            continue

        if processed == max_records:
            break
        processed += 1

        filters = update_filters(filters, data)
        data['image'] = f"./images/{name}.jpg"

        # NOTE(review): ' '.join(...) assumes these fields are lists of
        # strings; a plain string would be joined character by character.
        text = f'''Episodes: {data['episodes']}
Premiered: {data['premiered']}
Broadcast: {data['broadcast']}
Producers: {' '.join(data['producers'])}
Licensors: {' '.join(data['licensors'])}
Studios: {' '.join(data['studios'])}
Source: {' '.join(data['source'])}
Genres: {' '.join(data['genres'])}
Themes: {' '.join(data['themes'])}
Demographic: {data['demographic']}
Duration: {data['duration']}
Rating: {data['rating']}
Description: {data['description']}'''

        record = data.copy()
        # Stored as a one-element list, mirroring the list-of-vectors layout
        # used for the review embeddings below.
        record['objective_embedding'] = [model.encode(text).tolist()]
        # One vector per review, in review order. (The original also wrote the
        # last review text back into `data`, which was never read again.)
        record['subjective_embeddings'] = [
            model.encode(review['text']).tolist()
            for review in record['reviews']
        ]
        embeddings[name] = record

    filters = clean_filters(filters)

    # Create the output directory on a fresh checkout instead of failing.
    os.makedirs('./embeddings', exist_ok=True)
    with open('./embeddings/data.json', 'w', encoding='utf-8') as f:
        json.dump({'embeddings':embeddings, 'filters': filters}, f)
    print('Embedding Complete')