File size: 2,365 Bytes
ac6138f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
import os
import json
from sentence_transformers import SentenceTransformer
# Sentence-embedding model, instantiated once at module load and reused for
# every `model.encode(...)` call made by the script section of this file.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
def update_filters(filters, data):
    """Fold one record's filterable values into the running filter sets.

    Args:
        filters: Mapping of field name -> set of values seen so far. The sets
            are updated in place.
        data: One record (a dict). For every key in ``filters`` the record's
            value is merged into the matching set.

    Returns:
        The same ``filters`` mapping that was passed in.

    Behaviour per field:
        * A missing, ``None`` or otherwise empty value is skipped.
        * ``'rating'`` is treated as a single scalar value.
        * A plain string is treated as a single value (never split into
          characters).
        * Anything else is iterated and every truthy item is added.
    """
    for key, seen in filters.items():
        raw = data.get(key)
        if not raw:
            # Nothing usable for this field (absent, None, '' or empty list).
            # Previously a None rating fell through to the iteration branch
            # and raised TypeError.
            continue
        if key == 'rating' or isinstance(raw, str):
            seen.add(raw)
        else:
            seen.update(item for item in raw if item)
    return filters
def clean_filters(filters):
    """Convert every filter collection into a JSON-serialisable list.

    Each value in ``filters`` is replaced by a list that starts with the
    catch-all option ``'ALL'`` followed by the remaining values in sorted
    order, so the result is identical from run to run (plain ``list(set)``
    ordering varies with string hash randomisation).

    The function is idempotent: calling it again on its own output yields the
    same lists, and the collections originally passed in are not mutated.

    Args:
        filters: Mapping of field name -> iterable of collected values.

    Returns:
        The same ``filters`` mapping, with its values replaced by lists.
    """
    for key in list(filters):
        options = set(filters[key])
        options.discard('ALL')
        # key=str keeps the sort well-defined even if value types are mixed.
        filters[key] = ['ALL'] + sorted(options, key=str)
    return filters
def build_objective_text(data):
    """Render a record's factual metadata as the text that gets embedded.

    Args:
        data: One record (a dict). The keys read are 'episodes', 'premiered',
            'broadcast', 'producers', 'licensors', 'studios', 'source',
            'genres', 'themes', 'demographic', 'duration', 'rating' and
            'description'. A missing key raises ``KeyError``.

    Returns:
        A newline-separated ``"Label: value"`` string, one line per field.

    NOTE(review): 'producers', 'licensors', 'studios', 'source', 'genres' and
    'themes' are space-joined, so they are assumed to be lists of strings --
    confirm against the data files.
    """
    lines = [
        f"Episodes: {data['episodes']}",
        f"Premiered: {data['premiered']}",
        f"Broadcast: {data['broadcast']}",
        f"Producers: {' '.join(data['producers'])}",
        f"Licensors: {' '.join(data['licensors'])}",
        f"Studios: {' '.join(data['studios'])}",
        f"Source: {' '.join(data['source'])}",
        f"Genres: {' '.join(data['genres'])}",
        f"Themes: {' '.join(data['themes'])}",
        f"Demographic: {data['demographic']}",
        f"Duration: {data['duration']}",
        f"Rating: {data['rating']}",
        f"Description: {data['description']}",
    ]
    return '\n'.join(lines)


if __name__ == '__main__':
    # Upper bound on how many records are embedded in one run. The previous
    # counter was incremented and tested *before* processing, so it stopped
    # after 99 records instead of 100.
    max_entries = 100

    filters = {
        'genres': set(),
        'themes': set(),
        'rating': set(),
    }
    embeddings = {}

    # Sorted so that the capped subset is the same on every run; the order
    # returned by os.listdir is arbitrary.
    for filename in sorted(os.listdir('./anime')):
        if len(embeddings) >= max_entries:
            break

        # Only the trailing extension is stripped, and anything that is not a
        # .json file (stray or hidden files) is ignored instead of being fed
        # to json.load.
        stem, ext = os.path.splitext(filename)
        if ext != '.json':
            continue

        with open(f"./anime/{filename}", 'r', encoding='utf-8') as file:
            data = json.load(file)
        if not data:
            continue

        filters = update_filters(filters, data)
        data['image'] = f"./images/{stem}.jpg"

        entry = data.copy()
        # Single objective vector, kept wrapped in a list as before.
        entry['objective_embedding'] = [
            model.encode(build_objective_text(data)).tolist()
        ]
        # One vector per review text.
        entry['subjective_embeddings'] = [
            model.encode(review['text']).tolist()
            for review in entry['reviews']
        ]
        embeddings[stem] = entry

    filters = clean_filters(filters)

    # Create the output directory up front so the run cannot fail on the
    # final write after all the embedding work has been done.
    os.makedirs('./embeddings', exist_ok=True)
    with open('./embeddings/data.json', 'w', encoding='utf-8') as f:
        json.dump({'embeddings': embeddings, 'filters': filters}, f)
    print('Embedding Complete')
|