File size: 2,365 Bytes
ac6138f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import os
import json

from sentence_transformers import SentenceTransformer

# Sentence-embedding model, created once at module level and used by the
# script section below (via ``model.encode``) to embed text.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def update_filters(filters, data):
    """Collect the filter values found in one record into ``filters``.

    Args:
        filters: Mapping of field name to the set of values seen so far.
            The sets are updated in place.
        data: Record (dict) to read from. The ``'rating'`` field is treated
            as a single value; every other field is treated as an iterable
            of values.

    Returns:
        The same ``filters`` mapping that was passed in.
    """
    for key, seen in filters.items():
        value = data.get(key)
        # A missing, ``None`` or empty field contributes nothing. Without
        # this guard a ``None`` rating fell through to the iteration branch
        # and raised ``TypeError``, and a missing key raised ``KeyError``.
        if not value:
            continue
        if key == 'rating':
            seen.add(value)
        else:
            # Drop empty entries (``''``/``None``) inside the list.
            seen.update(item for item in value if item)
    return filters

def clean_filters(filters):
    """Add the ``'ALL'`` option to every filter and turn each set into a list.

    Each set in ``filters`` is mutated in place (``'ALL'`` is added to it)
    and then replaced in the mapping by a list of its members. The same
    mapping object is returned.
    """
    for field in filters:
        options = filters[field]
        options.add('ALL')
        # Only existing keys are reassigned, so iterating the dict is safe.
        filters[field] = [*options]
    return filters

if __name__ == '__main__':
    # Reads the JSON records in ./anime, embeds each record's metadata text
    # and review texts with `model`, and writes the embeddings together with
    # the collected filter values to ./embeddings/data.json.
    filters = {
        'genres': set(),
        'themes': set(),
        'rating': set(),
    }

    embeddings = {}
    processed = 0
    # NOTE(review): os.listdir order is arbitrary, so which records fall
    # under the cap below may differ between machines.
    for name in os.listdir('./anime'):
        # os.listdir returns every directory entry; anything that is not a
        # .json file cannot be parsed below, so skip it instead of crashing.
        if not name.endswith('.json'):
            continue

        # Be explicit about the encoding rather than relying on the
        # platform-dependent default of open().
        with open(f"./anime/{name}", 'r', encoding='utf-8') as file:
            data = json.load(file)

        if not data:
            continue

        # The loop stops when the 100th non-empty record is reached, so at
        # most 99 records are embedded.
        processed += 1
        if processed == 100:
            break

        filters = update_filters(filters, data)

        name = name.replace('.json', '')

        data['image'] = f"./images/{name}.jpg"

        text = f'''Episodes: {data['episodes']} 
                Premiered: {data['premiered']} 
                Broadcast: {data['broadcast']} 
                Producers: {' '.join(data['producers'])} 
                Licensors: {' '.join(data['licensors'])} 
                Studios: {' '.join(data['studios'])} 
                Source: {' '.join(data['source'])}  
                Genres: {' '.join(data['genres'])} 
                Themes: {' '.join(data['themes'])} 
                Demographic: {data['demographic']} 
                Duration: {data['duration']} 
                Rating: {data['rating']} 
                Description: {data['description']}'''

        # Shallow copy so the stored entry is a separate dict from `data`.
        entry = data.copy()
        # One embedding for the metadata text, wrapped in a list.
        entry['objective_embedding'] = [model.encode(text).tolist()]
        # One embedding per review text.
        entry['subjective_embeddings'] = [
            model.encode(review['text']).tolist()
            for review in entry['reviews']
        ]
        embeddings[name] = entry

    filters = clean_filters(filters)

    # Make sure the output directory exists before writing into it.
    os.makedirs('./embeddings', exist_ok=True)
    with open('./embeddings/data.json', 'w', encoding='utf-8') as f:
        json.dump({'embeddings':embeddings, 'filters': filters}, f)

    print('Embedding Complete')