# Qatar_Rec_Ours / app.py
import os
from typing import Dict, Text

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scann
import tensorflow as tf
import tensorflow_recommenders as tfrs
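# Pipeline: load the vacancy data, train a TFRS two-tower retrieval model
# (requirement tokens -> vacancy code), index the candidate embeddings with
# ScaNN, and serve matches through a Gradio interface.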
# Load the translated vacancy dataset (job title + requirements text).
df = pd.read_csv("/home/user/app/Qatar_translated_best_2500.csv", sep=",", header=0)
df = df.drop_duplicates()
df = df.dropna()
df["nome_vaga"] = df["nome_vaga"].map(lambda x: x.lower().title())  # normalize job titles
df["requisito"] = df["requisito"].map(lambda x: x[0:1000].lower())  # cap requirements at 1000 chars

# First 90% of rows for training/evaluation, last 10% held out as a "blind" set.
my_dict = dict(df.iloc[0:int(df.shape[0] * 0.9), :])
my_dict_cego = dict(df.iloc[int(df.shape[0] * 0.9):, :])
# Build tf.data datasets; each requirements text is tokenized on whitespace.
ratings = tf.data.Dataset.from_tensor_slices(my_dict).map(lambda x: {
    "code": x["code"],
    "nome_vaga": x["nome_vaga"],
    "requisito": tf.strings.split(x["requisito"], maxsplit=106)
})
movies = tf.data.Dataset.from_tensor_slices(dict(df)).map(lambda x: {
    "code": x["code"],
    "nome_vaga": x["nome_vaga"]
})
# Candidate corpus: the vacancy codes.
movies = movies.map(lambda x: x["code"])
ratings_cego = tf.data.Dataset.from_tensor_slices(my_dict_cego).map(lambda x: {
    "code": x["code"],
    "requisito": tf.strings.split(x["requisito"], maxsplit=106)
})
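# Note: maxsplit=106 keeps at most 107 whitespace tokens per requirements text,
# on top of the 1000-character cap applied above.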
tf.random.set_seed(42)
shuffled = ratings.shuffle(int(df.shape[0] * 0.9), seed=42, reshuffle_each_iteration=False)
shuffled2 = ratings_cego.shuffle(int(df.shape[0] * 0.1), seed=42, reshuffle_each_iteration=False)
# Disjoint train/test split: train on 80% of rows, evaluate on the next 10%.
train = shuffled.take(int(df.shape[0] * 0.8))
test = shuffled.skip(int(df.shape[0] * 0.8)).take(int(df.shape[0] * 0.1))
cego = shuffled2
movie_titles = movies  # already mapped to vacancy codes above
user_ids = ratings.map(lambda x: x["requisito"])

# Collect every requirement token to build the query-tower vocabulary.
xx = list(user_ids.as_numpy_iterator())
unique_movie_titles = np.unique(list(movie_titles.as_numpy_iterator()))
unique_user_ids = np.unique(np.concatenate(xx))
user_ids = user_ids.batch(int(df.shape[0] * 0.9))
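# The StringLookup layers below map raw tokens to integer ids; with
# mask_token=None, index 0 is reserved for out-of-vocabulary tokens, e.g.
# (hypothetical token)
#   tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None)(
#       tf.constant("python"))  # -> id > 0 if in vocabulary, else 0 (OOV)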
embedding_dimension = 768

# Query tower: embed each requirement token. The extra embedding row accounts
# for unknown tokens.
user_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=unique_user_ids, mask_token=None),
    tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension),
])

# Candidate tower: embed each vacancy code.
movie_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=unique_movie_titles, mask_token=None),
    tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension),
])
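# The two towers share embedding_dimension so query and candidate vectors live
# in the same space and can be scored by dot product.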
# Retrieval task with factorized top-K accuracy over the full candidate set.
metrics = tfrs.metrics.FactorizedTopK(
    candidates=movies.batch(df.shape[0]).map(movie_model)
)
task = tfrs.tasks.Retrieval(metrics=metrics)
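# tfrs.tasks.Retrieval defaults to an in-batch softmax loss: for each posting,
# the other vacancy codes in the batch act as negatives, while FactorizedTopK
# tracks how often the true code ranks among the top K candidates.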
class MovielensModel(tfrs.Model):
    """Two-tower retrieval model mapping requirement tokens to vacancy codes."""

    def __init__(self, user_model, movie_model):
        super().__init__()
        self.movie_model: tf.keras.Model = movie_model
        self.user_model: tf.keras.Model = user_model
        self.task: tf.keras.layers.Layer = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        user_embeddings = self.user_model(features["requisito"])
        positive_movie_embeddings = self.movie_model(features["code"])
        # Sum the per-token embeddings into one query vector per posting.
        return self.task(tf.reduce_sum(user_embeddings, axis=1), positive_movie_embeddings)
class NoBaseClassMovielensModel(tf.keras.Model):
    """The same model against plain tf.keras.Model; kept for reference, unused below."""

    def __init__(self, user_model, movie_model):
        super().__init__()
        self.movie_model: tf.keras.Model = movie_model
        self.user_model: tf.keras.Model = user_model
        self.task: tf.keras.layers.Layer = task

    def train_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
        with tf.GradientTape() as tape:
            user_embeddings = self.user_model(features["requisito"])
            positive_movie_embeddings = self.movie_model(features["code"])
            # Sum token embeddings so shapes match the task, as in compute_loss above.
            loss = self.task(tf.reduce_sum(user_embeddings, axis=1), positive_movie_embeddings)
            regularization_loss = sum(self.losses)
            total_loss = loss + regularization_loss

        gradients = tape.gradient(total_loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        metrics = {metric.name: metric.result() for metric in self.metrics}
        metrics["loss"] = loss
        metrics["regularization_loss"] = regularization_loss
        metrics["total_loss"] = total_loss
        return metrics

    def test_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
        user_embeddings = self.user_model(features["requisito"])
        positive_movie_embeddings = self.movie_model(features["code"])
        loss = self.task(tf.reduce_sum(user_embeddings, axis=1), positive_movie_embeddings)
        regularization_loss = sum(self.losses)
        total_loss = loss + regularization_loss

        metrics = {metric.name: metric.result() for metric in self.metrics}
        metrics["loss"] = loss
        metrics["regularization_loss"] = regularization_loss
        metrics["total_loss"] = total_loss
        return metrics
model = MovielensModel(user_model, movie_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.08))

cached_train = train.shuffle(int(df.shape[0] * 0.9)).batch(int(df.shape[0] * 0.9)).cache()
cached_test = test.batch(int(df.shape[0] * 0.15)).cache()

# Checkpoint weights every 2 batches.
path = os.path.join("/home/user/app/", "model/")
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=path,
    verbose=1,
    save_weights_only=True,
    save_freq=2)

model.fit(cached_train, callbacks=[cp_callback], epochs=110)
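# Optional sanity check (a minimal sketch, not part of the app flow):
#   model.evaluate(cached_test, return_dict=True)
# reports the factorized top-K accuracies on the held-out split.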
# Embed every vacancy code and build a ScaNN approximate-nearest-neighbour index.
index = df["code"].map(lambda x: [model.movie_model(tf.constant(x))])
indice = []
for i in range(df.shape[0]):
    indice.append(index.iloc[i][0])

searcher = scann.scann_ops_pybind.builder(np.array(indice), 10, "dot_product").tree(
    num_leaves=1500, num_leaves_to_search=500, training_sample_size=df.shape[0]).score_brute_force(
    quantize=True).build()
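# Minimal sketch of querying the index directly (hypothetical skill tokens):
#   q = np.sum([model.user_model(tf.constant(t)) for t in "python sql".split()], axis=0)
#   neighbors, distances = searcher.search_batched([q])
#   df.iloc[neighbors[0], :].nome_vaga  # the ten closest vacancy titles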
def predict(text):
    """Embed the candidate's competence tokens, query ScaNN, and plot the matches."""
    campos = str(text).lower()
    # Sum the token embeddings into a single query vector.
    query = np.sum([model.user_model(tf.constant(tok)) for tok in campos.split()], axis=0)
    neighbors, distances = searcher.search_batched([query])
    xx = df.iloc[neighbors[0], :].nome_vaga

    fig = plt.figure(figsize=(14, 9))
    plt.bar(list(xx), distances[0] * 0.8 * 10)  # scaled scores for display
    plt.title('Degree of match')
    plt.xlabel('Labels')
    plt.xticks(rotation=270)
    plt.ylabel('Distances')
    for x, y in zip(range(0, 10), distances[0] * 0.8 * 10):
        plt.text(x, y, y, ha='center', va='bottom', fontsize=12, color='black')
    return xx, fig
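# Example (hypothetical input): titles, fig = predict("accounting audit reporting")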
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(label='CANDIDATE COMPETENCES - Click *Clear* before adding new input'),
    outputs=[gr.Textbox(label='SUGGESTED VACANCIES'),
             gr.Plot()],
    css='div {margin-left: auto; margin-right: auto; width: 100%; '
        'background-image: url("https://drive.google.com/uc?export=view&id=1KNnISAUcvh2Pt08f-EJZJYCIgkrKw3PI"); repeat 0 0;}'
).launch(share=False)