import pickle

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Test set to score.
test_df = pd.read_csv("/tmp/data/test.csv")

# Trained artifacts, stored as a dict-like object in model.pkl.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# model.pkl must come from a trusted source.
with open("model.pkl", "rb") as model_file:
    model = pickle.load(model_file)
# Score each test row with the best cosine similarity between its query and
# the FAQ entries named as candidates for that row, then threshold the scores
# into 0/1 predictions.
FAQ_COLUMNS = ("FAQ1", "FAQ2", "FAQ3")

# Loop-invariant lookups, hoisted out of the per-row loop.
tokenizer = model["tokenizer"]
faq_ids = np.asarray(model["faq_ids"])
X_faq = model["X_faq"]

scores = []
rows = test_df[["Query", *FAQ_COLUMNS]].itertuples(index=False, name=None)
for query, *cand_ids in rows:
    # Boolean mask over the FAQ entries: True where the entry's id is one of
    # this row's candidate ids.
    is_cand = np.zeros(faq_ids.shape, dtype=bool)
    for cand_id in cand_ids:
        is_cand |= faq_ids == cand_id

    if not is_cand.any():
        # None of the row's candidates is a known FAQ id. Taking the max over
        # an empty similarity row would raise ValueError, so give the row the
        # lowest possible score instead; it can never exceed the threshold.
        scores.append(-np.inf)
        continue

    X_query = tokenizer.transform([query])
    sim = cosine_similarity(X_query, X_faq[is_cand])[0]
    scores.append(sim.max())

# 1 where the best similarity is strictly above the stored threshold, else 0.
predict = (np.array(scores) > model["thr"]).astype(int)
# Write the submission file: one row per prediction, with ids of the form
# "testid0000", "testid0001", ... in prediction order.
df = pd.DataFrame(
    {
        "id": [f"testid{i:04}" for i in range(len(predict))],
        "pred": predict,
    }
)
# index=False is the documented way to omit the row index from the CSV.
df.to_csv("submission.csv", index=False)