Update README.md
Browse files
README.md
CHANGED
@@ -45,47 +45,6 @@ and then applied fine-tuning with the mMARCO multilingual IR methodology on the
|
|
45 |
|
46 |
Although this model performs well and is currently state-of-the-art, it is fine-tuned on an mMARCO model and a translated dataset (created using the IndicTrans2 model). Hence, the limitations of both apply here too.
|
47 |
|
48 |
-
### Recommendations
|
49 |
-
|
50 |
-
|
51 |
-
## How to Get Started with the Model
|
52 |
-
|
53 |
-
Example Code for Scoring Query-Document Pairs:
|
54 |
-
In an IR setting, you provide a query and one or more candidate documents. The model scores each document for relevance to the query, which can be used for ranking.
|
55 |
-
```
|
56 |
-
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
57 |
-
import torch
|
58 |
-
|
59 |
-
# Load the tokenizer and model
|
60 |
-
tokenizer = AutoTokenizer.from_pretrained("Mavkif/urdu-mt5-mmarco")
|
61 |
-
model = AutoModelForSeq2SeqLM.from_pretrained("Mavkif/urdu-mt5-mmarco")
|
62 |
-
|
63 |
-
# Define the query and candidate documents
|
64 |
-
query = "پاکستان کی معیشت کی موجودہ صورتحال کیا ہے؟"
|
65 |
-
document_1 = "پاکستان کی معیشت میں حالیہ ترقی کے بارے میں معلومات۔"
|
66 |
-
document_2 = "فٹبال پاکستان میں تیزی سے مقبول ہو رہا ہے۔"
|
67 |
-
|
68 |
-
# Tokenize query-document pairs and calculate relevance scores
|
69 |
-
def get_score(query, document):
|
70 |
-
input_text = f"Query: {query} Document: {document}"
|
71 |
-
inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
|
72 |
-
|
73 |
-
# Pass through the model and get the relevance score (logits)
|
74 |
-
outputs = model(**inputs)
|
75 |
-
score = outputs.logits[0, -1, :] # last token logits
|
76 |
-
return torch.softmax(score, dim=0)[tokenizer.eos_token_id].item()
|
77 |
-
|
78 |
-
# Get scores for each document
|
79 |
-
score_1 = get_score(query, document_1)
|
80 |
-
score_2 = get_score(query, document_2)
|
81 |
-
|
82 |
-
print(f"Relevance Score for Document 1: {score_1}")
|
83 |
-
print(f"Relevance Score for Document 2: {score_2}")
|
84 |
-
|
85 |
-
# Higher score indicates higher relevance
|
86 |
-
|
87 |
-
```
|
88 |
-
|
89 |
|
90 |
|
91 |
## Evaluation
|
@@ -130,6 +89,91 @@ MRR @10 : 0.247
|
|
130 |
For more details on how to customize the decoding parameters (such as max_length, num_beams, and early_stopping), refer to the Hugging Face documentation.
|
131 |
|
132 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
## Model Card Authors [optional]
|
134 |
|
135 |
Umer Butt
|
@@ -137,4 +181,4 @@ Umer Butt
|
|
137 |
|
138 |
## Model Card Contact
|
139 |
|
140 |
-
mumertbutt@gmail.com
|
|
|
45 |
|
46 |
Although this model performs well and is currently state-of-the-art, it is fine-tuned on an mMARCO model and a translated dataset (created using the IndicTrans2 model). Hence, the limitations of both apply here too.
|
47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
|
50 |
## Evaluation
|
|
|
89 |
For more details on how to customize the decoding parameters (such as max_length, num_beams, and early_stopping), refer to the Hugging Face documentation.
|
90 |
|
91 |
|
92 |
+
|
93 |
+
## How to Get Started with the Model
|
94 |
+
|
95 |
+
Example Code for Scoring Query-Document Pairs:
|
96 |
+
In an IR setting, you provide a query and one or more candidate documents. The model scores each document for relevance to the query, which can be used for ranking.
|
97 |
+
```
|
98 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
99 |
+
import torch
|
100 |
+
import torch.nn.functional as F
|
101 |
+
|
102 |
+
|
103 |
+
# Load the tokenizer and model
|
104 |
+
tokenizer = AutoTokenizer.from_pretrained("Mavkif/urdu-mt5-mmarco")
|
105 |
+
model = AutoModelForSeq2SeqLM.from_pretrained("Mavkif/urdu-mt5-mmarco")
|
106 |
+
|
107 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
108 |
+
model.to(device)
|
109 |
+
|
110 |
+
|
111 |
+
def rank_documents(query, documents):
|
112 |
+
# Create input pairs of query and documents
|
113 |
+
query_document_pairs = [f"{query} [SEP] {doc}" for doc in documents]
|
114 |
+
|
115 |
+
# Tokenize the input pairs
|
116 |
+
inputs = tokenizer(query_document_pairs, padding=True, truncation=True, return_tensors="pt", max_length=512)
|
117 |
+
inputs = {k: v.to(device) for k, v in inputs.items()}
|
118 |
+
|
119 |
+
# Generate decoder input ids (starting with the decoder start token)
|
120 |
+
decoder_input_ids = torch.full(
|
121 |
+
(inputs["input_ids"].shape[0], 1), model.config.decoder_start_token_id, dtype=torch.long, device=device
|
122 |
+
)
|
123 |
+
|
124 |
+
# Perform inference to get the logits
|
125 |
+
with torch.no_grad():
|
126 |
+
outputs = model(**inputs, decoder_input_ids=decoder_input_ids)
|
127 |
+
|
128 |
+
# Get the logits for the sequence output
|
129 |
+
logits = outputs.logits
|
130 |
+
|
131 |
+
# Extract the probabilities for the generated sequence
|
132 |
+
scores = []
|
133 |
+
for idx, doc in enumerate(documents):
|
134 |
+
# Calculate the softmax over the entire vocabulary for each token in the sequence
|
135 |
+
doc_logits = logits[idx]
|
136 |
+
doc_probs = F.softmax(doc_logits, dim=-1)
|
137 |
+
|
138 |
+
# Get the probability score for "ہاں" token in the output sequence
|
139 |
+
token_true_id = tokenizer.convert_tokens_to_ids("ہاں")
|
140 |
+
token_probs = doc_probs[:, token_true_id]
|
141 |
+
sum_prob = token_probs.sum().item() # Sum probability over the sequence
|
142 |
+
scores.append((doc, sum_prob)) # Use the summed probability directly as the score
|
143 |
+
|
144 |
+
# Normalize scores to be between 0 and 1
|
145 |
+
max_score = max(score for _, score in scores)
|
146 |
+
min_score = min(score for _, score in scores)
|
147 |
+
normalized_scores = [((score - min_score) / (max_score - min_score) if max_score > min_score else 0.5) for _, score in scores]
|
148 |
+
|
149 |
+
# Create a list of documents with normalized scores
|
150 |
+
ranked_documents = [(documents[idx], normalized_scores[idx]) for idx in range(len(documents))]
|
151 |
+
|
152 |
+
# Sort documents based on scores (descending order)
|
153 |
+
ranked_documents = sorted(ranked_documents, key=lambda x: x[1], reverse=True)
|
154 |
+
return ranked_documents
|
155 |
+
|
156 |
+
|
157 |
+
# Example query and documents
|
158 |
+
query = "پاکستان کی معیشت کی موجودہ صورتحال کیا ہے؟"
|
159 |
+
documents = [
|
160 |
+
"پاکستان کی معیشت میں بہتری کے اشارے ہیں۔",
|
161 |
+
"زر مبادلہ کے ذخائر میں کمی دیکھی گئی ہے۔",
|
162 |
+
"فٹبال پاکستان میں تیزی سے مقبول ہو رہا ہے۔"
|
163 |
+
]
|
164 |
+
|
165 |
+
# Get ranked documents
|
166 |
+
ranked_docs = rank_documents(query, documents)
|
167 |
+
|
168 |
+
# Print the ranked documents
|
169 |
+
for idx, (doc, score) in enumerate(ranked_docs):
|
170 |
+
print(f"Rank {idx + 1}: Score: {score}, Document: {doc}")
|
171 |
+
|
172 |
+
|
173 |
+
```
|
174 |
+
|
175 |
+
|
176 |
+
|
177 |
## Model Card Authors [optional]
|
178 |
|
179 |
Umer Butt
|
|
|
181 |
|
182 |
## Model Card Contact
|
183 |
|
184 |
+
mumertbutt@gmail.com
|