|
--- |
|
language: |
|
- en |
|
- ar |
|
- cs |
|
- de |
|
- es |
|
- fr |
|
- it |
|
- ja |
|
- ko |
|
- nl |
|
- pt |
|
- zh |
|
license: apache-2.0 |
|
library_name: transformers |
|
tags: |
|
- language |
|
- granite |
|
- embeddings |
|
- multilingual |
|
model-index: |
|
- name: ibm-granite/granite-embedding-278m-multilingual |
|
results: |
|
- dataset: |
|
type: miracl/mmteb-miracl |
|
name: Miracl (en) |
|
config: en |
|
split: dev |
|
task: |
|
type: Retrieval |
|
metrics: |
|
- type: ndcg_at_1 |
|
value: 0.45557 |
|
- type: ndcg_at_10 |
|
value: 0.49372 |
|
- type: ndcg_at_100 |
|
value: 0.5728 |
|
- type: ndcg_at_1000 |
|
value: 0.59187 |
|
- type: ndcg_at_20 |
|
value: 0.52863 |
|
- type: ndcg_at_3 |
|
value: 0.43969 |
|
- type: ndcg_at_5 |
|
value: 0.45551 |
|
- type: recall_at_1 |
|
value: 0.21785 |
|
- type: recall_at_10 |
|
value: 0.59513 |
|
- type: recall_at_100 |
|
value: 0.85785 |
|
- type: recall_at_1000 |
|
value: 0.96041 |
|
- type: recall_at_20 |
|
value: 0.69357 |
|
- type: recall_at_3 |
|
value: 0.40403 |
|
- type: recall_at_5 |
|
value: 0.48499 |
|
- dataset: |
|
type: miracl/mmteb-miracl |
|
name: Miracl (ar) |
|
config: ar |
|
split: dev |
|
task: |
|
type: Retrieval |
|
metrics: |
|
- type: ndcg_at_1 |
|
value: 0.57459 |
|
- type: ndcg_at_10 |
|
value: 0.64238 |
|
- type: ndcg_at_100 |
|
value: 0.6867 |
|
- type: ndcg_at_1000 |
|
value: 0.6951 |
|
- type: ndcg_at_20 |
|
value: 0.66455 |
|
- type: ndcg_at_3 |
|
value: 0.58162 |
|
- type: ndcg_at_5 |
|
value: 0.60831 |
|
- type: recall_at_1 |
|
value: 0.38064 |
|
- type: recall_at_10 |
|
value: 0.75098 |
|
- type: recall_at_100 |
|
value: 0.91203 |
|
- type: recall_at_1000 |
|
value: 0.96706 |
|
- type: recall_at_20 |
|
value: 0.81978 |
|
- type: recall_at_3 |
|
value: 0.58618 |
|
- type: recall_at_5 |
|
value: 0.66353 |
|
- dataset: |
|
type: miracl/mmteb-miracl |
|
name: Miracl (bn) |
|
config: bn |
|
split: dev |
|
task: |
|
type: Retrieval |
|
metrics: |
|
- type: ndcg_at_1 |
|
value: 0.60341 |
|
- type: ndcg_at_10 |
|
value: 0.68055 |
|
- type: ndcg_at_100 |
|
value: 0.72008 |
|
- type: ndcg_at_1000 |
|
value: 0.72716 |
|
- type: ndcg_at_20 |
|
value: 0.69914 |
|
- type: ndcg_at_3 |
|
value: 0.60805 |
|
- type: ndcg_at_5 |
|
value: 0.64486 |
|
- type: recall_at_1 |
|
value: 0.37948 |
|
- type: recall_at_10 |
|
value: 0.80609 |
|
- type: recall_at_100 |
|
value: 0.94305 |
|
- type: recall_at_1000 |
|
value: 0.98625 |
|
- type: recall_at_20 |
|
value: 0.86141 |
|
- type: recall_at_3 |
|
value: 0.61095 |
|
- type: recall_at_5 |
|
value: 0.71316 |
|
- dataset: |
|
type: miracl/mmteb-miracl |
|
name: Miracl (de) |
|
config: de |
|
split: dev |
|
task: |
|
type: Retrieval |
|
metrics: |
|
- type: ndcg_at_1 |
|
value: 0.45574 |
|
- type: ndcg_at_10 |
|
value: 0.48123 |
|
- type: ndcg_at_100 |
|
value: 0.56049 |
|
- type: ndcg_at_1000 |
|
value: 0.57979 |
|
- type: ndcg_at_20 |
|
value: 0.51785 |
|
- type: ndcg_at_3 |
|
value: 0.41243 |
|
- type: ndcg_at_5 |
|
value: 0.4386 |
|
- type: recall_at_1 |
|
value: 0.20401 |
|
- type: recall_at_10 |
|
value: 0.58779 |
|
- type: recall_at_100 |
|
value: 0.8584 |
|
- type: recall_at_1000 |
|
value: 0.97364 |
|
- type: recall_at_20 |
|
value: 0.69061 |
|
- type: recall_at_3 |
|
value: 0.36573 |
|
- type: recall_at_5 |
|
value: 0.47495 |
|
- dataset: |
|
type: miracl/mmteb-miracl |
|
name: Miracl (es) |
|
config: es |
|
split: dev |
|
task: |
|
type: Retrieval |
|
metrics: |
|
- type: ndcg_at_1 |
|
value: 0.5571 |
|
- type: ndcg_at_10 |
|
value: 0.49688 |
|
- type: ndcg_at_100 |
|
value: 0.60493 |
|
- type: ndcg_at_1000 |
|
value: 0.62922 |
|
- type: ndcg_at_20 |
|
value: 0.54438 |
|
- type: ndcg_at_3 |
|
value: 0.47981 |
|
- type: ndcg_at_5 |
|
value: 0.46584 |
|
- type: recall_at_1 |
|
value: 0.1638 |
|
- type: recall_at_10 |
|
value: 0.54155 |
|
- type: recall_at_100 |
|
value: 0.85136 |
|
- type: recall_at_1000 |
|
value: 0.96951 |
|
- type: recall_at_20 |
|
value: 0.65329 |
|
- type: recall_at_3 |
|
value: 0.31503 |
|
- type: recall_at_5 |
|
value: 0.40356 |
|
- dataset: |
|
type: miracl/mmteb-miracl |
|
name: Miracl (fa) |
|
config: fa |
|
split: dev |
|
task: |
|
type: Retrieval |
|
metrics: |
|
- type: ndcg_at_1 |
|
value: 0.39873 |
|
- type: ndcg_at_10 |
|
value: 0.50226 |
|
- type: ndcg_at_100 |
|
value: 0.56517 |
|
- type: ndcg_at_1000 |
|
value: 0.57967 |
|
- type: ndcg_at_20 |
|
value: 0.5292 |
|
- type: ndcg_at_3 |
|
value: 0.42738 |
|
- type: ndcg_at_5 |
|
value: 0.45843 |
|
- type: recall_at_1 |
|
value: 0.25369 |
|
- type: recall_at_10 |
|
value: 0.63776 |
|
- type: recall_at_100 |
|
value: 0.87686 |
|
- type: recall_at_1000 |
|
value: 0.9671 |
|
- type: recall_at_20 |
|
value: 0.72099 |
|
- type: recall_at_3 |
|
value: 0.43808 |
|
- type: recall_at_5 |
|
value: 0.52378 |
|
- dataset: |
|
type: miracl/mmteb-miracl |
|
name: Miracl (fi) |
|
config: fi |
|
split: dev |
|
task: |
|
type: Retrieval |
|
metrics: |
|
- type: ndcg_at_1 |
|
value: 0.60818 |
|
- type: ndcg_at_10 |
|
value: 0.6746 |
|
- type: ndcg_at_100 |
|
value: 0.71516 |
|
- type: ndcg_at_1000 |
|
value: 0.7218 |
|
- type: ndcg_at_20 |
|
value: 0.69692 |
|
- type: ndcg_at_3 |
|
value: 0.6006 |
|
- type: ndcg_at_5 |
|
value: 0.63842 |
|
- type: recall_at_1 |
|
value: 0.39264 |
|
- type: recall_at_10 |
|
value: 0.78577 |
|
- type: recall_at_100 |
|
value: 0.93291 |
|
- type: recall_at_1000 |
|
value: 0.97493 |
|
- type: recall_at_20 |
|
value: 0.85435 |
|
- type: recall_at_3 |
|
value: 0.61055 |
|
- type: recall_at_5 |
|
value: 0.69774 |
|
- dataset: |
|
type: miracl/mmteb-miracl |
|
name: Miracl (fr) |
|
config: fr |
|
split: dev |
|
task: |
|
type: Retrieval |
|
metrics: |
|
- type: ndcg_at_1 |
|
value: 0.3965 |
|
- type: ndcg_at_10 |
|
value: 0.49891 |
|
- type: ndcg_at_100 |
|
value: 0.56492 |
|
- type: ndcg_at_1000 |
|
value: 0.57837 |
|
- type: ndcg_at_20 |
|
value: 0.53163 |
|
- type: ndcg_at_3 |
|
value: 0.39843 |
|
- type: ndcg_at_5 |
|
value: 0.44416 |
|
- type: recall_at_1 |
|
value: 0.22644 |
|
- type: recall_at_10 |
|
value: 0.65169 |
|
- type: recall_at_100 |
|
value: 0.89786 |
|
- type: recall_at_1000 |
|
value: 0.98081 |
|
- type: recall_at_20 |
|
value: 0.75338 |
|
- type: recall_at_3 |
|
value: 0.39798 |
|
- type: recall_at_5 |
|
value: 0.51001 |
|
- dataset: |
|
type: miracl/mmteb-miracl |
|
name: Miracl (hi) |
|
config: hi |
|
split: dev |
|
task: |
|
type: Retrieval |
|
metrics: |
|
- type: ndcg_at_1 |
|
value: 0.36857 |
|
- type: ndcg_at_10 |
|
value: 0.46141 |
|
- type: ndcg_at_100 |
|
value: 0.52565 |
|
- type: ndcg_at_1000 |
|
value: 0.54319 |
|
- type: ndcg_at_20 |
|
value: 0.49384 |
|
- type: ndcg_at_3 |
|
value: 0.39469 |
|
- type: ndcg_at_5 |
|
value: 0.4184 |
|
- type: recall_at_1 |
|
value: 0.20185 |
|
- type: recall_at_10 |
|
value: 0.59474 |
|
- type: recall_at_100 |
|
value: 0.83385 |
|
- type: recall_at_1000 |
|
value: 0.94813 |
|
- type: recall_at_20 |
|
value: 0.69437 |
|
- type: recall_at_3 |
|
value: 0.38993 |
|
- type: recall_at_5 |
|
value: 0.47881 |
|
- dataset: |
|
type: miracl/mmteb-miracl |
|
name: Miracl (id) |
|
config: id |
|
split: dev |
|
task: |
|
type: Retrieval |
|
metrics: |
|
- type: ndcg_at_1 |
|
value: 0.46354 |
|
- type: ndcg_at_10 |
|
value: 0.47229 |
|
- type: ndcg_at_100 |
|
value: 0.5525 |
|
- type: ndcg_at_1000 |
|
value: 0.57648 |
|
- type: ndcg_at_20 |
|
value: 0.50606 |
|
- type: ndcg_at_3 |
|
value: 0.42538 |
|
- type: ndcg_at_5 |
|
value: 0.43717 |
|
- type: recall_at_1 |
|
value: 0.20787 |
|
- type: recall_at_10 |
|
value: 0.54771 |
|
- type: recall_at_100 |
|
value: 0.80689 |
|
- type: recall_at_1000 |
|
value: 0.94032 |
|
- type: recall_at_20 |
|
value: 0.63842 |
|
- type: recall_at_3 |
|
value: 0.36229 |
|
- type: recall_at_5 |
|
value: 0.44437 |
|
- dataset: |
|
type: miracl/mmteb-miracl |
|
name: Miracl (ja) |
|
config: ja |
|
split: dev |
|
task: |
|
type: Retrieval |
|
metrics: |
|
- type: ndcg_at_1 |
|
value: 0.56279 |
|
- type: ndcg_at_10 |
|
value: 0.6281 |
|
- type: ndcg_at_100 |
|
value: 0.67757 |
|
- type: ndcg_at_1000 |
|
value: 0.68667 |
|
- type: ndcg_at_20 |
|
value: 0.6521 |
|
- type: ndcg_at_3 |
|
value: 0.56226 |
|
- type: ndcg_at_5 |
|
value: 0.5866 |
|
- type: recall_at_1 |
|
value: 0.36648 |
|
- type: recall_at_10 |
|
value: 0.7496 |
|
- type: recall_at_100 |
|
value: 0.92461 |
|
- type: recall_at_1000 |
|
value: 0.97827 |
|
- type: recall_at_20 |
|
value: 0.82326 |
|
- type: recall_at_3 |
|
value: 0.55845 |
|
- type: recall_at_5 |
|
value: 0.63854 |
|
- dataset: |
|
type: miracl/mmteb-miracl |
|
name: Miracl (ko) |
|
config: ko |
|
split: dev |
|
task: |
|
type: Retrieval |
|
metrics: |
|
- type: ndcg_at_1 |
|
value: 0.52582 |
|
- type: ndcg_at_10 |
|
value: 0.59216 |
|
- type: ndcg_at_100 |
|
value: 0.65093 |
|
- type: ndcg_at_1000 |
|
value: 0.66204 |
|
- type: ndcg_at_20 |
|
value: 0.62427 |
|
- type: ndcg_at_3 |
|
value: 0.5373 |
|
- type: ndcg_at_5 |
|
value: 0.55886 |
|
- type: recall_at_1 |
|
value: 0.30521 |
|
- type: recall_at_10 |
|
value: 0.71159 |
|
- type: recall_at_100 |
|
value: 0.90203 |
|
- type: recall_at_1000 |
|
value: 0.96714 |
|
- type: recall_at_20 |
|
value: 0.80209 |
|
- type: recall_at_3 |
|
value: 0.515 |
|
- type: recall_at_5 |
|
value: 0.6071 |
|
- dataset: |
|
type: miracl/mmteb-miracl |
|
name: Miracl (ru) |
|
config: ru |
|
split: dev |
|
task: |
|
type: Retrieval |
|
metrics: |
|
- type: ndcg_at_1 |
|
value: 0.47524 |
|
- type: ndcg_at_10 |
|
value: 0.52349 |
|
- type: ndcg_at_100 |
|
value: 0.59725 |
|
- type: ndcg_at_1000 |
|
value: 0.61313 |
|
- type: ndcg_at_20 |
|
value: 0.55669 |
|
- type: ndcg_at_3 |
|
value: 0.46812 |
|
- type: ndcg_at_5 |
|
value: 0.48442 |
|
- type: recall_at_1 |
|
value: 0.24337 |
|
- type: recall_at_10 |
|
value: 0.62437 |
|
- type: recall_at_100 |
|
value: 0.86489 |
|
- type: recall_at_1000 |
|
value: 0.95266 |
|
- type: recall_at_20 |
|
value: 0.71411 |
|
- type: recall_at_3 |
|
value: 0.42927 |
|
- type: recall_at_5 |
|
value: 0.51258 |
|
- dataset: |
|
type: miracl/mmteb-miracl |
|
name: Miracl (sw) |
|
config: sw |
|
split: dev |
|
task: |
|
type: Retrieval |
|
metrics: |
|
- type: ndcg_at_1 |
|
value: 0.5166 |
|
- type: ndcg_at_10 |
|
value: 0.61271 |
|
- type: ndcg_at_100 |
|
value: 0.66099 |
|
- type: ndcg_at_1000 |
|
value: 0.66867 |
|
- type: ndcg_at_20 |
|
value: 0.63643 |
|
- type: ndcg_at_3 |
|
value: 0.54828 |
|
- type: ndcg_at_5 |
|
value: 0.57382 |
|
- type: recall_at_1 |
|
value: 0.35277 |
|
- type: recall_at_10 |
|
value: 0.74368 |
|
- type: recall_at_100 |
|
value: 0.92261 |
|
- type: recall_at_1000 |
|
value: 0.97109 |
|
- type: recall_at_20 |
|
value: 0.81888 |
|
- type: recall_at_3 |
|
value: 0.56739 |
|
- type: recall_at_5 |
|
value: 0.6421 |
|
- dataset: |
|
type: miracl/mmteb-miracl |
|
name: Miracl (te) |
|
config: te |
|
split: dev |
|
task: |
|
type: Retrieval |
|
metrics: |
|
- type: ndcg_at_1 |
|
value: 0.63768 |
|
- type: ndcg_at_10 |
|
value: 0.79193 |
|
- type: ndcg_at_100 |
|
value: 0.80243 |
|
- type: ndcg_at_1000 |
|
value: 0.80438 |
|
- type: ndcg_at_20 |
|
value: 0.79549 |
|
- type: ndcg_at_3 |
|
value: 0.76031 |
|
- type: ndcg_at_5 |
|
value: 0.77915 |
|
- type: recall_at_1 |
|
value: 0.63084 |
|
- type: recall_at_10 |
|
value: 0.92411 |
|
- type: recall_at_100 |
|
value: 0.97363 |
|
- type: recall_at_1000 |
|
value: 0.98833 |
|
- type: recall_at_20 |
|
value: 0.9374 |
|
- type: recall_at_3 |
|
value: 0.84159 |
|
- type: recall_at_5 |
|
value: 0.88627 |
|
- dataset: |
|
type: miracl/mmteb-miracl |
|
name: Miracl (th) |
|
config: th |
|
split: dev |
|
task: |
|
type: Retrieval |
|
metrics: |
|
- type: ndcg_at_1 |
|
value: 0.66712 |
|
- type: ndcg_at_10 |
|
value: 0.73324 |
|
- type: ndcg_at_100 |
|
value: 0.76633 |
|
- type: ndcg_at_1000 |
|
value: 0.77119 |
|
- type: ndcg_at_20 |
|
value: 0.75243 |
|
- type: ndcg_at_3 |
|
value: 0.67393 |
|
- type: ndcg_at_5 |
|
value: 0.70201 |
|
- type: recall_at_1 |
|
value: 0.47106 |
|
- type: recall_at_10 |
|
value: 0.84294 |
|
- type: recall_at_100 |
|
value: 0.95949 |
|
- type: recall_at_1000 |
|
value: 0.98874 |
|
- type: recall_at_20 |
|
value: 0.90085 |
|
- type: recall_at_3 |
|
value: 0.68456 |
|
- type: recall_at_5 |
|
value: 0.75915 |
|
- dataset: |
|
type: miracl/mmteb-miracl |
|
name: Miracl (yo) |
|
config: yo |
|
split: dev |
|
task: |
|
type: Retrieval |
|
metrics: |
|
- type: ndcg_at_1 |
|
value: 0.4958 |
|
- type: ndcg_at_10 |
|
value: 0.68705 |
|
- type: ndcg_at_100 |
|
value: 0.70664 |
|
- type: ndcg_at_1000 |
|
value: 0.71197 |
|
- type: ndcg_at_20 |
|
value: 0.698 |
|
- type: ndcg_at_3 |
|
value: 0.64793 |
|
- type: ndcg_at_5 |
|
value: 0.66709 |
|
- type: recall_at_1 |
|
value: 0.46289 |
|
- type: recall_at_10 |
|
value: 0.85154 |
|
- type: recall_at_100 |
|
value: 0.93557 |
|
- type: recall_at_1000 |
|
value: 0.97479 |
|
- type: recall_at_20 |
|
value: 0.89076 |
|
- type: recall_at_3 |
|
value: 0.7507 |
|
- type: recall_at_5 |
|
value: 0.79202 |
|
- dataset: |
|
type: miracl/mmteb-miracl |
|
name: Miracl (zh) |
|
config: zh |
|
split: dev |
|
task: |
|
type: Retrieval |
|
metrics: |
|
- type: ndcg_at_1 |
|
value: 0.47583 |
|
- type: ndcg_at_10 |
|
value: 0.52553 |
|
- type: ndcg_at_100 |
|
value: 0.6 |
|
- type: ndcg_at_1000 |
|
value: 0.61415 |
|
- type: ndcg_at_20 |
|
value: 0.55668 |
|
- type: ndcg_at_3 |
|
value: 0.45839 |
|
- type: ndcg_at_5 |
|
value: 0.48127 |
|
- type: recall_at_1 |
|
value: 0.24488 |
|
- type: recall_at_10 |
|
value: 0.63659 |
|
- type: recall_at_100 |
|
value: 0.89702 |
|
- type: recall_at_1000 |
|
value: 0.97996 |
|
- type: recall_at_20 |
|
value: 0.72652 |
|
- type: recall_at_3 |
|
value: 0.42827 |
|
- type: recall_at_5 |
|
value: 0.52081 |
|
pipeline_tag: sentence-similarity |
|
--- |
|
# Granite-Embedding-278m-multilingual |
|
|
|
**Model Summary:** |
|
Granite-Embedding-278M-Multilingual is a 278M-parameter model from the Granite Embeddings suite that can be used to generate high-quality text embeddings. The model produces embedding vectors of size 768 and is trained on a combination of open-source relevance-pair datasets with permissive, enterprise-friendly licenses and IBM-collected and IBM-generated datasets. It is developed using contrastive finetuning, knowledge distillation, and model merging for improved performance.
|
|
|
- **Developers:** Granite Embedding Team, IBM |
|
- **GitHub Repository:** [ibm-granite/granite-embedding-models](https://github.com/ibm-granite/granite-embedding-models) |
|
- **Website**: [Granite Docs](https://www.ibm.com/granite/docs/) |
|
- **Paper:** Coming Soon |
|
- **Release Date**: December 18th, 2024 |
|
- **License:** [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) |
|
|
|
**Supported Languages:** |
|
English, German, Spanish, French, Japanese, Portuguese, Arabic, Czech, Italian, Korean, Dutch, and Chinese. Users may finetune Granite-Embedding-278M-Multilingual for languages beyond these 12 languages. |
|
|
|
**Intended use:** |
|
The model is designed to produce fixed-length vector representations for a given text, which can be used for text similarity, retrieval, and search applications.
|
|
|
**Usage with Sentence Transformers:** |
|
The model is compatible with the Sentence Transformers library and is easy to use:
|
|
|
First, install the sentence transformers library |
|
```shell |
|
pip install sentence_transformers |
|
``` |
|
|
|
The model can then be used to encode pairs of text and find the similarity between their representations:
|
|
|
```python |
|
from sentence_transformers import SentenceTransformer, util |
|
|
|
model_path = "ibm-granite/granite-embedding-278m-multilingual" |
|
# Load the Sentence Transformer model |
|
model = SentenceTransformer(model_path) |
|
|
|
input_queries = [ |
|
' Who made the song My achy breaky heart? ', |
|
'summit define' |
|
] |
|
|
|
input_passages = [ |
|
"Achy Breaky Heart is a country song written by Don Von Tress. Originally titled Don't Tell My Heart and performed by The Marcy Brothers in 1991. ", |
|
"Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments." |
|
] |
|
|
|
# encode queries and passages |
|
query_embeddings = model.encode(input_queries) |
|
passage_embeddings = model.encode(input_passages) |
|
|
|
# calculate cosine similarity |
|
print(util.cos_sim(query_embeddings, passage_embeddings)) |
|
``` |
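
For retrieval-style usage, the same embeddings can be used to rank a small corpus of passages for a query. The snippet below is a minimal sketch using `util.semantic_search` from Sentence Transformers; the corpus passages are illustrative placeholders, not part of the model's documentation.

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("ibm-granite/granite-embedding-278m-multilingual")

# Illustrative passages; replace with your own corpus
corpus = [
    "Achy Breaky Heart is a country song written by Don Von Tress.",
    "A summit is the highest point of a mountain.",
    "Granite is a coarse-grained intrusive igneous rock.",
]
query = "Who wrote Achy Breaky Heart?"

# Encode the corpus once, then encode each incoming query
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
query_embedding = model.encode(query, convert_to_tensor=True)

# Rank the corpus by cosine similarity and keep the top 2 hits
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=2)[0]
for hit in hits:
    print(f"{hit['score']:.4f}  {corpus[hit['corpus_id']]}")
```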
|
|
|
**Usage with Huggingface Transformers:** |
|
This is a simple example of how to use the Granite-Embedding-278m-Multilingual model with the Transformers library and PyTorch. |
|
|
|
First, install the required libraries |
|
```shell |
|
pip install transformers torch |
|
``` |
|
|
|
The model can then be used to encode pairs of text:
|
|
|
```python |
|
import torch |
|
from transformers import AutoModel, AutoTokenizer |
|
|
|
model_path = "ibm-granite/granite-embedding-278m-multilingual" |
|
|
|
# Load the model and tokenizer |
|
model = AutoModel.from_pretrained(model_path) |
|
tokenizer = AutoTokenizer.from_pretrained(model_path) |
|
model.eval() |
|
|
|
input_queries = [ |
|
' Who made the song My achy breaky heart? ', |
|
'summit define' |
|
] |
|
|
|
# tokenize inputs |
|
tokenized_queries = tokenizer(input_queries, padding=True, truncation=True, return_tensors='pt') |
|
|
|
# encode queries |
|
with torch.no_grad(): |
|
# Queries |
|
model_output = model(**tokenized_queries) |
|
# Perform pooling. granite-embedding-278m-multilingual uses CLS Pooling |
|
query_embeddings = model_output[0][:, 0] |
|
|
|
# normalize the embeddings |
|
query_embeddings = torch.nn.functional.normalize(query_embeddings, dim=1) |
|
|
|
``` |
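
To compare queries with passages using the same Transformers pipeline, passages can be encoded in the same way (CLS pooling followed by normalization) and scored with a dot product, which equals cosine similarity for normalized vectors. The sketch below is self-contained and reuses the example passages from the Sentence Transformers snippet above.

```python
import torch
from transformers import AutoModel, AutoTokenizer

model_path = "ibm-granite/granite-embedding-278m-multilingual"
model = AutoModel.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model.eval()

def embed(texts):
    # Tokenize, run the encoder, take the CLS token, and L2-normalize
    batch = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        output = model(**batch)
    cls_embeddings = output[0][:, 0]
    return torch.nn.functional.normalize(cls_embeddings, dim=1)

queries = ["Who made the song My achy breaky heart?", "summit define"]
passages = [
    "Achy Breaky Heart is a country song written by Don Von Tress.",
    "Definition of summit: the highest point of a mountain.",
]

# Dot products of normalized vectors are cosine similarities
scores = embed(queries) @ embed(passages).T
print(scores)
```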
|
|
|
**Evaluation:** |
|
The average performance of Granite-Embedding-278M-Multilingual on Multilingual Miracl (across 18 languages), Mintaka Retrieval (across 8 languages), and MTEB Retrieval for English (across 15 tasks), German (across 4 tasks), Spanish (across 2 tasks), French (across 5 tasks), Japanese (across 2 tasks), Arabic (1 task), Korean (1 task), and Chinese (across 8 tasks) is reported below. A rough sketch of running such an evaluation with the MTEB harness follows the table.
|
|
|
| Model | Parameters (M)| Embedding Dimension | Miracl (18) | Mintaka Retrieval (8) | MTEB English (15) | MTEB German (4) | MTEB Spanish (2) | MTEB French (5) | MTEB Japanese (2) | MTEB Arabic (1) | MTEB Korean (1) | MTEB Chinese (8) |
|
|:-----------------------------------|:------------:|:-------------------:|:-------------:| :---------------------:|:-----------------:|:---------------:|:---------------:|:---------------:|:----------------:|:----------------:|:---------------:|:----------------:| |
|
|granite-embedding-278M-multilingual | 278 | 768 | 58.3 | 23.2 | 48.2 | 71.2 | 52.6 | 54.1 | 61.7 | 64.2 | 71.8 | 45.2 | |
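
These numbers come from the corresponding benchmark harnesses. As a rough sketch, a retrieval task can be run through the open-source `mteb` package; the task name `MIRACLRetrieval` and the constructor-style API below are assumptions that depend on the installed mteb version, so consult its documentation before relying on them.

```python
# A minimal evaluation sketch; assumes the mteb package is installed and that
# the "MIRACLRetrieval" task name matches the installed mteb version.
from mteb import MTEB
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("ibm-granite/granite-embedding-278m-multilingual")

evaluation = MTEB(tasks=["MIRACLRetrieval"])
evaluation.run(model, output_folder="results/granite-embedding-278m-multilingual")
```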
|
|
|
**Model Architecture:** |
|
Granite-Embedding-278m-Multilingual is based on an encoder-only, XLM-RoBERTa-like transformer architecture, trained internally at IBM Research. A short snippet for cross-checking the table's values against the published model configuration follows the table.
|
|
|
| Model | granite-embedding-30m-english | granite-embedding-125m-english | granite-embedding-107M-multilingual | granite-embedding-278m-multilingual | |
|
| :-------- | :-------:| :-------: | :---------:| :-----:| |
|
| Embedding size | 384 | 768 | 384 | **768** | |
|
| Number of layers | 6 | 12 | 6 | **12** | |
|
| Number of attention heads | 12 | 12 | 12 | **12** | |
|
| Intermediate size | 1536 | 3072 | 1536 | **3072** | |
|
| Activation Function | GeLU | GeLU | GeLU | **GeLU** | |
|
| Vocabulary Size | 50265 | 50265 | 250002 | **250002** | |
|
| Max. Sequence Length | 512 | 512 | 512 | **512** | |
|
| # Parameters | 30M | 125M | 107M | **278M** | |
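
As referenced above, the per-model figures can be cross-checked against the published configuration. The snippet below simply loads the config with Transformers and prints the corresponding fields; note that `max_position_embeddings` in XLM-RoBERTa-style configs is slightly larger than the usable 512-token sequence length because of the padding-offset convention.

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("ibm-granite/granite-embedding-278m-multilingual")

# Fields corresponding to the architecture table above
print("Embedding size:     ", config.hidden_size)
print("Number of layers:   ", config.num_hidden_layers)
print("Attention heads:    ", config.num_attention_heads)
print("Intermediate size:  ", config.intermediate_size)
print("Activation function:", config.hidden_act)
print("Vocabulary size:    ", config.vocab_size)
```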
|
|
|
|
|
**Training Data:** |
|
Overall, the training data consists of four key sources: (1) unsupervised title-body paired data scraped from the web, (2) publicly available paired data with permissive, enterprise-friendly licenses, (3) IBM-internal paired data targeting specific technical domains, and (4) IBM-generated synthetic data. The data is listed below:
|
|
|
| **Dataset** | **Num. Pairs** | |
|
|:--------------------------------------------------------------------------|:--------------:| |
|
| Multilingual MC4 | 52,823,484 | |
|
| Multilingual Webhose | 12,369,322 | |
|
| English Wikipedia | 20,745,403 | |
|
| Multilingual Wikimedia | 2,911,090 | |
|
| Miracl Corpus (Title-Body) | 10,120,398 | |
|
| Stack Exchange Duplicate questions (titles) | 304,525 | |
|
| Stack Exchange Duplicate questions (bodies) | 250,519 | |
|
| Machine Translations of Stack Exchange Duplicate questions (titles) | 187,195 | |
|
| Stack Exchange (Title, Answer) pairs | 4,067,139 | |
|
| Stack Exchange (Title, Body) pairs | 23,978,013 | |
|
| Machine Translations of Stack Exchange (Title+Body, Answer) pairs | 1,827,15 | |
|
| SearchQA | 582,261 | |
|
| S2ORC (Title, Abstract) | 41,769,185 | |
|
| WikiAnswers Duplicate question pairs | 77,427,422 | |
|
| CCNews | 614,664 | |
|
| XSum | 226,711 | |
|
| SimpleWiki | 102,225 | |
|
| Machine Translated Cross Lingual Parallel Corpora | 28,376,115 | |
|
| SPECTER citation triplets | 684,100 | |
|
| Machine Translations of SPECTER citation triplets | 4,104,600 | |
|
| Natural Questions (NQ) | 100,231 | |
|
| SQuAD2.0 | 87,599 | |
|
| HotpotQA | 85,000 | |
|
| Fever | 109,810 | |
|
| PubMed | 20,000,000 | |
|
| Multilingual Miracl Triples | 81,409 | |
|
| Multilingual MrTydi Triples | 48,715 | |
|
| Sadeeem Question Answering | 4,037 |
|
| DBPedia Title-Body Pairs | 4,635,922 | |
|
| Synthetic: English Query-Wikipedia Passage | 1,879,093 | |
|
| Synthetic: English Fact Verification | 9,888 | |
|
| Synthetic: Multilingual Query-Wikipedia Passage | 300,266 | |
|
| Synthetic: Multilingual News Summaries | 37,489 | |
|
| IBM Internal Triples | 40,290 | |
|
| IBM Internal Title-Body Pairs | 1,524,586 | |
|
|
|
Notably, we do not use the popular MS-MARCO retrieval dataset in our training corpus due to its non-commercial license; many other open-source models train on this dataset because of its high quality.
|
|
|
**Infrastructure:** |
|
We train the Granite Embedding models on IBM's Cognitive Compute Cluster, which is outfitted with NVIDIA A100 80GB GPUs. This cluster provides a scalable and efficient infrastructure for training our models over multiple GPUs.
|
|
|
**Ethical Considerations and Limitations:** |
|
The data used to train the base language model was filtered to remove text containing hate, abuse, and profanity. Granite-Embedding-278m-Multilingual is finetuned on the 12 languages listed above, and has a context length of 512 tokens (longer texts will be truncated to this size).
|
|
|
**Resources** |
|
- ⭐️ Learn about the latest updates with Granite: https://www.ibm.com/granite |
|
- 📄 Get started with tutorials, best practices, and prompt engineering advice: https://www.ibm.com/granite/docs/ |
|
- 💡 Learn about the latest Granite learning resources: https://ibm.biz/granite-learning-resources |
|
|
|