ibm-granite
/

granite-embedding-278m-multilingual

+---
+language:
+- en
+- ar
+- cs
+- de
+- es
+- fr
+- it
+- ja
+- ko
+- nl
+- pt
+- zh
+license: apache-2.0
+library_name: transformers
+tags:
+- language
+- granite
+- embeddings
+- multilingual
+model-index:
+- name: ibm-granite/granite-embedding-278m-multilingual
+  results:
+  - dataset:
+      type: miracl/mmteb-miracl
+      name: Miracl (en)
+      config: en
+      split: dev
+    task:
+      type: Retrieval
+    metrics:
+    - type: ndcg_at_1
+      value: 0.45557
+    - type: ndcg_at_10
+      value: 0.49372
+    - type: ndcg_at_100
+      value: 0.5728
+    - type: ndcg_at_1000
+      value: 0.59187
+    - type: ndcg_at_20
+      value: 0.52863
+    - type: ndcg_at_3
+      value: 0.43969
+    - type: ndcg_at_5
+      value: 0.45551
+    - type: recall_at_1
+      value: 0.21785
+    - type: recall_at_10
+      value: 0.59513
+    - type: recall_at_100
+      value: 0.85785
+    - type: recall_at_1000
+      value: 0.96041
+    - type: recall_at_20
+      value: 0.69357
+    - type: recall_at_3
+      value: 0.40403
+    - type: recall_at_5
+      value: 0.48499
+  - dataset:
+      type: miracl/mmteb-miracl
+      name: Miracl (ar)
+      config: ar
+      split: dev
+    task:
+      type: Retrieval
+    metrics:
+    - type: ndcg_at_1
+      value: 0.57459
+    - type: ndcg_at_10
+      value: 0.64238
+    - type: ndcg_at_100
+      value: 0.6867
+    - type: ndcg_at_1000
+      value: 0.6951
+    - type: ndcg_at_20
+      value: 0.66455
+    - type: ndcg_at_3
+      value: 0.58162
+    - type: ndcg_at_5
+      value: 0.60831
+    - type: recall_at_1
+      value: 0.38064
+    - type: recall_at_10
+      value: 0.75098
+    - type: recall_at_100
+      value: 0.91203
+    - type: recall_at_1000
+      value: 0.96706
+    - type: recall_at_20
+      value: 0.81978
+    - type: recall_at_3
+      value: 0.58618
+    - type: recall_at_5
+      value: 0.66353
+  - dataset:
+      type: miracl/mmteb-miracl
+      name: Miracl (bn)
+      config: bn
+      split: dev
+    task:
+      type: Retrieval
+    metrics:
+    - type: ndcg_at_1
+      value: 0.60341
+    - type: ndcg_at_10
+      value: 0.68055
+    - type: ndcg_at_100
+      value: 0.72008
+    - type: ndcg_at_1000
+      value: 0.72716
+    - type: ndcg_at_20
+      value: 0.69914
+    - type: ndcg_at_3
+      value: 0.60805
+    - type: ndcg_at_5
+      value: 0.64486
+    - type: recall_at_1
+      value: 0.37948
+    - type: recall_at_10
+      value: 0.80609
+    - type: recall_at_100
+      value: 0.94305
+    - type: recall_at_1000
+      value: 0.98625
+    - type: recall_at_20
+      value: 0.86141
+    - type: recall_at_3
+      value: 0.61095
+    - type: recall_at_5
+      value: 0.71316
+  - dataset:
+      type: miracl/mmteb-miracl
+      name: Miracl (de)
+      config: de
+      split: dev
+    task:
+      type: Retrieval
+    metrics:
+    - type: ndcg_at_1
+      value: 0.45574
+    - type: ndcg_at_10
+      value: 0.48123
+    - type: ndcg_at_100
+      value: 0.56049
+    - type: ndcg_at_1000
+      value: 0.57979
+    - type: ndcg_at_20
+      value: 0.51785
+    - type: ndcg_at_3
+      value: 0.41243
+    - type: ndcg_at_5
+      value: 0.4386
+    - type: recall_at_1
+      value: 0.20401
+    - type: recall_at_10
+      value: 0.58779
+    - type: recall_at_100
+      value: 0.8584
+    - type: recall_at_1000
+      value: 0.97364
+    - type: recall_at_20
+      value: 0.69061
+    - type: recall_at_3
+      value: 0.36573
+    - type: recall_at_5
+      value: 0.47495
+  - dataset:
+      type: miracl/mmteb-miracl
+      name: Miracl (es)
+      config: es
+      split: dev
+    task:
+      type: Retrieval
+    metrics:
+    - type: ndcg_at_1
+      value: 0.5571
+    - type: ndcg_at_10
+      value: 0.49688
+    - type: ndcg_at_100
+      value: 0.60493
+    - type: ndcg_at_1000
+      value: 0.62922
+    - type: ndcg_at_20
+      value: 0.54438
+    - type: ndcg_at_3
+      value: 0.47981
+    - type: ndcg_at_5
+      value: 0.46584
+    - type: recall_at_1
+      value: 0.1638
+    - type: recall_at_10
+      value: 0.54155
+    - type: recall_at_100
+      value: 0.85136
+    - type: recall_at_1000
+      value: 0.96951
+    - type: recall_at_20
+      value: 0.65329
+    - type: recall_at_3
+      value: 0.31503
+    - type: recall_at_5
+      value: 0.40356
+  - dataset:
+      type: miracl/mmteb-miracl
+      name: Miracl (fa)
+      config: fa
+      split: dev
+    task:
+      type: Retrieval
+    metrics:
+    - type: ndcg_at_1
+      value: 0.39873
+    - type: ndcg_at_10
+      value: 0.50226
+    - type: ndcg_at_100
+      value: 0.56517
+    - type: ndcg_at_1000
+      value: 0.57967
+    - type: ndcg_at_20
+      value: 0.5292
+    - type: ndcg_at_3
+      value: 0.42738
+    - type: ndcg_at_5
+      value: 0.45843
+    - type: recall_at_1
+      value: 0.25369
+    - type: recall_at_10
+      value: 0.63776
+    - type: recall_at_100
+      value: 0.87686
+    - type: recall_at_1000
+      value: 0.9671
+    - type: recall_at_20
+      value: 0.72099
+    - type: recall_at_3
+      value: 0.43808
+    - type: recall_at_5
+      value: 0.52378
+  - dataset:
+      type: miracl/mmteb-miracl
+      name: Miracl (fi)
+      config: fi
+      split: dev
+    task:
+      type: Retrieval
+    metrics:
+    - type: ndcg_at_1
+      value: 0.60818
+    - type: ndcg_at_10
+      value: 0.6746
+    - type: ndcg_at_100
+      value: 0.71516
+    - type: ndcg_at_1000
+      value: 0.7218
+    - type: ndcg_at_20
+      value: 0.69692
+    - type: ndcg_at_3
+      value: 0.6006
+    - type: ndcg_at_5
+      value: 0.63842
+    - type: recall_at_1
+      value: 0.39264
+    - type: recall_at_10
+      value: 0.78577
+    - type: recall_at_100
+      value: 0.93291
+    - type: recall_at_1000
+      value: 0.97493
+    - type: recall_at_20
+      value: 0.85435
+    - type: recall_at_3
+      value: 0.61055
+    - type: recall_at_5
+      value: 0.69774
+  - dataset:
+      type: miracl/mmteb-miracl
+      name: Miracl (fr)
+      config: fr
+      split: dev
+    task:
+      type: Retrieval
+    metrics:
+    - type: ndcg_at_1
+      value: 0.3965
+    - type: ndcg_at_10
+      value: 0.49891
+    - type: ndcg_at_100
+      value: 0.56492
+    - type: ndcg_at_1000
+      value: 0.57837
+    - type: ndcg_at_20
+      value: 0.53163
+    - type: ndcg_at_3
+      value: 0.39843
+    - type: ndcg_at_5
+      value: 0.44416
+    - type: recall_at_1
+      value: 0.22644
+    - type: recall_at_10
+      value: 0.65169
+    - type: recall_at_100
+      value: 0.89786
+    - type: recall_at_1000
+      value: 0.98081
+    - type: recall_at_20
+      value: 0.75338
+    - type: recall_at_3
+      value: 0.39798
+    - type: recall_at_5
+      value: 0.51001
+  - dataset:
+      type: miracl/mmteb-miracl
+      name: Miracl (hi)
+      config: hi
+      split: dev
+    task:
+      type: Retrieval
+    metrics:
+    - type: ndcg_at_1
+      value: 0.36857
+    - type: ndcg_at_10
+      value: 0.46141
+    - type: ndcg_at_100
+      value: 0.52565
+    - type: ndcg_at_1000
+      value: 0.54319
+    - type: ndcg_at_20
+      value: 0.49384
+    - type: ndcg_at_3
+      value: 0.39469
+    - type: ndcg_at_5
+      value: 0.4184
+    - type: recall_at_1
+      value: 0.20185
+    - type: recall_at_10
+      value: 0.59474
+    - type: recall_at_100
+      value: 0.83385
+    - type: recall_at_1000
+      value: 0.94813
+    - type: recall_at_20
+      value: 0.69437
+    - type: recall_at_3
+      value: 0.38993
+    - type: recall_at_5
+      value: 0.47881
+  - dataset:
+      type: miracl/mmteb-miracl
+      name: Miracl (id)
+      config: id
+      split: dev
+    task:
+      type: Retrieval
+    metrics:
+    - type: ndcg_at_1
+      value: 0.46354
+    - type: ndcg_at_10
+      value: 0.47229
+    - type: ndcg_at_100
+      value: 0.5525
+    - type: ndcg_at_1000
+      value: 0.57648
+    - type: ndcg_at_20
+      value: 0.50606
+    - type: ndcg_at_3
+      value: 0.42538
+    - type: ndcg_at_5
+      value: 0.43717
+    - type: recall_at_1
+      value: 0.20787
+    - type: recall_at_10
+      value: 0.54771
+    - type: recall_at_100
+      value: 0.80689
+    - type: recall_at_1000
+      value: 0.94032
+    - type: recall_at_20
+      value: 0.63842
+    - type: recall_at_3
+      value: 0.36229
+    - type: recall_at_5
+      value: 0.44437
+  - dataset:
+      type: miracl/mmteb-miracl
+      name: Miracl (ja)
+      config: ja
+      split: dev
+    task:
+      type: Retrieval
+    metrics:
+    - type: ndcg_at_1
+      value: 0.56279
+    - type: ndcg_at_10
+      value: 0.6281
+    - type: ndcg_at_100
+      value: 0.67757
+    - type: ndcg_at_1000
+      value: 0.68667
+    - type: ndcg_at_20
+      value: 0.6521
+    - type: ndcg_at_3
+      value: 0.56226
+    - type: ndcg_at_5
+      value: 0.5866
+    - type: recall_at_1
+      value: 0.36648
+    - type: recall_at_10
+      value: 0.7496
+    - type: recall_at_100
+      value: 0.92461
+    - type: recall_at_1000
+      value: 0.97827
+    - type: recall_at_20
+      value: 0.82326
+    - type: recall_at_3
+      value: 0.55845
+    - type: recall_at_5
+      value: 0.63854
+  - dataset:
+      type: miracl/mmteb-miracl
+      name: Miracl (ko)
+      config: ko
+      split: dev
+    task:
+      type: Retrieval
+    metrics:
+    - type: ndcg_at_1
+      value: 0.52582
+    - type: ndcg_at_10
+      value: 0.59216
+    - type: ndcg_at_100
+      value: 0.65093
+    - type: ndcg_at_1000
+      value: 0.66204
+    - type: ndcg_at_20
+      value: 0.62427
+    - type: ndcg_at_3
+      value: 0.5373
+    - type: ndcg_at_5
+      value: 0.55886
+    - type: recall_at_1
+      value: 0.30521
+    - type: recall_at_10
+      value: 0.71159
+    - type: recall_at_100
+      value: 0.90203
+    - type: recall_at_1000
+      value: 0.96714
+    - type: recall_at_20
+      value: 0.80209
+    - type: recall_at_3
+      value: 0.515
+    - type: recall_at_5
+      value: 0.6071
+  - dataset:
+      type: miracl/mmteb-miracl
+      name: Miracl (ru)
+      config: ru
+      split: dev
+    task:
+      type: Retrieval
+    metrics:
+    - type: ndcg_at_1
+      value: 0.47524
+    - type: ndcg_at_10
+      value: 0.52349
+    - type: ndcg_at_100
+      value: 0.59725
+    - type: ndcg_at_1000
+      value: 0.61313
+    - type: ndcg_at_20
+      value: 0.55669
+    - type: ndcg_at_3
+      value: 0.46812
+    - type: ndcg_at_5
+      value: 0.48442
+    - type: recall_at_1
+      value: 0.24337
+    - type: recall_at_10
+      value: 0.62437
+    - type: recall_at_100
+      value: 0.86489
+    - type: recall_at_1000
+      value: 0.95266
+    - type: recall_at_20
+      value: 0.71411
+    - type: recall_at_3
+      value: 0.42927
+    - type: recall_at_5
+      value: 0.51258
+  - dataset:
+      type: miracl/mmteb-miracl
+      name: Miracl (sw)
+      config: sw
+      split: dev
+    task:
+      type: Retrieval
+    metrics:
+    - type: ndcg_at_1
+      value: 0.5166
+    - type: ndcg_at_10
+      value: 0.61271
+    - type: ndcg_at_100
+      value: 0.66099
+    - type: ndcg_at_1000
+      value: 0.66867
+    - type: ndcg_at_20
+      value: 0.63643
+    - type: ndcg_at_3
+      value: 0.54828
+    - type: ndcg_at_5
+      value: 0.57382
+    - type: recall_at_1
+      value: 0.35277
+    - type: recall_at_10
+      value: 0.74368
+    - type: recall_at_100
+      value: 0.92261
+    - type: recall_at_1000
+      value: 0.97109
+    - type: recall_at_20
+      value: 0.81888
+    - type: recall_at_3
+      value: 0.56739
+    - type: recall_at_5
+      value: 0.6421
+  - dataset:
+      type: miracl/mmteb-miracl
+      name: Miracl (te)
+      config: te
+      split: dev
+    task:
+      type: Retrieval
+    metrics:
+    - type: ndcg_at_1
+      value: 0.63768
+    - type: ndcg_at_10
+      value: 0.79193
+    - type: ndcg_at_100
+      value: 0.80243
+    - type: ndcg_at_1000
+      value: 0.80438
+    - type: ndcg_at_20
+      value: 0.79549
+    - type: ndcg_at_3
+      value: 0.76031
+    - type: ndcg_at_5
+      value: 0.77915
+    - type: recall_at_1
+      value: 0.63084
+    - type: recall_at_10
+      value: 0.92411
+    - type: recall_at_100
+      value: 0.97363
+    - type: recall_at_1000
+      value: 0.98833
+    - type: recall_at_20
+      value: 0.9374
+    - type: recall_at_3
+      value: 0.84159
+    - type: recall_at_5
+      value: 0.88627
+  - dataset:
+      type: miracl/mmteb-miracl
+      name: Miracl (th)
+      config: th
+      split: dev
+    task:
+      type: Retrieval
+    metrics:
+    - type: ndcg_at_1
+      value: 0.66712
+    - type: ndcg_at_10
+      value: 0.73324
+    - type: ndcg_at_100
+      value: 0.76633
+    - type: ndcg_at_1000
+      value: 0.77119
+    - type: ndcg_at_20
+      value: 0.75243
+    - type: ndcg_at_3
+      value: 0.67393
+    - type: ndcg_at_5
+      value: 0.70201
+    - type: recall_at_1
+      value: 0.47106
+    - type: recall_at_10
+      value: 0.84294
+    - type: recall_at_100
+      value: 0.95949
+    - type: recall_at_1000
+      value: 0.98874
+    - type: recall_at_20
+      value: 0.90085
+    - type: recall_at_3
+      value: 0.68456
+    - type: recall_at_5
+      value: 0.75915
+  - dataset:
+      type: miracl/mmteb-miracl
+      name: Miracl (yo)
+      config: yo
+      split: dev
+    task:
+      type: Retrieval
+    metrics:
+    - type: ndcg_at_1
+      value: 0.4958
+    - type: ndcg_at_10
+      value: 0.68705
+    - type: ndcg_at_100
+      value: 0.70664
+    - type: ndcg_at_1000
+      value: 0.71197
+    - type: ndcg_at_20
+      value: 0.698
+    - type: ndcg_at_3
+      value: 0.64793
+    - type: ndcg_at_5
+      value: 0.66709
+    - type: recall_at_1
+      value: 0.46289
+    - type: recall_at_10
+      value: 0.85154
+    - type: recall_at_100
+      value: 0.93557
+    - type: recall_at_1000
+      value: 0.97479
+    - type: recall_at_20
+      value: 0.89076
+    - type: recall_at_3
+      value: 0.7507
+    - type: recall_at_5
+      value: 0.79202
+  - dataset:
+      type: miracl/mmteb-miracl
+      name: Miracl (zh)
+      config: zh
+      split: dev
+    task:
+      type: Retrieval
+    metrics:
+    - type: ndcg_at_1
+      value: 0.47583
+    - type: ndcg_at_10
+      value: 0.52553
+    - type: ndcg_at_100
+      value: 0.6
+    - type: ndcg_at_1000
+      value: 0.61415
+    - type: ndcg_at_20
+      value: 0.55668
+    - type: ndcg_at_3
+      value: 0.45839
+    - type: ndcg_at_5
+      value: 0.48127
+    - type: recall_at_1
+      value: 0.24488
+    - type: recall_at_10
+      value: 0.63659
+    - type: recall_at_100
+      value: 0.89702
+    - type: recall_at_1000
+      value: 0.97996
+    - type: recall_at_20
+      value: 0.72652
+    - type: recall_at_3
+      value: 0.42827
+    - type: recall_at_5
+      value: 0.52081
+---
+# Granite-Embedding-278m-multilingual
+**Model Summary:**
+Granite-Embedding-278M-Multilingual is a 278M parameter model from the Granite Embeddings suite that can be used to generate high quality text embeddings. This model produces embedding vectors of size 768 and is trained using a combination of open source relevance-pair datasets with permissive, enterprise-friendly license, and IBM collected and generated datasets. This model is developed using contrastive finetuning, knowledge distillation and model merging for improved performance.
+- **Developers:** Granite Embedding Team, IBM
+- **GitHub Repository:** [ibm-granite/granite-embedding-models](https://github.com/ibm-granite/granite-embedding-models)
+- **Website**: [Granite Docs](https://www.ibm.com/granite/docs/)
+- **Paper:** Coming Soon
+- **Release Date**: December 18th, 2024
+- **License:** [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0)
+**Supported Languages:**
+English, German, Spanish, French, Japanese, Portuguese, Arabic, Czech, Italian, Korean, Dutch, and Chinese. Users may finetune Granite-Embedding-278M-Multilingual for languages beyond these 12 languages.
+**Intended use:**
+The model is designed to produce fixed length vector representations for a given text, which can be used for text similarity, retrieval, and search applications.
+**Usage with Sentence Transformers:**
+The model is compatible with the Sentence Transformers library and is very easy to use:
+First, install the sentence transformers library
+```shell
+pip install sentence_transformers
+```
+The model can then be used to encode pairs of text and find the similarity between their representations
+```python
+from sentence_transformers import SentenceTransformer, util
+model_path = "ibm-granite/granite-embedding-278m-multilingual"
+# Load the Sentence Transformer model
+model = SentenceTransformer(model_path)
+input_queries = [
+    ' Who made the song My achy breaky heart? ',
+    'summit define'
+    ]
+input_passages = [
+    "Achy Breaky Heart is a country song written by Don Von Tress. Originally titled Don't Tell My Heart and performed by The Marcy Brothers in 1991. ",
+    "Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments."
+    ]
+# encode queries and passages
+query_embeddings = model.encode(input_queries)
+passage_embeddings = model.encode(input_passages)
+# calculate cosine similarity
+print(util.cos_sim(query_embeddings, passage_embeddings))
+```
+**Usage with Huggingface Transformers:**
+This is a simple example of how to use the Granite-Embedding-278m-Multilingual model with the Transformers library and PyTorch.
+First, install the required libraries
+```shell
+pip install transformers torch
+```
+The model can then be used to encode pairs of text
+```python
+import torch
+from transformers import AutoModel, AutoTokenizer
+model_path = "ibm-granite/granite-embedding-278m-multilingual"
+# Load the model and tokenizer
+model = AutoModel.from_pretrained(model_path)
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+model.eval()
+input_queries = [
+    ' Who made the song My achy breaky heart? ',
+    'summit define'
+    ]
+# tokenize inputs
+tokenized_queries = tokenizer(input_queries, padding=True, truncation=True, return_tensors='pt')
+# encode queries
+with torch.no_grad():
+    # Queries
+    model_output = model(**tokenized_queries)
+    # Perform pooling. granite-embedding-278m-multilingual uses CLS Pooling
+    query_embeddings = model_output[0][:, 0]
+# normalize the embeddings
+query_embeddings = torch.nn.functional.normalize(query_embeddings, dim=1)
+```
+**Evaluation:**
+The average performance of the Granite-Embedding-278M-Multilingual on Multilingual Miracl (across 18 languages), Mintaka Retrieval (across 8 languages) and MTEB Retrieval for English (across 15 tasks), German (across 4 tasks), Spanish (across 2 tasks), French (across 5 tasks), Japanese (across 2 tasks), Arabic (1 task), Korean (1 task) and Chinese (across 8 tasks) is reported below.
+| Model                              | Parameters (M)| Embedding Dimension | Miracl (18)   |  Mintaka Retrieval (8) | MTEB English (15) | MTEB German (4) |MTEB Spanish (2) | MTEB French (5) | MTEB Japanese (2) |  MTEB Arabic (1) | MTEB Korean (1) | MTEB Chinese (8) |
+|:-----------------------------------|:------------:|:-------------------:|:-------------:| :---------------------:|:-----------------:|:---------------:|:---------------:|:---------------:|:----------------:|:----------------:|:---------------:|:----------------:|
+|granite-embedding-278M-multilingual | 278 | 768 | 58.3 | 23.2 | 48.2 | 71.2 | 52.6 | 54.1 | 61.7 | 64.2 | 71.8 | 45.2 |
+**Model Architecture:**
+Granite-Embedding-278m-Multilingual is based on an encoder-only XLM-RoBERTa like transformer architecture, trained internally at IBM Research.
+| Model                     | granite-embedding-30m-english | granite-embedding-125m-english    | granite-embedding-107M-multilingual | granite-embedding-278m-multilingual |
+| :--------                 | :-------:| :-------:    | :---------:| :-----:|
+| Embedding size            | 384      | 768          | 384    | **768**    |
+| Number of layers          | 6        | 12           | 6      | **12**     |
+| Number of attention heads | 12       | 12           | 12     | **12**     |
+| Intermediate size         | 1536     | 3072         | 1536   | **3072**   |
+| Activation Function       | GeLU     | GeLU         | GeLU   | **GeLU**   |
+| Vocabulary Size           | 50265    | 50265        | 250002 | **250002** |
+| Max. Sequence Length      | 512      | 512          | 512    | **512**    |
+| # Parameters              | 30M      | 125M         | 107M   | **278M**   |
+**Training Data:**
+Overall, the training data consists of four key sources: (1) unsupervised title-body paired data scraped from the web, (2) publicly available paired data with permissive, enterprise-friendly license, (3) IBM-internal paired data targeting specific technical domains, and (4) IBM-generated synthetic data. The data is listed below:
+| **Dataset**                                                               | **Num. Pairs** |
+|:--------------------------------------------------------------------------|:--------------:|
+| Multilingual MC4                                                          | 52,823,484     |
+| Multilingual Webhose                                                      | 12,369,322     |
+| English Wikipedia                                                         | 20,745,403     |
+| Multilingual Wikimedia                                                    | 2,911,090      |
+| Miracl Corpus (Title-Body)                                                | 10,120,398     |
+| Stack Exchange Duplicate questions (titles)                               | 304,525        |
+| Stack Exchange Duplicate questions (bodies)                               | 250,519        |
+| Machine Translations of Stack Exchange Duplicate questions (titles)       | 187,195        |
+| Stack Exchange (Title, Answer) pairs                                      | 4,067,139      |
+| Stack Exchange (Title, Body) pairs                                        | 23,978,013     |
+| Machine Translations of Stack Exchange (Title+Body, Answer) pairs         | 1,827,15       |
+| SearchQA                                                                  | 582,261        |
+| S2ORC (Title, Abstract)                                                   | 41,769,185     |
+| WikiAnswers Duplicate question pairs                                      | 77,427,422     |
+| CCNews                                                                    | 614,664        |
+| XSum                                                                      | 226,711        |
+| SimpleWiki                                                                | 102,225        |
+| Machine Translated Cross Lingual Parallel Corpora                         | 28,376,115     |
+| SPECTER citation triplets                                                 | 684,100        |
+| Machine Translations of SPECTER citation triplets                         | 4,104,600      |
+| Natural Questions (NQ)                                                    | 100,231        |
+| SQuAD2.0                                                                  | 87,599         |
+| HotpotQA                                                                  | 85,000         |
+| Fever                                                                     | 109,810        |
+| PubMed                                                                    | 20,000,000     |
+| Multilingual Miracl Triples                                               | 81,409         |
+| Multilingual MrTydi Triples                                               | 48,715         |
+| Sadeem Question Answering                                                 | 4,037          |
+| DBPedia Title-Body Pairs                                                  | 4,635,922      |
+| Synthetic: English Query-Wikipedia Passage                                | 1,879,093      |
+| Synthetic: English Fact Verification                                      | 9,888          |
+| Synthetic: Multilingual Query-Wikipedia Passage                           | 300,266        |
+| Synthetic: Multilingual News Summaries                                    | 37,489         |
+| IBM Internal Triples                                                      | 40,290         |
+| IBM Internal Title-Body Pairs                                             | 1,524,586      |
+Notably, we do not use the popular MS-MARCO retrieval dataset in our training corpus due to its non-commercial license, while other open-source models train on this dataset due to its high quality.
+**Infrastructure:**
+We train Granite Embedding Models using IBM's computing cluster, Cognitive Compute Cluster, which is outfitted with NVIDIA A100 80GB GPUs. This cluster provides a scalable and efficient infrastructure for training our models over multiple GPUs.
+**Ethical Considerations and Limitations:**
+The data used to train the base language model was filtered to remove text containing hate, abuse, and profanity. Granite-Embedding-278m-Multilingual is trained only on the 12 languages listed under Supported Languages, and has a context length of 512 tokens (longer texts will be truncated to this size).
+<!-- ## Citation
+```
+@misc{granite-embedding-models,
+  author = {author 1, author2, ...},
+  title = {},
+  journal = {},
+  volume = {},
+  year = {2024},
+  url = {https://arxiv.org/abs/0000.00000},
+}
+``` -->