---
language:
- en
- ar
- cs
- de
- es
- fr
- it
- ja
- ko
- nl
- pt
- zh
license: apache-2.0
library_name: transformers
tags:
- language
- granite
- embeddings
- multilingual
model-index:
- name: ibm-granite/granite-embedding-278m-multilingual
results:
- dataset:
type: miracl/mmteb-miracl
name: Miracl (en)
config: en
split: dev
task:
type: Retrieval
metrics:
- type: ndcg_at_1
value: 0.45557
- type: ndcg_at_10
value: 0.49372
- type: ndcg_at_100
value: 0.5728
- type: ndcg_at_1000
value: 0.59187
- type: ndcg_at_20
value: 0.52863
- type: ndcg_at_3
value: 0.43969
- type: ndcg_at_5
value: 0.45551
- type: recall_at_1
value: 0.21785
- type: recall_at_10
value: 0.59513
- type: recall_at_100
value: 0.85785
- type: recall_at_1000
value: 0.96041
- type: recall_at_20
value: 0.69357
- type: recall_at_3
value: 0.40403
- type: recall_at_5
value: 0.48499
- dataset:
type: miracl/mmteb-miracl
name: Miracl (ar)
config: ar
split: dev
task:
type: Retrieval
metrics:
- type: ndcg_at_1
value: 0.57459
- type: ndcg_at_10
value: 0.64238
- type: ndcg_at_100
value: 0.6867
- type: ndcg_at_1000
value: 0.6951
- type: ndcg_at_20
value: 0.66455
- type: ndcg_at_3
value: 0.58162
- type: ndcg_at_5
value: 0.60831
- type: recall_at_1
value: 0.38064
- type: recall_at_10
value: 0.75098
- type: recall_at_100
value: 0.91203
- type: recall_at_1000
value: 0.96706
- type: recall_at_20
value: 0.81978
- type: recall_at_3
value: 0.58618
- type: recall_at_5
value: 0.66353
- dataset:
type: miracl/mmteb-miracl
name: Miracl (bn)
config: bn
split: dev
task:
type: Retrieval
metrics:
- type: ndcg_at_1
value: 0.60341
- type: ndcg_at_10
value: 0.68055
- type: ndcg_at_100
value: 0.72008
- type: ndcg_at_1000
value: 0.72716
- type: ndcg_at_20
value: 0.69914
- type: ndcg_at_3
value: 0.60805
- type: ndcg_at_5
value: 0.64486
- type: recall_at_1
value: 0.37948
- type: recall_at_10
value: 0.80609
- type: recall_at_100
value: 0.94305
- type: recall_at_1000
value: 0.98625
- type: recall_at_20
value: 0.86141
- type: recall_at_3
value: 0.61095
- type: recall_at_5
value: 0.71316
- dataset:
type: miracl/mmteb-miracl
name: Miracl (de)
config: de
split: dev
task:
type: Retrieval
metrics:
- type: ndcg_at_1
value: 0.45574
- type: ndcg_at_10
value: 0.48123
- type: ndcg_at_100
value: 0.56049
- type: ndcg_at_1000
value: 0.57979
- type: ndcg_at_20
value: 0.51785
- type: ndcg_at_3
value: 0.41243
- type: ndcg_at_5
value: 0.4386
- type: recall_at_1
value: 0.20401
- type: recall_at_10
value: 0.58779
- type: recall_at_100
value: 0.8584
- type: recall_at_1000
value: 0.97364
- type: recall_at_20
value: 0.69061
- type: recall_at_3
value: 0.36573
- type: recall_at_5
value: 0.47495
- dataset:
type: miracl/mmteb-miracl
name: Miracl (es)
config: es
split: dev
task:
type: Retrieval
metrics:
- type: ndcg_at_1
value: 0.5571
- type: ndcg_at_10
value: 0.49688
- type: ndcg_at_100
value: 0.60493
- type: ndcg_at_1000
value: 0.62922
- type: ndcg_at_20
value: 0.54438
- type: ndcg_at_3
value: 0.47981
- type: ndcg_at_5
value: 0.46584
- type: recall_at_1
value: 0.1638
- type: recall_at_10
value: 0.54155
- type: recall_at_100
value: 0.85136
- type: recall_at_1000
value: 0.96951
- type: recall_at_20
value: 0.65329
- type: recall_at_3
value: 0.31503
- type: recall_at_5
value: 0.40356
- dataset:
type: miracl/mmteb-miracl
name: Miracl (fa)
config: fa
split: dev
task:
type: Retrieval
metrics:
- type: ndcg_at_1
value: 0.39873
- type: ndcg_at_10
value: 0.50226
- type: ndcg_at_100
value: 0.56517
- type: ndcg_at_1000
value: 0.57967
- type: ndcg_at_20
value: 0.5292
- type: ndcg_at_3
value: 0.42738
- type: ndcg_at_5
value: 0.45843
- type: recall_at_1
value: 0.25369
- type: recall_at_10
value: 0.63776
- type: recall_at_100
value: 0.87686
- type: recall_at_1000
value: 0.9671
- type: recall_at_20
value: 0.72099
- type: recall_at_3
value: 0.43808
- type: recall_at_5
value: 0.52378
- dataset:
type: miracl/mmteb-miracl
name: Miracl (fi)
config: fi
split: dev
task:
type: Retrieval
metrics:
- type: ndcg_at_1
value: 0.60818
- type: ndcg_at_10
value: 0.6746
- type: ndcg_at_100
value: 0.71516
- type: ndcg_at_1000
value: 0.7218
- type: ndcg_at_20
value: 0.69692
- type: ndcg_at_3
value: 0.6006
- type: ndcg_at_5
value: 0.63842
- type: recall_at_1
value: 0.39264
- type: recall_at_10
value: 0.78577
- type: recall_at_100
value: 0.93291
- type: recall_at_1000
value: 0.97493
- type: recall_at_20
value: 0.85435
- type: recall_at_3
value: 0.61055
- type: recall_at_5
value: 0.69774
- dataset:
type: miracl/mmteb-miracl
name: Miracl (fr)
config: fr
split: dev
task:
type: Retrieval
metrics:
- type: ndcg_at_1
value: 0.3965
- type: ndcg_at_10
value: 0.49891
- type: ndcg_at_100
value: 0.56492
- type: ndcg_at_1000
value: 0.57837
- type: ndcg_at_20
value: 0.53163
- type: ndcg_at_3
value: 0.39843
- type: ndcg_at_5
value: 0.44416
- type: recall_at_1
value: 0.22644
- type: recall_at_10
value: 0.65169
- type: recall_at_100
value: 0.89786
- type: recall_at_1000
value: 0.98081
- type: recall_at_20
value: 0.75338
- type: recall_at_3
value: 0.39798
- type: recall_at_5
value: 0.51001
- dataset:
type: miracl/mmteb-miracl
name: Miracl (hi)
config: hi
split: dev
task:
type: Retrieval
metrics:
- type: ndcg_at_1
value: 0.36857
- type: ndcg_at_10
value: 0.46141
- type: ndcg_at_100
value: 0.52565
- type: ndcg_at_1000
value: 0.54319
- type: ndcg_at_20
value: 0.49384
- type: ndcg_at_3
value: 0.39469
- type: ndcg_at_5
value: 0.4184
- type: recall_at_1
value: 0.20185
- type: recall_at_10
value: 0.59474
- type: recall_at_100
value: 0.83385
- type: recall_at_1000
value: 0.94813
- type: recall_at_20
value: 0.69437
- type: recall_at_3
value: 0.38993
- type: recall_at_5
value: 0.47881
- dataset:
type: miracl/mmteb-miracl
name: Miracl (id)
config: id
split: dev
task:
type: Retrieval
metrics:
- type: ndcg_at_1
value: 0.46354
- type: ndcg_at_10
value: 0.47229
- type: ndcg_at_100
value: 0.5525
- type: ndcg_at_1000
value: 0.57648
- type: ndcg_at_20
value: 0.50606
- type: ndcg_at_3
value: 0.42538
- type: ndcg_at_5
value: 0.43717
- type: recall_at_1
value: 0.20787
- type: recall_at_10
value: 0.54771
- type: recall_at_100
value: 0.80689
- type: recall_at_1000
value: 0.94032
- type: recall_at_20
value: 0.63842
- type: recall_at_3
value: 0.36229
- type: recall_at_5
value: 0.44437
- dataset:
type: miracl/mmteb-miracl
name: Miracl (ja)
config: ja
split: dev
task:
type: Retrieval
metrics:
- type: ndcg_at_1
value: 0.56279
- type: ndcg_at_10
value: 0.6281
- type: ndcg_at_100
value: 0.67757
- type: ndcg_at_1000
value: 0.68667
- type: ndcg_at_20
value: 0.6521
- type: ndcg_at_3
value: 0.56226
- type: ndcg_at_5
value: 0.5866
- type: recall_at_1
value: 0.36648
- type: recall_at_10
value: 0.7496
- type: recall_at_100
value: 0.92461
- type: recall_at_1000
value: 0.97827
- type: recall_at_20
value: 0.82326
- type: recall_at_3
value: 0.55845
- type: recall_at_5
value: 0.63854
- dataset:
type: miracl/mmteb-miracl
name: Miracl (ko)
config: ko
split: dev
task:
type: Retrieval
metrics:
- type: ndcg_at_1
value: 0.52582
- type: ndcg_at_10
value: 0.59216
- type: ndcg_at_100
value: 0.65093
- type: ndcg_at_1000
value: 0.66204
- type: ndcg_at_20
value: 0.62427
- type: ndcg_at_3
value: 0.5373
- type: ndcg_at_5
value: 0.55886
- type: recall_at_1
value: 0.30521
- type: recall_at_10
value: 0.71159
- type: recall_at_100
value: 0.90203
- type: recall_at_1000
value: 0.96714
- type: recall_at_20
value: 0.80209
- type: recall_at_3
value: 0.515
- type: recall_at_5
value: 0.6071
- dataset:
type: miracl/mmteb-miracl
name: Miracl (ru)
config: ru
split: dev
task:
type: Retrieval
metrics:
- type: ndcg_at_1
value: 0.47524
- type: ndcg_at_10
value: 0.52349
- type: ndcg_at_100
value: 0.59725
- type: ndcg_at_1000
value: 0.61313
- type: ndcg_at_20
value: 0.55669
- type: ndcg_at_3
value: 0.46812
- type: ndcg_at_5
value: 0.48442
- type: recall_at_1
value: 0.24337
- type: recall_at_10
value: 0.62437
- type: recall_at_100
value: 0.86489
- type: recall_at_1000
value: 0.95266
- type: recall_at_20
value: 0.71411
- type: recall_at_3
value: 0.42927
- type: recall_at_5
value: 0.51258
- dataset:
type: miracl/mmteb-miracl
name: Miracl (sw)
config: sw
split: dev
task:
type: Retrieval
metrics:
- type: ndcg_at_1
value: 0.5166
- type: ndcg_at_10
value: 0.61271
- type: ndcg_at_100
value: 0.66099
- type: ndcg_at_1000
value: 0.66867
- type: ndcg_at_20
value: 0.63643
- type: ndcg_at_3
value: 0.54828
- type: ndcg_at_5
value: 0.57382
- type: recall_at_1
value: 0.35277
- type: recall_at_10
value: 0.74368
- type: recall_at_100
value: 0.92261
- type: recall_at_1000
value: 0.97109
- type: recall_at_20
value: 0.81888
- type: recall_at_3
value: 0.56739
- type: recall_at_5
value: 0.6421
- dataset:
type: miracl/mmteb-miracl
name: Miracl (te)
config: te
split: dev
task:
type: Retrieval
metrics:
- type: ndcg_at_1
value: 0.63768
- type: ndcg_at_10
value: 0.79193
- type: ndcg_at_100
value: 0.80243
- type: ndcg_at_1000
value: 0.80438
- type: ndcg_at_20
value: 0.79549
- type: ndcg_at_3
value: 0.76031
- type: ndcg_at_5
value: 0.77915
- type: recall_at_1
value: 0.63084
- type: recall_at_10
value: 0.92411
- type: recall_at_100
value: 0.97363
- type: recall_at_1000
value: 0.98833
- type: recall_at_20
value: 0.9374
- type: recall_at_3
value: 0.84159
- type: recall_at_5
value: 0.88627
- dataset:
type: miracl/mmteb-miracl
name: Miracl (th)
config: th
split: dev
task:
type: Retrieval
metrics:
- type: ndcg_at_1
value: 0.66712
- type: ndcg_at_10
value: 0.73324
- type: ndcg_at_100
value: 0.76633
- type: ndcg_at_1000
value: 0.77119
- type: ndcg_at_20
value: 0.75243
- type: ndcg_at_3
value: 0.67393
- type: ndcg_at_5
value: 0.70201
- type: recall_at_1
value: 0.47106
- type: recall_at_10
value: 0.84294
- type: recall_at_100
value: 0.95949
- type: recall_at_1000
value: 0.98874
- type: recall_at_20
value: 0.90085
- type: recall_at_3
value: 0.68456
- type: recall_at_5
value: 0.75915
- dataset:
type: miracl/mmteb-miracl
name: Miracl (yo)
config: yo
split: dev
task:
type: Retrieval
metrics:
- type: ndcg_at_1
value: 0.4958
- type: ndcg_at_10
value: 0.68705
- type: ndcg_at_100
value: 0.70664
- type: ndcg_at_1000
value: 0.71197
- type: ndcg_at_20
value: 0.698
- type: ndcg_at_3
value: 0.64793
- type: ndcg_at_5
value: 0.66709
- type: recall_at_1
value: 0.46289
- type: recall_at_10
value: 0.85154
- type: recall_at_100
value: 0.93557
- type: recall_at_1000
value: 0.97479
- type: recall_at_20
value: 0.89076
- type: recall_at_3
value: 0.7507
- type: recall_at_5
value: 0.79202
- dataset:
type: miracl/mmteb-miracl
name: Miracl (zh)
config: zh
split: dev
task:
type: Retrieval
metrics:
- type: ndcg_at_1
value: 0.47583
- type: ndcg_at_10
value: 0.52553
- type: ndcg_at_100
value: 0.6
- type: ndcg_at_1000
value: 0.61415
- type: ndcg_at_20
value: 0.55668
- type: ndcg_at_3
value: 0.45839
- type: ndcg_at_5
value: 0.48127
- type: recall_at_1
value: 0.24488
- type: recall_at_10
value: 0.63659
- type: recall_at_100
value: 0.89702
- type: recall_at_1000
value: 0.97996
- type: recall_at_20
value: 0.72652
- type: recall_at_3
value: 0.42827
- type: recall_at_5
value: 0.52081
pipeline_tag: sentence-similarity
---
# Granite-Embedding-278m-multilingual
**Model Summary:**
Granite-Embedding-278M-Multilingual is a 278M-parameter model from the Granite Embeddings suite that generates high-quality text embeddings. It produces embedding vectors of size 768 and is trained on a combination of open-source relevance-pair datasets with permissive, enterprise-friendly licenses, together with IBM-collected and IBM-generated datasets. The model is developed using contrastive finetuning, knowledge distillation, and model merging for improved performance.
- **Developers:** Granite Embedding Team, IBM
- **GitHub Repository:** [ibm-granite/granite-embedding-models](https://github.com/ibm-granite/granite-embedding-models)
- **Website**: [Granite Docs](https://www.ibm.com/granite/docs/)
- **Paper:** Coming Soon
- **Release Date**: December 18th, 2024
- **License:** [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0)
**Supported Languages:**
English, German, Spanish, French, Japanese, Portuguese, Arabic, Czech, Italian, Korean, Dutch, and Chinese. Users may finetune Granite-Embedding-278M-Multilingual for languages beyond these 12.
**Intended use:**
The model is designed to produce fixed length vector representations for a given text, which can be used for text similarity, retrieval, and search applications.
**Usage with Sentence Transformers:**
The model is compatible with the SentenceTransformers library and is easy to use.
First, install the sentence-transformers library:
```shell
pip install sentence_transformers
```
The model can then be used to encode pairs of text and compute the similarity between their representations:
```python
from sentence_transformers import SentenceTransformer, util
model_path = "ibm-granite/granite-embedding-278m-multilingual"
# Load the Sentence Transformer model
model = SentenceTransformer(model_path)
input_queries = [
' Who made the song My achy breaky heart? ',
'summit define'
]
input_passages = [
"Achy Breaky Heart is a country song written by Don Von Tress. Originally titled Don't Tell My Heart and performed by The Marcy Brothers in 1991. ",
"Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments."
]
# encode queries and passages
query_embeddings = model.encode(input_queries)
passage_embeddings = model.encode(input_passages)
# calculate cosine similarity
print(util.cos_sim(query_embeddings, passage_embeddings))
```
**Usage with Huggingface Transformers:**
This is a simple example of how to use the Granite-Embedding-278m-Multilingual model with the Transformers library and PyTorch.
First, install the required libraries:
```shell
pip install transformers torch
```
The model can then be used to encode text:
```python
import torch
from transformers import AutoModel, AutoTokenizer
model_path = "ibm-granite/granite-embedding-278m-multilingual"
# Load the model and tokenizer
model = AutoModel.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model.eval()
input_queries = [
' Who made the song My achy breaky heart? ',
'summit define'
]
# tokenize inputs
tokenized_queries = tokenizer(input_queries, padding=True, truncation=True, return_tensors='pt')
# encode queries
with torch.no_grad():
# Queries
model_output = model(**tokenized_queries)
# Perform pooling. granite-embedding-278m-multilingual uses CLS Pooling
query_embeddings = model_output[0][:, 0]
# normalize the embeddings
query_embeddings = torch.nn.functional.normalize(query_embeddings, dim=1)
```
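To score passages against queries, passages can be encoded and pooled the same way; because the embeddings are L2-normalized, a plain matrix product of queries and passages yields their pairwise cosine similarities. The sketch below uses random stand-in tensors in place of real model outputs purely to illustrate the scoring step:

```python
import torch

# hypothetical stand-ins for the normalized CLS embeddings produced above
torch.manual_seed(0)
query_embeddings = torch.nn.functional.normalize(torch.randn(2, 768), dim=1)
passage_embeddings = torch.nn.functional.normalize(torch.randn(2, 768), dim=1)

# for unit-length vectors, the dot product equals cosine similarity,
# so one matmul gives the full query-passage score matrix
scores = query_embeddings @ passage_embeddings.T
print(scores.shape)  # torch.Size([2, 2])
```

In a retrieval setting, ranking the passages for each query is then just an argsort over the corresponding row of `scores`.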
**Evaluation:**
The average performance of Granite-Embedding-278M-Multilingual on Multilingual Miracl (across 18 languages), Mintaka Retrieval (across 8 languages), and MTEB Retrieval for English (across 15 tasks), German (4 tasks), Spanish (2 tasks), French (5 tasks), Japanese (2 tasks), Arabic (1 task), Korean (1 task), and Chinese (8 tasks) is reported below.
| Model | Parameters (M)| Embedding Dimension | Miracl (18) | Mintaka Retrieval (8) | MTEB English (15) | MTEB German (4) |MTEB Spanish (2) | MTEB French (5) | MTEB Japanese (2) | MTEB Arabic (1) | MTEB Korean (1) | MTEB Chinese (8) |
|:-----------------------------------|:------------:|:-------------------:|:-------------:| :---------------------:|:-----------------:|:---------------:|:---------------:|:---------------:|:----------------:|:----------------:|:---------------:|:----------------:|
|granite-embedding-278M-multilingual | 278 | 768 | 58.3 | 23.2 | 48.2 | 71.2 | 52.6 | 54.1 | 61.7 | 64.2 | 71.8 | 45.2 |
**Model Architecture:**
Granite-Embedding-278m-Multilingual is based on an encoder-only, XLM-RoBERTa-like transformer architecture, trained internally at IBM Research.
| Model | granite-embedding-30m-english | granite-embedding-125m-english | granite-embedding-107M-multilingual | granite-embedding-278m-multilingual |
| :-------- | :-------:| :-------: | :---------:| :-----:|
| Embedding size | 384 | 768 | 384 | **768** |
| Number of layers | 6 | 12 | 6 | **12** |
| Number of attention heads | 12 | 12 | 12 | **12** |
| Intermediate size | 1536 | 3072 | 1536 | **3072** |
| Activation Function | GeLU | GeLU | GeLU | **GeLU** |
| Vocabulary Size | 50265 | 50265 | 250002 | **250002** |
| Max. Sequence Length | 512 | 512 | 512 | **512** |
| # Parameters | 30M | 125M | 107M | **278M** |
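Most of the size difference between the multilingual and English variants comes from the 250,002-token vocabulary. A back-of-the-envelope parameter count (a rough sketch that ignores biases, LayerNorms, positional embeddings, and the pooler) recovers roughly the 278M figure from the table above:

```python
def approx_params(vocab, hidden, layers, intermediate):
    # token embedding matrix dominates for large vocabularies
    emb = vocab * hidden
    # per layer: Q/K/V/output projections (4 * hidden^2) plus the
    # feed-forward block (2 * hidden * intermediate)
    per_layer = 4 * hidden * hidden + 2 * hidden * intermediate
    return emb + layers * per_layer

# hyperparameters for granite-embedding-278m-multilingual from the table above
total = approx_params(vocab=250002, hidden=768, layers=12, intermediate=3072)
print(f"{total / 1e6:.0f}M")  # prints 277M
```

The token embeddings alone account for about 192M of the total; the omitted positional embeddings, biases, and pooler close the small gap to 278M.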
**Training Data:**
Overall, the training data consists of four key sources: (1) unsupervised title-body paired data scraped from the web, (2) publicly available paired data with permissive, enterprise-friendly licenses, (3) IBM-internal paired data targeting specific technical domains, and (4) IBM-generated synthetic data. The data is listed below:
| **Dataset** | **Num. Pairs** |
|:--------------------------------------------------------------------------|:--------------:|
| Multilingual MC4 | 52,823,484 |
| Multilingual Webhose | 12,369,322 |
| English Wikipedia | 20,745,403 |
| Multilingual Wikimedia | 2,911,090 |
| Miracl Corpus (Title-Body) | 10,120,398 |
| Stack Exchange Duplicate questions (titles) | 304,525 |
| Stack Exchange Duplicate questions (bodies) | 250,519 |
| Machine Translations of Stack Exchange Duplicate questions (titles) | 187,195 |
| Stack Exchange (Title, Answer) pairs | 4,067,139 |
| Stack Exchange (Title, Body) pairs | 23,978,013 |
| Machine Translations of Stack Exchange (Title+Body, Answer) pairs | 1,827,15 |
| SearchQA | 582,261 |
| S2ORC (Title, Abstract) | 41,769,185 |
| WikiAnswers Duplicate question pairs | 77,427,422 |
| CCNews | 614,664 |
| XSum | 226,711 |
| SimpleWiki | 102,225 |
| Machine Translated Cross Lingual Parallel Corpora | 28,376,115 |
| SPECTER citation triplets | 684,100 |
| Machine Translations of SPECTER citation triplets | 4,104,600 |
| Natural Questions (NQ) | 100,231 |
| SQuAD2.0 | 87,599 |
| HotpotQA | 85,000 |
| Fever | 109,810 |
| PubMed | 20,000,000 |
| Multilingual Miracl Triples | 81,409 |
| Multilingual MrTydi Triples | 48,715 |
| Sadeeem Question Answering                                                | 4,037          |
| DBPedia Title-Body Pairs | 4,635,922 |
| Synthetic: English Query-Wikipedia Passage | 1,879,093 |
| Synthetic: English Fact Verification | 9,888 |
| Synthetic: Multilingual Query-Wikipedia Passage | 300,266 |
| Synthetic: Multilingual News Summaries | 37,489 |
| IBM Internal Triples | 40,290 |
| IBM Internal Title-Body Pairs | 1,524,586 |
Notably, we do not use the popular MS-MARCO retrieval dataset in our training corpus due to its non-commercial license; many other open-source models train on it because of its high quality.
**Infrastructure:**
We train the Granite Embedding models on IBM's computing cluster, Cognitive Compute Cluster, which is outfitted with NVIDIA A100 80GB GPUs. This cluster provides a scalable and efficient infrastructure for training our models over multiple GPUs.
**Ethical Considerations and Limitations:**
The data used to train the base language model was filtered to remove text containing hate, abuse, and profanity. Granite-Embedding-278m-Multilingual is trained on the 12 languages listed above and has a context length of 512 tokens (longer texts will be truncated to this size).
**Resources**
- ⭐️ Learn about the latest updates with Granite: https://www.ibm.com/granite
- 📄 Get started with tutorials, best practices, and prompt engineering advice: https://www.ibm.com/granite/docs/
- 💡 Learn about the latest Granite learning resources: https://ibm.biz/granite-learning-resources