---
pipeline_tag: sentence-similarity
tags:
- sentence-transformers
- feature-extraction
- sentence-similarity
datasets:
- sbx/superlim-2
language:
- sv
---

# jzju/sbert-sv-lim2

This model is trained from [KBLab/bert-base-swedish-cased-new](https://huggingface.co/KBLab/bert-base-swedish-cased-new) with data from [sbx/superlim-2](https://huggingface.co/datasets/sbx/superlim-2).

This is a [sentence-transformers](https://www.SBERT.net) model: it maps sentences & paragraphs to a 256-dimensional dense vector space and can be used for tasks like clustering or semantic search.

## Usage (Sentence-Transformers)

Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:

```
pip install -U sentence-transformers
```

Then you can use the model like this:

```python
from sentence_transformers import SentenceTransformer

sentences = ["This is an example sentence", "Each sentence is converted"]

model = SentenceTransformer("jzju/sbert-sv-lim2")
embeddings = model.encode(sentences)
print(embeddings)
```

## Training Code

```python
import random

from datasets import load_dataset, concatenate_datasets
# Note: `datasets` below refers to sentence_transformers.datasets (NoDuplicatesDataLoader),
# not the Hugging Face `datasets` library imported above.
from sentence_transformers import SentenceTransformer, InputExample, losses, models, datasets
from torch.utils.data import DataLoader
from torch import nn

# Base transformer + mean pooling + dense projection down to 256 dimensions
word_embedding_model = models.Transformer("KBLab/bert-base-swedish-cased-new", max_seq_length=256)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
dense_model = models.Dense(
    in_features=pooling_model.get_sentence_embedding_dimension(),
    out_features=256,
    activation_function=nn.Tanh(),
)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])


def pair():
    """Train on the graded pair subsets (swepar, swesim_*) with cosine similarity loss."""

    def norm(x):
        # Scale gold labels into [0, 1] by the subset's maximum label
        x["label"] = x["label"] / m
        return x

    dd = []
    for sub in ["swepar", "swesim_relatedness", "swesim_similarity"]:
        ds = concatenate_datasets([d for d in load_dataset("sbx/superlim-2", sub).values()])
        # The subsets use different column names for sentence pairs vs. word pairs
        if "sentence_1" in ds.features:
            ds = ds.rename_column("sentence_1", "d1")
            ds = ds.rename_column("sentence_2", "d2")
        else:
            ds = ds.rename_column("word_1", "d1")
            ds = ds.rename_column("word_2", "d2")
        m = max([d["label"] for d in ds])
        dd.append(ds.map(norm))
    ds = concatenate_datasets(dd)

    train_examples = []
    for d in ds:
        train_examples.append(InputExample(texts=[d["d1"], d["d2"]], label=d["label"]))
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=64)
    train_loss = losses.CosineSimilarityLoss(model)
    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10, warmup_steps=100)


def nli():
    """Train on the swenli subset with (anchor, entailment, contradiction) triplets."""
    ds = concatenate_datasets([d for d in load_dataset("sbx/superlim-2", "swenli").values()])

    def add_to_samples(sent1, sent2, label):
        if sent1 not in train_data:
            train_data[sent1] = {0: set(), 1: set(), 2: set()}
        train_data[sent1][label].add(sent2)

    # Group hypotheses by premise (and vice versa) per label
    train_data = {}
    for d in ds:
        add_to_samples(d["premise"], d["hypothesis"], d["label"])
        add_to_samples(d["hypothesis"], d["premise"], d["label"])

    train_samples = []
    for sent1, others in train_data.items():
        if len(others[0]) > 0 and len(others[1]) > 0:
            train_samples.append(
                InputExample(texts=[sent1, random.choice(list(others[0])), random.choice(list(others[1]))])
            )
            train_samples.append(
                InputExample(texts=[random.choice(list(others[0])), sent1, random.choice(list(others[1]))])
            )
    train_dataloader = datasets.NoDuplicatesDataLoader(train_samples, batch_size=64)
    train_loss = losses.MultipleNegativesRankingLoss(model)
    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)


pair()
nli()
model.save("sbert-sv-lim2")  # save() requires an output directory; this path is illustrative
```
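
## Semantic Similarity Example

Since the model was trained with cosine similarity targets, the embeddings can be compared directly with cosine similarity. A minimal sketch using the standard `sentence_transformers.util.cos_sim` helper; the Swedish sentences below are made up for illustration:

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("jzju/sbert-sv-lim2")

# Hypothetical Swedish example sentences
sentences = [
    "Hunden springer i parken.",          # "The dog runs in the park."
    "En hund leker utomhus.",             # "A dog plays outside."
    "Börsen föll kraftigt under dagen.",  # "The stock market fell sharply during the day."
]

embeddings = model.encode(sentences, convert_to_tensor=True)

# Pairwise cosine similarities; related sentences should score higher
scores = util.cos_sim(embeddings, embeddings)
print(scores)
```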