Initial commit

Browse files

Files changed (11) hide show

.gitattributes +3 -0
README.md +77 -0
config.json +53 -0
kilt_titles_trie_dict.pkl +3 -0
merges.txt +0 -0
pytorch_model.bin +3 -0
tf_model.h5 +3 -0
tokenizer.json +0 -0
tokenizer_config.json +1 -0
trie.py +93 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -25,3 +25,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
+tf_model.h5 filter=lfs diff=lfs merge=lfs -text
+kilt_titles_trie_dict.pkl filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,77 @@

+---
+language:
+- en
+tags:
+- retrieval
+- entity-retrieval
+- named-entity-disambiguation
+- entity-disambiguation
+- named-entity-linking
+- entity-linking
+- text2text-generation
+---
+# GENRE
+The GENRE (Generative ENtity REtrieval) system as presented in [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904) implemented in PyTorch.
+In a nutshell, GENRE uses a sequence-to-sequence approach to entity retrieval (e.g., linking), based on a fine-tuned [BART](https://arxiv.org/abs/1910.13461) architecture. GENRE performs retrieval by generating the unique entity name conditioned on the input text, using constrained beam search to only generate valid identifiers. The model was first released in the [facebookresearch/GENRE](https://github.com/facebookresearch/GENRE) repository using `fairseq` (the `transformers` models are obtained with a conversion script similar to [this](https://github.com/huggingface/transformers/blob/master/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py)).
+This model was trained on the full training set of [BLINK](https://arxiv.org/abs/1911.03814) (i.e., 9M datapoints for entity-disambiguation grounded on Wikipedia).
+## BibTeX entry and citation info
+**Please consider citing our works if you use code from this repository.**
+```bibtex
+@inproceedings{decao2020autoregressive,
+  title={Autoregressive Entity Retrieval},
+  author={Nicola {De Cao} and Gautier Izacard and Sebastian Riedel and Fabio Petroni},
+  booktitle={International Conference on Learning Representations},
+  url={https://openreview.net/forum?id=5k8F6UU39V},
+  year={2021}
+}
+```
+## Usage
+Here is an example of generation for Wikipedia page disambiguation:
+```python
+import pickle
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+# OPTIONAL: load the prefix tree (trie), you need to additionally download
+# https://huggingface.co/facebook/genre-kilt/blob/main/trie.py and
+# https://huggingface.co/facebook/genre-kilt/blob/main/kilt_titles_trie_dict.pkl
+# from trie import Trie
+# with open("kilt_titles_trie_dict.pkl", "rb") as f:
+#     trie = Trie.load_from_dict(pickle.load(f))
+tokenizer = AutoTokenizer.from_pretrained("facebook/genre-linking-blink")
+model = AutoModelForSeq2SeqLM.from_pretrained("facebook/genre-linking-blink").eval()
+sentences = ["Einstein was a [START_ENT] German [END_ENT] physicist."]
+outputs = model.generate(
+    **tokenizer(sentences, return_tensors="pt"),
+    num_beams=5,
+    num_return_sequences=5,
+    # OPTIONAL: use constrained beam search
+    # prefix_allowed_tokens_fn=lambda batch_id, sent: trie.get(sent.tolist()),
+)
+tokenizer.batch_decode(outputs, skip_special_tokens=True)
+```
+which outputs the following top-5 predictions (using constrained beam search)
+```
+['Germans',
+ 'Germany',
+ 'German Empire',
+ 'Weimar Republic',
+ 'Greeks']
+```

config.json ADDED Viewed

	@@ -0,0 +1,53 @@

+{
+  "_name_or_path": "facebook/genre-kilt",
+  "activation_dropout": 0.0,
+  "activation_function": "gelu",
+  "add_bias_logits": false,
+  "add_final_layer_norm": false,
+  "architectures": [
+    "BartForConditionalGeneration"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "classif_dropout": 0.0,
+  "classifier_dropout": 0.0,
+  "d_model": 1024,
+  "decoder_attention_heads": 16,
+  "decoder_ffn_dim": 4096,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 12,
+  "decoder_start_token_id": 2,
+  "dropout": 0.1,
+  "early_stopping": true,
+  "encoder_attention_heads": 16,
+  "encoder_ffn_dim": 4096,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 12,
+  "eos_token_id": 2,
+  "eos_token_ids": [
+    2
+  ],
+  "forced_eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "max_length": 1024,
+  "max_position_embeddings": 1024,
+  "min_length": 0,
+  "model_type": "bart",
+  "normalize_before": false,
+  "normalize_embedding": false,
+  "num_beams": 6,
+  "num_hidden_layers": 12,
+  "output_past": true,
+  "pad_token_id": 1,
+  "replacing_rate": 0,
+  "scale_embedding": false,
+  "static_position_embeddings": false,
+  "student_decoder_layers": null,
+  "student_encoder_layers": null,
+  "task_specific_params": {},
+  "transformers_version": "4.19.2",
+  "use_cache": true,
+  "vocab_size": 50264
+}

kilt_titles_trie_dict.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:951db72cc702fcf6639419efcf917cb7f3c67cc6202ebe3ae3ca399c30614da2
+size 215214973

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d105d545961fe8eec7183bab63dd5dea9acf4cd69783827a4151bda989635d1e
+size 1625526529

tf_model.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a854657f17fa38492440f5111c7f78e1e1bdd75e58eff59b5260894ba183e58b
+size 1625921384

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"model_max_length": 1024}

trie.py ADDED Viewed

	@@ -0,0 +1,93 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree at
+# https://github.com/facebookresearch/GENRE .
+from typing import Dict, List
class Trie(object):
    """Prefix tree over token-id sequences, stored as nested dicts.

    Each level of ``trie_dict`` maps a token to the sub-dict of tokens that
    may follow it; an empty dict marks the end of a stored sequence.

    A second trie can be attached with :meth:`append`. Lookups then switch
    to that trie where this one offers ``bos_token_id`` as a continuation,
    or where the prefix leaves this trie altogether.
    """

    def __init__(self, sequences: List[List[int]] = None):
        """Build a trie, optionally pre-populated with ``sequences``.

        Args:
            sequences: Token sequences to insert. ``None`` (the default) or
                an empty list creates an empty trie.
        """
        self.trie_dict = {}
        # Number of insertions performed (duplicates are counted each time).
        self.len = 0
        if sequences:
            for sequence in sequences:
                Trie._add_to_trie(sequence, self.trie_dict)
                self.len += 1
        self.append_trie = None
        self.bos_token_id = None

    def append(self, trie, bos_token_id):
        """Attach ``trie`` as a continuation entered through ``bos_token_id``."""
        self.append_trie = trie
        self.bos_token_id = bos_token_id

    def add(self, sequence: List[int]):
        """Insert one token sequence and bump the stored length by one."""
        Trie._add_to_trie(sequence, self.trie_dict)
        self.len += 1

    def get(self, prefix_sequence: List[int]):
        """Return the list of tokens allowed to follow ``prefix_sequence``.

        An empty list is returned when the prefix is not in the trie (and no
        appended trie is set) or when the prefix is a complete sequence.
        """
        return Trie._get_from_trie(
            prefix_sequence, self.trie_dict, self.append_trie, self.bos_token_id
        )

    @staticmethod
    def load_from_dict(trie_dict):
        """Wrap an existing nested dict in a ``Trie`` without copying it.

        The length is recomputed by counting the sequences the trie yields
        when iterated.
        """
        trie = Trie()
        trie.trie_dict = trie_dict
        trie.len = sum(1 for _ in trie)
        return trie

    @staticmethod
    def _add_to_trie(sequence: List[int], trie_dict: Dict):
        """Insert ``sequence`` into ``trie_dict`` in place.

        Walks down iteratively so that long sequences neither hit the
        recursion limit nor get re-sliced at every step.
        """
        node = trie_dict
        for token in sequence:
            node = node.setdefault(token, {})

    @staticmethod
    def _get_from_trie(
        prefix_sequence: List[int],
        trie_dict: Dict,
        append_trie=None,
        bos_token_id: int = None,
    ):
        """Return the tokens that may follow ``prefix_sequence`` in ``trie_dict``.

        Args:
            prefix_sequence: Tokens consumed so far.
            trie_dict: Root nested dict to walk.
            append_trie: Optional secondary trie. NOTE: it is tested by
                truthiness, and ``Trie`` defines ``__len__``, so an appended
                trie whose ``len`` is 0 is treated as absent.
            bos_token_id: Token that, when offered as a continuation, is
                replaced by the first-level tokens of ``append_trie``.

        Returns:
            A new list of allowed next tokens (possibly empty).
        """
        node = trie_dict
        for index, token in enumerate(prefix_sequence):
            if token in node:
                node = node[token]
            elif append_trie:
                # The prefix left this trie: hand the unmatched remainder
                # over to the appended trie.
                return append_trie.get(prefix_sequence[index:])
            else:
                return []

        output = list(node)
        if append_trie and bos_token_id in output:
            # Swap the hand-over token for the entry tokens of the
            # appended trie.
            output.remove(bos_token_id)
            output += list(append_trie.trie_dict.keys())
        return output

    def __iter__(self):
        """Yield every stored root-to-leaf sequence as a fresh list.

        Sequences come out depth-first in dict insertion order. An empty
        trie yields nothing. Uses an explicit stack so depth is not bounded
        by the interpreter's recursion limit.
        """
        if not self.trie_dict:
            return
        path = []
        stack = [iter(self.trie_dict.items())]
        while stack:
            try:
                token, child = next(stack[-1])
            except StopIteration:
                # Finished this level: drop it and the token that led here
                # (the root level has no such token).
                stack.pop()
                if path:
                    path.pop()
                continue
            path.append(token)
            if child:
                stack.append(iter(child.items()))
            else:
                # Empty sub-dict marks the end of a stored sequence.
                yield list(path)
                path.pop()

    def __len__(self):
        """Return the stored sequence count (``self.len``)."""
        return self.len

    def __getitem__(self, value):
        """Alias for :meth:`get`: ``trie[prefix]`` returns allowed next tokens."""
        return self.get(value)

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff