nielklug committed on
Commit 8778cfe
1 Parent(s): 7884ed6

add parsing

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. __pycache__/parse.cpython-38.pyc +0 -0
  2. app.py +11 -9
  3. benepar/__init__.py +20 -0
  4. benepar/__pycache__/__init__.cpython-310.pyc +0 -0
  5. benepar/__pycache__/__init__.cpython-37.pyc +0 -0
  6. benepar/__pycache__/__init__.cpython-38.pyc +0 -0
  7. benepar/__pycache__/char_lstm.cpython-310.pyc +0 -0
  8. benepar/__pycache__/char_lstm.cpython-37.pyc +0 -0
  9. benepar/__pycache__/char_lstm.cpython-38.pyc +0 -0
  10. benepar/__pycache__/decode_chart.cpython-310.pyc +0 -0
  11. benepar/__pycache__/decode_chart.cpython-37.pyc +0 -0
  12. benepar/__pycache__/decode_chart.cpython-38.pyc +0 -0
  13. benepar/__pycache__/nkutil.cpython-310.pyc +0 -0
  14. benepar/__pycache__/nkutil.cpython-37.pyc +0 -0
  15. benepar/__pycache__/nkutil.cpython-38.pyc +0 -0
  16. benepar/__pycache__/parse_base.cpython-310.pyc +0 -0
  17. benepar/__pycache__/parse_base.cpython-37.pyc +0 -0
  18. benepar/__pycache__/parse_base.cpython-38.pyc +0 -0
  19. benepar/__pycache__/parse_chart.cpython-310.pyc +0 -0
  20. benepar/__pycache__/parse_chart.cpython-37.pyc +0 -0
  21. benepar/__pycache__/parse_chart.cpython-38.pyc +0 -0
  22. benepar/__pycache__/partitioned_transformer.cpython-310.pyc +0 -0
  23. benepar/__pycache__/partitioned_transformer.cpython-37.pyc +0 -0
  24. benepar/__pycache__/partitioned_transformer.cpython-38.pyc +0 -0
  25. benepar/__pycache__/ptb_unescape.cpython-310.pyc +0 -0
  26. benepar/__pycache__/ptb_unescape.cpython-37.pyc +0 -0
  27. benepar/__pycache__/ptb_unescape.cpython-38.pyc +0 -0
  28. benepar/__pycache__/retokenization.cpython-310.pyc +0 -0
  29. benepar/__pycache__/retokenization.cpython-37.pyc +0 -0
  30. benepar/__pycache__/retokenization.cpython-38.pyc +0 -0
  31. benepar/__pycache__/subbatching.cpython-310.pyc +0 -0
  32. benepar/__pycache__/subbatching.cpython-37.pyc +0 -0
  33. benepar/__pycache__/subbatching.cpython-38.pyc +0 -0
  34. benepar/char_lstm.py +160 -0
  35. benepar/decode_chart.py +291 -0
  36. benepar/decode_chart.py~ +291 -0
  37. benepar/integrations/__init__.py +0 -0
  38. benepar/integrations/__pycache__/__init__.cpython-310.pyc +0 -0
  39. benepar/integrations/__pycache__/__init__.cpython-37.pyc +0 -0
  40. benepar/integrations/__pycache__/__init__.cpython-38.pyc +0 -0
  41. benepar/integrations/__pycache__/downloader.cpython-310.pyc +0 -0
  42. benepar/integrations/__pycache__/downloader.cpython-37.pyc +0 -0
  43. benepar/integrations/__pycache__/downloader.cpython-38.pyc +0 -0
  44. benepar/integrations/__pycache__/nltk_plugin.cpython-310.pyc +0 -0
  45. benepar/integrations/__pycache__/nltk_plugin.cpython-37.pyc +0 -0
  46. benepar/integrations/__pycache__/nltk_plugin.cpython-38.pyc +0 -0
  47. benepar/integrations/__pycache__/spacy_extensions.cpython-310.pyc +0 -0
  48. benepar/integrations/__pycache__/spacy_extensions.cpython-37.pyc +0 -0
  49. benepar/integrations/__pycache__/spacy_extensions.cpython-38.pyc +0 -0
  50. benepar/integrations/__pycache__/spacy_plugin.cpython-310.pyc +0 -0
__pycache__/parse.cpython-38.pyc CHANGED
Binary files a/__pycache__/parse.cpython-38.pyc and b/__pycache__/parse.cpython-38.pyc differ
 
app.py CHANGED
@@ -1,5 +1,5 @@
 import streamlit as st
-# from parse import parse_text
+from parse import parse
 from nltk import Tree
 import pandas as pd
 import re
@@ -31,19 +31,21 @@ if text:
 
     df = pd.DataFrame(zipped, columns=['Token', 'Tag', 'Prob.'])
 
-    # # Convert the bracket parse tree into an NLTK Tree
-    # t = Tree.fromstring(re.sub(r'(\.[^ )]+)+', '', parse_tree))
+    parse_tree = parse(tokens)
 
-    # tree_svg = TreePrettyPrinter(t).svg(nodecolor='black', leafcolor='black', funccolor='black')
+    # Convert the bracket parse tree into an NLTK Tree
+    t = Tree.fromstring(re.sub(r'-[^ )]*', '', parse_tree))
+
+    tree_svg = TreePrettyPrinter(t).svg(nodecolor='black', leafcolor='black', funccolor='black')
 
     col1 = st.columns(1)[0]
     col1.header("POS tagging result:")
     col1.table(df)
 
-    # col2 = st.columns(1)[0]
-    # col2.header("Parsing result:")
-    # col2.write(parse_tree.replace('_', '\_').replace('$', '\$').replace('*', '\*'))
+    col2 = st.columns(1)[0]
+    col2.header("Parsing result:")
+    col2.write(parse_tree.replace('_', '\_').replace('$', '\$').replace('*', '\*'))
 
-    # # Display the graph in the Streamlit app
-    # col2.image(tree_svg, use_column_width=True)
+    # Display the graph in the Streamlit app
+    col2.image(tree_svg, use_column_width=True)
 
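For reference, the tree-rendering path added above can be exercised outside Streamlit. A minimal sketch, assuming a bracket parse string of the kind parse(tokens) returns; the sample string and its labels are made up, Tree.fromstring and TreePrettyPrinter are standard NLTK APIs, and app.py presumably imports TreePrettyPrinter somewhere outside the lines shown in this hunk:

import re
from nltk import Tree
from nltk.treeprettyprinter import TreePrettyPrinter

# Hypothetical parser output: labels carry "-" suffixes (e.g. grammatical
# functions) that app.py strips before rendering.
parse_tree = "(S (NP-SB (ART Die) (NN Katze)) (VP-HD (VVFIN schläft)))"

# Same cleanup as app.py: delete "-" and everything up to the next space
# or closing bracket inside each label.
t = Tree.fromstring(re.sub(r'-[^ )]*', '', parse_tree))

# Render to an SVG string, as the app does before col2.image(...).
tree_svg = TreePrettyPrinter(t).svg(nodecolor='black', leafcolor='black', funccolor='black')
print(tree_svg[:40], '...')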
benepar/__init__.py ADDED
@@ -0,0 +1,20 @@
+"""
+benepar: Berkeley Neural Parser
+"""
+
+# This file and all code in integrations/ relate to the version of the parser
+# released via PyPI. If you only need to run research experiments, it is safe
+# to delete the integrations/ folder and replace this __init__.py with an
+# empty file.
+
+__all__ = [
+    "Parser",
+    "InputSentence",
+    "download",
+    "BeneparComponent",
+    "NonConstituentException",
+]
+
+from .integrations.downloader import download
+from .integrations.nltk_plugin import Parser, InputSentence
+from .integrations.spacy_plugin import BeneparComponent, NonConstituentException
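The exports above form benepar's documented public API. A minimal usage sketch following the benepar README; the model name "benepar_en3" is benepar's published English model and is only an assumption here, since this Space may load its own weights through parse.py instead:

import benepar

benepar.download("benepar_en3")  # fetch a released model via NLTK's downloader
parser = benepar.Parser("benepar_en3")

# Parse a pre-tokenized sentence; InputSentence can also carry POS tags.
tree = parser.parse(benepar.InputSentence(words=["The", "cat", "slept", "."]))
print(tree.pformat())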
benepar/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (526 Bytes)
benepar/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (521 Bytes)
benepar/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (505 Bytes)
benepar/__pycache__/char_lstm.cpython-310.pyc ADDED
Binary file (4.94 kB)
benepar/__pycache__/char_lstm.cpython-37.pyc ADDED
Binary file (4.92 kB)
benepar/__pycache__/char_lstm.cpython-38.pyc ADDED
Binary file (4.96 kB)
benepar/__pycache__/decode_chart.cpython-310.pyc ADDED
Binary file (10.2 kB)
benepar/__pycache__/decode_chart.cpython-37.pyc ADDED
Binary file (10.2 kB)
benepar/__pycache__/decode_chart.cpython-38.pyc ADDED
Binary file (10.2 kB)
benepar/__pycache__/nkutil.cpython-310.pyc ADDED
Binary file (2.14 kB)
benepar/__pycache__/nkutil.cpython-37.pyc ADDED
Binary file (2.1 kB)
benepar/__pycache__/nkutil.cpython-38.pyc ADDED
Binary file (2.09 kB)
benepar/__pycache__/parse_base.cpython-310.pyc ADDED
Binary file (7.38 kB)
benepar/__pycache__/parse_base.cpython-37.pyc ADDED
Binary file (7.16 kB)
benepar/__pycache__/parse_base.cpython-38.pyc ADDED
Binary file (7.26 kB)
benepar/__pycache__/parse_chart.cpython-310.pyc ADDED
Binary file (11 kB)
benepar/__pycache__/parse_chart.cpython-37.pyc ADDED
Binary file (11 kB)
benepar/__pycache__/parse_chart.cpython-38.pyc ADDED
Binary file (11.1 kB)
benepar/__pycache__/partitioned_transformer.cpython-310.pyc ADDED
Binary file (7.83 kB)
benepar/__pycache__/partitioned_transformer.cpython-37.pyc ADDED
Binary file (7.9 kB)
benepar/__pycache__/partitioned_transformer.cpython-38.pyc ADDED
Binary file (7.82 kB)
benepar/__pycache__/ptb_unescape.cpython-310.pyc ADDED
Binary file (3.05 kB)
benepar/__pycache__/ptb_unescape.cpython-37.pyc ADDED
Binary file (3.2 kB)
benepar/__pycache__/ptb_unescape.cpython-38.pyc ADDED
Binary file (3.19 kB)
benepar/__pycache__/retokenization.cpython-310.pyc ADDED
Binary file (6.83 kB)
benepar/__pycache__/retokenization.cpython-37.pyc ADDED
Binary file (6.73 kB)
benepar/__pycache__/retokenization.cpython-38.pyc ADDED
Binary file (6.83 kB)
benepar/__pycache__/subbatching.cpython-310.pyc ADDED
Binary file (2.53 kB)
benepar/__pycache__/subbatching.cpython-37.pyc ADDED
Binary file (2.49 kB)
benepar/__pycache__/subbatching.cpython-38.pyc ADDED
Binary file (2.48 kB)
benepar/char_lstm.py ADDED
@@ -0,0 +1,160 @@
+"""
+Character LSTM implementation (matches https://arxiv.org/pdf/1805.01052.pdf)
+"""
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class CharacterLSTM(nn.Module):
+    def __init__(self, num_embeddings, d_embedding, d_out, char_dropout=0.0, **kwargs):
+        super().__init__()
+
+        self.d_embedding = d_embedding
+        self.d_out = d_out
+
+        self.lstm = nn.LSTM(
+            self.d_embedding, self.d_out // 2, num_layers=1, bidirectional=True
+        )
+
+        self.emb = nn.Embedding(num_embeddings, self.d_embedding, **kwargs)
+        self.char_dropout = nn.Dropout(char_dropout)
+
+    def forward(self, chars_packed, valid_token_mask):
+        inp_embs = nn.utils.rnn.PackedSequence(
+            self.char_dropout(self.emb(chars_packed.data)),
+            batch_sizes=chars_packed.batch_sizes,
+            sorted_indices=chars_packed.sorted_indices,
+            unsorted_indices=chars_packed.unsorted_indices,
+        )
+
+        _, (lstm_out, _) = self.lstm(inp_embs)
+        lstm_out = torch.cat([lstm_out[0], lstm_out[1]], -1)
+
+        # Switch to a representation where there are dummy vectors for invalid
+        # tokens generated by padding.
+        res = lstm_out.new_zeros(
+            (valid_token_mask.shape[0], valid_token_mask.shape[1], lstm_out.shape[-1])
+        )
+        res[valid_token_mask] = lstm_out
+        return res
+
+
+class RetokenizerForCharLSTM:
+    # Assumes that these control characters are not present in treebank text
+    CHAR_UNK = "\0"
+    CHAR_ID_UNK = 0
+    CHAR_START_SENTENCE = "\1"
+    CHAR_START_WORD = "\2"
+    CHAR_STOP_WORD = "\3"
+    CHAR_STOP_SENTENCE = "\4"
+
+    def __init__(self, char_vocab):
+        self.char_vocab = char_vocab
+
+    @classmethod
+    def build_vocab(cls, sentences):
+        char_set = set()
+        for sentence in sentences:
+            if isinstance(sentence, tuple):
+                sentence = sentence[0]
+            for word in sentence:
+                char_set |= set(word)
+
+        # If codepoints are small (e.g. Latin alphabet), index by codepoint
+        # directly
+        highest_codepoint = max(ord(char) for char in char_set)
+        if highest_codepoint < 512:
+            if highest_codepoint < 256:
+                highest_codepoint = 256
+            else:
+                highest_codepoint = 512
+
+            char_vocab = {}
+            # This also takes care of constants like CHAR_UNK, etc.
+            for codepoint in range(highest_codepoint):
+                char_vocab[chr(codepoint)] = codepoint
+            return char_vocab
+        else:
+            char_vocab = {}
+            char_vocab[cls.CHAR_UNK] = 0
+            char_vocab[cls.CHAR_START_SENTENCE] = 1
+            char_vocab[cls.CHAR_START_WORD] = 2
+            char_vocab[cls.CHAR_STOP_WORD] = 3
+            char_vocab[cls.CHAR_STOP_SENTENCE] = 4
+            for id_, char in enumerate(sorted(char_set), start=5):
+                char_vocab[char] = id_
+            return char_vocab
+
+    def __call__(self, words, space_after="ignored", return_tensors=None):
+        if return_tensors != "np":
+            raise NotImplementedError("Only return_tensors='np' is supported.")
+
+        res = {}
+
+        # Sentence-level start/stop tokens are encoded as 3 pseudo-chars
+        # Within each word, account for 2 start/stop characters
+        max_word_len = max(3, max(len(word) for word in words)) + 2
+        char_ids = np.zeros((len(words) + 2, max_word_len), dtype=int)
+        word_lens = np.zeros(len(words) + 2, dtype=int)
+
+        char_ids[0, :5] = [
+            self.char_vocab[self.CHAR_START_WORD],
+            self.char_vocab[self.CHAR_START_SENTENCE],
+            self.char_vocab[self.CHAR_START_SENTENCE],
+            self.char_vocab[self.CHAR_START_SENTENCE],
+            self.char_vocab[self.CHAR_STOP_WORD],
+        ]
+        word_lens[0] = 5
+        for i, word in enumerate(words, start=1):
+            char_ids[i, 0] = self.char_vocab[self.CHAR_START_WORD]
+            for j, char in enumerate(word, start=1):
+                char_ids[i, j] = self.char_vocab.get(char, self.CHAR_ID_UNK)
+            char_ids[i, j + 1] = self.char_vocab[self.CHAR_STOP_WORD]
+            word_lens[i] = j + 2
+        char_ids[i + 1, :5] = [
+            self.char_vocab[self.CHAR_START_WORD],
+            self.char_vocab[self.CHAR_STOP_SENTENCE],
+            self.char_vocab[self.CHAR_STOP_SENTENCE],
+            self.char_vocab[self.CHAR_STOP_SENTENCE],
+            self.char_vocab[self.CHAR_STOP_WORD],
+        ]
+        word_lens[i + 1] = 5
+
+        res["char_ids"] = char_ids
+        res["word_lens"] = word_lens
+        res["valid_token_mask"] = np.ones_like(word_lens, dtype=bool)
+
+        return res
+
+    def pad(self, examples, return_tensors=None):
+        if return_tensors != "pt":
+            raise NotImplementedError("Only return_tensors='pt' is supported.")
+        max_word_len = max(example["char_ids"].shape[-1] for example in examples)
+        char_ids = torch.cat(
+            [
+                F.pad(
+                    torch.tensor(example["char_ids"]),
+                    (0, max_word_len - example["char_ids"].shape[-1]),
+                )
+                for example in examples
+            ]
+        )
+        word_lens = torch.cat(
+            [torch.tensor(example["word_lens"]) for example in examples]
+        )
+        valid_token_mask = nn.utils.rnn.pad_sequence(
+            [torch.tensor(example["valid_token_mask"]) for example in examples],
+            batch_first=True,
+            padding_value=False,
+        )
+
+        char_ids = nn.utils.rnn.pack_padded_sequence(
+            char_ids, word_lens, batch_first=True, enforce_sorted=False
+        )
+        return {
+            "char_ids": char_ids,
+            "valid_token_mask": valid_token_mask,
+        }
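A short end-to-end sketch of how the two classes above fit together; the sentence and the layer sizes are arbitrary choices for illustration:

import torch
from benepar.char_lstm import CharacterLSTM, RetokenizerForCharLSTM

words = ["the", "cat", "sat"]

# Build a character vocabulary; for Latin text this indexes by codepoint.
char_vocab = RetokenizerForCharLSTM.build_vocab([words])
retokenizer = RetokenizerForCharLSTM(char_vocab)

# Encode one sentence as numpy arrays, then pad/pack a batch of one.
example = retokenizer(words, return_tensors="np")
batch = retokenizer.pad([example], return_tensors="pt")

# d_out must be even: the LSTM is bidirectional with d_out // 2 units per direction.
model = CharacterLSTM(num_embeddings=len(char_vocab), d_embedding=32, d_out=64)
res = model(batch["char_ids"], batch["valid_token_mask"])

# One vector per token, including the sentence start/stop pseudo-tokens.
print(res.shape)  # torch.Size([1, 5, 64])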
benepar/decode_chart.py ADDED
@@ -0,0 +1,291 @@
+"""
+Parsing formulated as span classification (https://arxiv.org/abs/1705.03919)
+"""
+
+import nltk
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch_struct
+
+from .parse_base import CompressedParserOutput
+
+
+def pad_charts(charts, padding_value=-100):
+    """Pad a list of variable-length charts with `padding_value`."""
+    batch_size = len(charts)
+    max_len = max(chart.shape[0] for chart in charts)
+    padded_charts = torch.full(
+        (batch_size, max_len, max_len),
+        padding_value,
+        dtype=charts[0].dtype,
+        device=charts[0].device,
+    )
+    for i, chart in enumerate(charts):
+        chart_size = chart.shape[0]
+        padded_charts[i, :chart_size, :chart_size] = chart
+    return padded_charts
+
+
+def collapse_unary_strip_pos(tree, strip_top=True):
+    """Collapse unary chains and strip part of speech tags."""
+
+    def strip_pos(tree):
+        if len(tree) == 1 and isinstance(tree[0], str):
+            return tree[0]
+        else:
+            return nltk.tree.Tree(tree.label(), [strip_pos(child) for child in tree])
+
+    collapsed_tree = strip_pos(tree)
+    collapsed_tree.collapse_unary(collapsePOS=True, joinChar="::")
+    if collapsed_tree.label() in ("TOP", "ROOT", "S1", "VROOT"):
+        if strip_top:
+            if len(collapsed_tree) == 1:
+                collapsed_tree = collapsed_tree[0]
+            else:
+                collapsed_tree.set_label("")
+        elif len(collapsed_tree) == 1:
+            collapsed_tree[0].set_label(
+                collapsed_tree.label() + "::" + collapsed_tree[0].label())
+            collapsed_tree = collapsed_tree[0]
+    return collapsed_tree
+
+
+def _get_labeled_spans(tree, spans_out, start):
+    if isinstance(tree, str):
+        return start + 1
+
+    assert len(tree) > 1 or isinstance(
+        tree[0], str
+    ), "Must call collapse_unary_strip_pos first"
+    end = start
+    for child in tree:
+        end = _get_labeled_spans(child, spans_out, end)
+    # Spans are returned as closed intervals on both ends
+    spans_out.append((start, end - 1, tree.label()))
+    return end
+
+
+def get_labeled_spans(tree):
+    """Converts a tree into a list of labeled spans.
+
+    Args:
+        tree: an nltk.tree.Tree object
+
+    Returns:
+        A list of (span_start, span_end, span_label) tuples. The start and end
+        indices indicate the first and last words of the span (a closed
+        interval). Unary chains are collapsed, so e.g. a (S (VP ...)) will
+        result in a single span labeled "S+VP".
+    """
+    tree = collapse_unary_strip_pos(tree)
+    spans_out = []
+    _get_labeled_spans(tree, spans_out, start=0)
+    return spans_out
+
+
+def uncollapse_unary(tree, ensure_top=False):
+    """Un-collapse unary chains."""
+    if isinstance(tree, str):
+        return tree
+    else:
+        labels = tree.label().split("::")
+        if ensure_top and labels[0] != "TOP":
+            labels = ["TOP"] + labels
+        children = []
+        for child in tree:
+            child = uncollapse_unary(child)
+            children.append(child)
+        for label in labels[::-1]:
+            children = [nltk.tree.Tree(label, children)]
+        return children[0]
+
+
+class ChartDecoder:
+    """A chart decoder for parsing formulated as span classification."""
+
+    def __init__(self, label_vocab, force_root_constituent=True):
+        """Constructs a new ChartDecoder object.
+        Args:
+            label_vocab: A mapping from span labels to integer indices.
+        """
+        self.label_vocab = label_vocab
+        self.label_from_index = {i: label for label, i in label_vocab.items()}
+        self.force_root_constituent = force_root_constituent
+
+    @staticmethod
+    def build_vocab(trees):
+        label_set = set()
+        for tree in trees:
+            for _, _, label in get_labeled_spans(tree):
+                if label:
+                    label_set.add(label)
+        label_set = [""] + sorted(label_set)
+        return {label: i for i, label in enumerate(label_set)}
+
+    @staticmethod
+    def infer_force_root_constituent(trees):
+        for tree in trees:
+            for _, _, label in get_labeled_spans(tree):
+                if not label:
+                    return False
+        return True
+
+    def chart_from_tree(self, tree):
+        spans = get_labeled_spans(tree)
+        num_words = len(tree.leaves())
+        chart = np.full((num_words, num_words), -100, dtype=int)
+        chart = np.tril(chart, -1)
+        # Now all invalid entries are filled with -100, and valid entries with 0
+        for start, end, label in spans:
+            # Previously unseen unary chains can occur in the dev/test sets.
+            # For now, we ignore them and don't mark the corresponding chart
+            # entry as a constituent.
+            if label in self.label_vocab:
+                chart[start, end] = self.label_vocab[label]
+        return chart
+
+    def charts_from_pytorch_scores_batched(self, scores, lengths):
+        """Runs CKY to recover span labels from scores (e.g. logits).
+
+        This method uses pytorch-struct to speed up decoding compared to the
+        pure-Python implementation of CKY used by tree_from_scores().
+
+        Args:
+            scores: a pytorch tensor of shape (batch size, max length,
+                max length, label vocab size).
+            lengths: a pytorch tensor of shape (batch size,)
+
+        Returns:
+            A list of numpy arrays, each of shape (sentence length, sentence
+            length).
+        """
+        scores = scores.detach()
+        scores = scores - scores[..., :1]
+        if self.force_root_constituent:
+            scores[torch.arange(scores.shape[0]), 0, lengths - 1, 0] -= 1e9
+        dist = torch_struct.TreeCRF(scores, lengths=lengths)
+        amax = dist.argmax
+        amax[..., 0] += 1e-9
+        padded_charts = amax.argmax(-1)
+        padded_charts = padded_charts.detach().cpu().numpy()
+        return [
+            chart[:length, :length] for chart, length in zip(padded_charts, lengths)
+        ]
+
+    def compressed_output_from_chart(self, chart):
+        chart_with_filled_diagonal = chart.copy()
+        np.fill_diagonal(chart_with_filled_diagonal, 1)
+        chart_with_filled_diagonal[0, -1] = 1
+        starts, inclusive_ends = np.where(chart_with_filled_diagonal)
+        preorder_sort = np.lexsort((-inclusive_ends, starts))
+        starts = starts[preorder_sort]
+        inclusive_ends = inclusive_ends[preorder_sort]
+        labels = chart[starts, inclusive_ends]
+        ends = inclusive_ends + 1
+        return CompressedParserOutput(starts=starts, ends=ends, labels=labels)
+
+    def tree_from_chart(self, chart, leaves):
+        compressed_output = self.compressed_output_from_chart(chart)
+        return compressed_output.to_tree(leaves, self.label_from_index)
+
+    def tree_from_scores(self, scores, leaves):
+        """Runs CKY to decode a tree from scores (e.g. logits).
+
+        If speed is important, consider using charts_from_pytorch_scores_batched
+        followed by compressed_output_from_chart or tree_from_chart instead.
+
+        Args:
+            scores: a chart of scores (or logits) of shape
+                (sentence length, sentence length, label vocab size). The first
+                two dimensions may be padded to a longer length, but all padded
+                values will be ignored.
+            leaves: the leaf nodes to use in the constructed tree. These
+                may be of type str or nltk.Tree, or (word, tag) tuples that
+                will be used to construct the leaf node objects.
+
+        Returns:
+            An nltk.Tree object.
+        """
+        leaves = [
+            nltk.Tree(node[1], [node[0]]) if isinstance(node, tuple) else node
+            for node in leaves
+        ]
+
+        chart = {}
+        scores = scores - scores[:, :, 0, None]
+        for length in range(1, len(leaves) + 1):
+            for left in range(0, len(leaves) + 1 - length):
+                right = left + length
+
+                label_scores = scores[left, right - 1]
+                label_scores = label_scores - label_scores[0]
+
+                argmax_label_index = int(
+                    label_scores.argmax()
+                    if length < len(leaves) or not self.force_root_constituent
+                    else label_scores[1:].argmax() + 1
+                )
+                argmax_label = self.label_from_index[argmax_label_index]
+                label = argmax_label
+                label_score = label_scores[argmax_label_index]
+
+                if length == 1:
+                    tree = leaves[left]
+                    if label:
+                        tree = nltk.tree.Tree(label, [tree])
+                    chart[left, right] = [tree], label_score
+                    continue
+
+                best_split = max(
+                    range(left + 1, right),
+                    key=lambda split: (chart[left, split][1] + chart[split, right][1]),
+                )
+
+                left_trees, left_score = chart[left, best_split]
+                right_trees, right_score = chart[best_split, right]
+
+                children = left_trees + right_trees
+                if label:
+                    children = [nltk.tree.Tree(label, children)]
+
+                chart[left, right] = (children, label_score + left_score + right_score)
+
+        children, score = chart[0, len(leaves)]
+        tree = nltk.tree.Tree("TOP", children)
+        tree = uncollapse_unary(tree)
+        return tree
+
+
+class SpanClassificationMarginLoss(nn.Module):
+    def __init__(self, force_root_constituent=True, reduction="mean"):
+        super().__init__()
+        self.force_root_constituent = force_root_constituent
+        if reduction not in ("none", "mean", "sum"):
+            raise ValueError(f"Invalid value for reduction: {reduction}")
+        self.reduction = reduction
+
+    def forward(self, logits, labels):
+        gold_event = F.one_hot(F.relu(labels), num_classes=logits.shape[-1])
+
+        logits = logits - logits[..., :1]
+        lengths = (labels[:, 0, :] != -100).sum(-1)
+        augment = (1 - gold_event).to(torch.float)
+        if self.force_root_constituent:
+            augment[torch.arange(augment.shape[0]), 0, lengths - 1, 0] -= 1e9
+        dist = torch_struct.TreeCRF(logits + augment, lengths=lengths)
+
+        pred_score = dist.max
+        gold_score = (logits * gold_event).sum((1, 2, 3))
+
+        margin_losses = F.relu(pred_score - gold_score)
+
+        if self.reduction == "none":
+            return margin_losses
+        elif self.reduction == "mean":
+            return margin_losses.mean()
+        elif self.reduction == "sum":
+            return margin_losses.sum()
+        else:
+            assert False, f"Unexpected reduction: {self.reduction}"
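A quick sketch of the span representation these helpers produce (importing the module requires torch_struct; the toy tree is made up, and span indices are closed intervals over word positions):

import nltk
from benepar.decode_chart import ChartDecoder, get_labeled_spans

tree = nltk.Tree.fromstring("(TOP (S (NP (DT the) (NN cat)) (VP (VBD sat))))")

# POS tags are stripped and unary chains collapsed before span extraction.
print(get_labeled_spans(tree))
# [(0, 1, 'NP'), (2, 2, 'VP'), (0, 2, 'S')]

# The same spans written into a num_words x num_words chart of label ids,
# with label_vocab {'': 0, 'NP': 1, 'S': 2, 'VP': 3} and -100 marking
# below-diagonal entries that can never be constituents.
decoder = ChartDecoder(label_vocab=ChartDecoder.build_vocab([tree]))
print(decoder.chart_from_tree(tree))
# [[   0    1    2]
#  [-100    0    0]
#  [-100 -100    3]]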
benepar/decode_chart.py~ ADDED
@@ -0,0 +1,291 @@
(editor backup file; contents identical to benepar/decode_chart.py above)
benepar/integrations/__init__.py ADDED
File without changes
benepar/integrations/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (208 Bytes)
benepar/integrations/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (191 Bytes)
benepar/integrations/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (173 Bytes)
benepar/integrations/__pycache__/downloader.cpython-310.pyc ADDED
Binary file (1.35 kB)
benepar/integrations/__pycache__/downloader.cpython-37.pyc ADDED
Binary file (1.31 kB)
benepar/integrations/__pycache__/downloader.cpython-38.pyc ADDED
Binary file (1.31 kB)
benepar/integrations/__pycache__/nltk_plugin.cpython-310.pyc ADDED
Binary file (11.2 kB)
benepar/integrations/__pycache__/nltk_plugin.cpython-37.pyc ADDED
Binary file (11.1 kB)
benepar/integrations/__pycache__/nltk_plugin.cpython-38.pyc ADDED
Binary file (11.2 kB)
benepar/integrations/__pycache__/spacy_extensions.cpython-310.pyc ADDED
Binary file (4.44 kB)
benepar/integrations/__pycache__/spacy_extensions.cpython-37.pyc ADDED
Binary file (4.32 kB)
benepar/integrations/__pycache__/spacy_extensions.cpython-38.pyc ADDED
Binary file (4.36 kB)
benepar/integrations/__pycache__/spacy_plugin.cpython-310.pyc ADDED
Binary file (6.63 kB)