haryoaw committed
Commit
acd7000
1 Parent(s): 570233d

initial commit

Files changed (6)
  1. .gitignore +2 -0
  2. app.py +78 -0
  3. gradio.py +40 -0
  4. requirements.txt +4 -0
  5. src/__init__.py +0 -0
  6. src/tokenizers.py +239 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ .mypy_cache
+ __pycache__
app.py ADDED
@@ -0,0 +1,78 @@
+ """
+ Main App
+ """
+
+ import streamlit as st
+ from transformers import AutoModelForSeq2SeqLM
+
+ from src.tokenizers import IndoNLGTokenizer
+
+
+ @st.cache(allow_output_mutation=True)
+ def fetch_tokenizer_model():
+     """
+     Fetch the tokenizer and the model.
+     """
+     tokenizer = IndoNLGTokenizer.from_pretrained("indobenchmark/indobart-v2")
+     model = AutoModelForSeq2SeqLM.from_pretrained("haryoaw/id-recigen-bart")
+     return tokenizer, model
+
+
+ tokenizer, model = fetch_tokenizer_model()
+
+
+ def predict_recipe(food: str) -> str:
+     """
+     Predict Ingredients Here!
+
+     Parameters
+     ----------
+     food: str
+         The food to generate ingredients for
+
+     Returns
+     -------
+     str
+         The generated ingredient list, one ingredient per line
+     """
+     inp = tokenizer(food.lower(), return_tensors="pt")["input_ids"]
+     generated = model.generate(
+         inp, max_length=500, do_sample=False, num_beams=10, num_beam_groups=2
+     )
+     returned_input: str = tokenizer.decode(generated[0], skip_special_tokens=True)
+     returned_input = "\n".join([x.strip() for x in returned_input.split("||")])
+     return returned_input
+
+
+ def create_frontend() -> None:
+     """
+     Create the Streamlit front end here.
+     """
+     st.markdown("# Food Ingredients Generator Indonesia Showcase!")
+     st.write("🥑Generate your ingredients here!")
+
+     with st.form("my_form"):
+         food_name = st.text_input(
+             "Food", value="Nasi Goreng Ayam", help="Input your food here!"
+         )
+         submitted = st.form_submit_button("Submit")
+         if submitted:
+             predicted = predict_recipe(food_name)
+             st.markdown(f"## Bahan ( Ingredients ) `{food_name}`:")
+             st.text(predicted)
+     st.markdown("## Additional Note")
+     st.write(
+         "❗Please note that the model was trained only on foods that use:"
+     )
+     for i, ingr in enumerate(("ayam", "tempe", "ikan", "kambing", "telur", "tahu", "sapi")):
+         st.write(f"{i+1}. {ingr}")
+
+     st.markdown("## Models")
+     st.markdown(
+         "🤗 Huggingface Model: [Link](https://huggingface.co/haryoaw/id-recigen-bart)"
+     )
+     st.write("Thank you 😊")
+
+
+ if __name__ == "__main__":
+     create_frontend()
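The app is served with "streamlit run app.py". For reference, the post-processing at the end of predict_recipe assumes the model emits a single string with ingredients separated by "||"; a minimal sketch of that step in isolation (the decoded string below is made up, not real model output):

    decoded = "2 siung bawang putih || 1 butir telur || garam secukupnya"  # hypothetical tokenizer.decode(...) result
    ingredients = "\n".join(x.strip() for x in decoded.split("||"))
    print(ingredients)  # prints one ingredient per line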
gradio.py ADDED
@@ -0,0 +1,40 @@
+ """
+ Main App
+ """
+
+ import gradio as gr
+ from transformers import AutoModelForSeq2SeqLM
+
+ from src.tokenizers import IndoNLGTokenizer
+
+
+ tokenizer = IndoNLGTokenizer.from_pretrained("indobenchmark/indobart-v2")
+ model = AutoModelForSeq2SeqLM.from_pretrained("haryoaw/id-recigen-bart")
+
+
+ def predict_recipe(food: str) -> str:
+     """
+     Predict Ingredients Here!
+
+     Parameters
+     ----------
+     food: str
+         The food that will be used
+     """
+     inp = tokenizer(food, return_tensors="pt")["input_ids"]
+     generated = model.generate(
+         inp, max_length=500, do_sample=False, num_beams=10, num_beam_groups=2
+     )
+     returned_input: str = tokenizer.decode(generated[0], skip_special_tokens=True)
+     returned_input = "\n".join([x.strip() for x in returned_input.split("||")])
+     return returned_input
+
+
+ iface = gr.Interface(
+     fn=predict_recipe,
+     inputs=[gr.inputs.Textbox(placeholder="Food Name")],
+     outputs="textbox",
+ )
+
+ if __name__ == "__main__":
+     app, local_url, share_url = iface.launch(share=False)
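gr.inputs.Textbox comes from the pre-3.0 Gradio namespace (deprecated in Gradio 3.x and removed in 4.x). If this entry point were run on a newer Gradio release, the equivalent wiring would look roughly like the sketch below; this is an assumption about the newer API, not part of the commit, and it reuses the predict_recipe defined above:

    import gradio as gr

    iface = gr.Interface(
        fn=predict_recipe,                           # same callable as defined above
        inputs=gr.Textbox(placeholder="Food Name"),  # gr.Textbox replaces gr.inputs.Textbox in Gradio 3+
        outputs="textbox",
    )
    iface.launch(share=False)  # launch() returns an (app, local_url, share_url) tuple, as unpacked above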
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ sentencepiece>=0.1.95
+ transformers
+ torch
+ streamlit==1.8.1
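Note that gradio itself is not pinned here, so these requirements only cover the Streamlit entry point (app.py); running gradio.py locally would additionally require installing gradio (for example with pip install gradio).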
src/__init__.py ADDED
File without changes
src/tokenizers.py ADDED
@@ -0,0 +1,239 @@
+ # coding=utf-8
+ # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Tokenization classes for the IndoNLG model."""
+
+ import os
+ from shutil import copyfile
+ from typing import List, Optional, Tuple
+ from transformers import PreTrainedTokenizer
+
+ import sentencepiece as spm
+
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+ VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
+
+ PRETRAINED_VOCAB_FILES_MAP = {
+     "vocab_file": {
+         "indobart": "https://huggingface.co/indobart/resolve/main/sentencepiece.bpe.model",
+         "indogpt": "https://huggingface.co/indogpt/resolve/main/sentencepiece.bpe.model",
+         "indobart-v2": "https://huggingface.co/indobart-v2/resolve/main/sentencepiece.bpe.model"
+     }
+ }
+
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+     "indobenchmark/indobart": 768,
+     "indobenchmark/indogpt": 768,
+     "indobenchmark/indobart-v2": 768
+ }
+
+ SHARED_MODEL_IDENTIFIERS = [
+     # Load with
+     "indobenchmark/indobart",
+     "indobenchmark/indogpt",
+     "indobenchmark/indobart-v2"
+ ]
+
+ SPIECE_UNDERLINE = "▁"
+
+ class IndoNLGTokenizer(PreTrainedTokenizer):
+     vocab_files_names = VOCAB_FILES_NAMES
+     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+     model_input_names = ["input_ids", "attention_mask"]
+
+     def __init__(
+         self,
+         vocab_file,
+         decode_special_token=True,
+         bos_token="<s>",
+         eos_token="</s>",
+         sep_token="</s>",
+         cls_token="<s>",
+         unk_token="<unk>",
+         pad_token="<pad>",
+         mask_token="<mask>",
+         additional_special_tokens=["[java]", "[sunda]", "[indonesia]", "<mask>"],
+         **kwargs
+     ):
+         super().__init__(
+             vocab_file=vocab_file,
+             bos_token=bos_token,
+             eos_token=eos_token,
+             unk_token=unk_token,
+             sep_token=sep_token,
+             cls_token=cls_token,
+             pad_token=pad_token,
+             mask_token=mask_token,
+             additional_special_tokens=additional_special_tokens,
+             **kwargs,
+         )
+         self.sp_model = spm.SentencePieceProcessor()
+         self.sp_model.Load(str(vocab_file))
+         self.vocab_file = vocab_file
+         self.decode_special_token = decode_special_token
+         self.model_max_length = 1024
+
+         # HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual
+         # sentencepiece vocabulary (this is the case for <s> and </s>).
+         self.special_tokens_to_ids = {
+             "[java]": 40000,
+             "[sunda]": 40001,
+             "[indonesia]": 40002,
+             "<mask>": 40003
+         }
+         self.special_ids_to_tokens = {v: k for k, v in self.special_tokens_to_ids.items()}
+
+         # Store language token IDs (note: '[javanese]'/'[sundanese]' differ from the '[java]'/'[sunda]' keys above)
+         self.javanese_token = '[javanese]'
+         self.javanese_token_id = 40000
+         self.sundanese_token = '[sundanese]'
+         self.sundanese_token_id = 40001
+         self.indonesian_token = '[indonesia]'
+         self.indonesian_token_id = 40002
+
+         self.special_token_ids = [
+             self.bos_token_id, self.eos_token_id, self.sep_token_id, self.cls_token_id,
+             self.unk_token_id, self.pad_token_id, self.mask_token_id,
+             self.javanese_token_id, self.sundanese_token_id, self.indonesian_token_id
+         ]
+
+     def build_inputs_with_special_tokens(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
+         adding special tokens. An IndoNLG sequence has the following format:
+         - single sequence: ``<s> X </s>``
+         - pair of sequences: ``<s> A </s></s> B </s>``
+         Args:
+             token_ids_0 (:obj:`List[int]`):
+                 List of IDs to which the special tokens will be added.
+             token_ids_1 (:obj:`List[int]`, `optional`):
+                 Optional second list of IDs for sequence pairs.
+         Returns:
+             :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+         """
+
+         if token_ids_1 is None:
+             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+         cls = [self.cls_token_id]
+         sep = [self.sep_token_id]
+         return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+
+     def get_special_tokens_mask(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+     ) -> List[int]:
+         """
+         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+         special tokens using the tokenizer ``prepare_for_model`` method.
+         Args:
+             token_ids_0 (:obj:`List[int]`):
+                 List of IDs.
+             token_ids_1 (:obj:`List[int]`, `optional`):
+                 Optional second list of IDs for sequence pairs.
+             already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                 Whether or not the token list is already formatted with special tokens for the model.
+         Returns:
+             :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+         """
+         if already_has_special_tokens:
+             return super().get_special_tokens_mask(
+                 token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+             )
+
+         if token_ids_1 is None:
+             return [1] + ([0] * len(token_ids_0)) + [1]
+         return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+     def create_token_type_ids_from_sequences(
+         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+     ) -> List[int]:
+         """
+         Create a mask from the two sequences passed to be used in a sequence-pair classification task. IndoNLG, like
+         RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.
+         Args:
+             token_ids_0 (:obj:`List[int]`):
+                 List of IDs.
+             token_ids_1 (:obj:`List[int]`, `optional`):
+                 Optional second list of IDs for sequence pairs.
+         Returns:
+             :obj:`List[int]`: List of zeros.
+         """
+         sep = [self.sep_token_id]
+         cls = [self.cls_token_id]
+
+         if token_ids_1 is None:
+             return len(cls + token_ids_0 + sep) * [0]
+         return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+
+     @property
+     def vocab_size(self):
+         return 4 + len(self.sp_model)
+
+     def get_vocab(self):
+         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+         vocab.update(self.added_tokens_encoder)
+         return vocab
+
+     def _tokenize(self, text: str) -> List[str]:
+         return self.sp_model.encode(text, out_type=str)
+
+     def _convert_token_to_id(self, token):
+         """Converts a token (str) to an id using the vocab."""
+         if token in self.special_tokens_to_ids:
+             return self.special_tokens_to_ids[token]
+         return self.sp_model.PieceToId(token)
+
+     def _convert_id_to_token(self, index):
+         """Converts an index (integer) to a token (str) using the vocab."""
+         if not self.decode_special_token and index in self.special_token_ids:
+             return ''
+
+         if index in self.special_ids_to_tokens:
+             return self.special_ids_to_tokens[index]
+
+         return self.sp_model.IdToPiece(index)
+
+     def __getstate__(self):
+         state = self.__dict__.copy()
+         state["sp_model"] = None
+         return state
+
+     def __setstate__(self, d):
+         self.__dict__ = d
+
+         # for backward compatibility
+         if not hasattr(self, "sp_model_kwargs"):
+             self.sp_model_kwargs = {}
+
+         self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+         self.sp_model.Load(self.vocab_file)
+
+     def convert_tokens_to_string(self, tokens):
+         """Converts a sequence of tokens (strings for sub-words) into a single string."""
+         return self.sp_model.decode(tokens)
+
+     def decode(self, inputs, skip_special_tokens=False):
+         prev_val = self.decode_special_token
+         self.decode_special_token = not skip_special_tokens  # let _convert_id_to_token honour skip_special_tokens
+
+         outputs = super().decode(inputs, skip_special_tokens=skip_special_tokens)
+         self.decode_special_token = prev_val
+
+         return outputs
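A minimal usage sketch of the tokenizer above, assuming the indobenchmark/indobart-v2 vocab file can be downloaded (the exact IDs and text are illustrative, not verified output):

    from src.tokenizers import IndoNLGTokenizer

    tokenizer = IndoNLGTokenizer.from_pretrained("indobenchmark/indobart-v2")
    ids = tokenizer("nasi goreng ayam")["input_ids"]        # <s> ... </s>, via build_inputs_with_special_tokens
    text = tokenizer.decode(ids, skip_special_tokens=True)  # round-trips back to plain text
    # The four extra tokens ([java], [sunda], [indonesia], <mask>) sit at IDs 40000-40003,
    # appended after the SentencePiece vocab, which is why vocab_size is 4 + len(self.sp_model).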