nlpia-rnn / app.py
hobs
torchreq
6b0c8c6
# app.py
import gradio as gr
import json
from pathlib import Path
# import random
# import time
import string
import torch
import torch.nn as nn
import unicodedata
from unidecode import unidecode
ASCII_LETTERS = string.ascii_letters
ASCII_PRINTABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'
ASCII_PRINTABLE_COMMON = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r'
ASCII_VERTICAL_TAB = '\x0b'
ASCII_PAGE_BREAK = '\x0c'
ASCII_ALL = ''.join(chr(i) for i in range(0, 128)) # ASCII_PRINTABLE
ASCII_DIGITS = string.digits
ASCII_IMPORTANT_PUNCTUATION = " .?!,;'-=+)(:"
ASCII_NAME_PUNCTUATION = " .,;'-"
ASCII_NAME_CHARS = set(ASCII_LETTERS + ASCII_NAME_PUNCTUATION)
ASCII_IMPORTANT_CHARS = set(ASCII_LETTERS + ASCII_IMPORTANT_PUNCTUATION)
CURLY_SINGLE_QUOTES = '‘’`´'
STRAIGHT_SINGLE_QUOTES = "'" * len(CURLY_SINGLE_QUOTES)
CURLY_DOUBLE_QUOTES = '“”'
STRAIGHT_DOUBLE_QUOTES = '"' * len(CURLY_DOUBLE_QUOTES)
def normalize_newlines(s):
s = s.replace(ASCII_VERTICAL_TAB, '\n')
s = s.replace(ASCII_PAGE_BREAK, '\n\n')
class Asciifier:
""" Construct a function that filters out all non-ascii unicode characters
>>> test_str = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'
>>> Asciifier(include='a b c 123XYZ')(test_str):
'123abcXYZ '
"""
def __init__(
self,
min_ord=1, max_ord=128,
exclude=None,
include=ASCII_PRINTABLE,
exclude_category='Mn',
normalize_quotes=True,
):
self.include = set(sorted(include or ASCII_PRINTABLE))
self._include = ''.join(sorted(self.include))
self.exclude = exclude or set()
self.exclude = set(sorted(exclude or []))
self._exclude = ''.join(self.exclude)
self.min_ord, self.max_ord = int(min_ord), int(max_ord or 128)
self.normalize_quotes = normalize_quotes
if self.min_ord:
self.include = set(c for c in self.include if ord(c) >= self.min_ord)
if self.max_ord:
self.include = set(c for c in self._include if ord(c) <= self.max_ord)
if exclude_category:
self.include = set(
c for c in self._include if unicodedata.category(c) != exclude_category)
self.vocab = sorted(self.include - self.exclude)
self._vocab = ''.join(self.vocab)
self.char2i = {c: i for (i, c) in enumerate(self._vocab)}
self._translate_from = self._vocab
self._translate_to = self._translate_from
# FIXME: self.normalize_quotes is accomplished by unidecode.unidecode!!
# ’->' ‘->' “->" ”->"
if self.normalize_quotes:
trans_table = str.maketrans(
CURLY_SINGLE_QUOTES + CURLY_DOUBLE_QUOTES,
STRAIGHT_SINGLE_QUOTES + STRAIGHT_DOUBLE_QUOTES)
self._translate_to = self._translate_to.translate(trans_table)
# print(self._translate_to)
# eliminate any non-translations (if from == to)
self._translate_from_filtered = ''
self._translate_to_filtered = ''
for c1, c2 in zip(self._translate_from, self._translate_to):
if c1 == c2:
continue
else:
self._translate_from_filtered += c1
self._translate_to_filtered += c2
self._translate_del = ''
for c in ASCII_ALL:
if c not in self.vocab:
self._translate_del += c
self._translate_from = self._translate_from_filtered
self._translate_to = self._translate_to_filtered
self.translation_table = str.maketrans(
self._translate_from,
self._translate_to,
self._translate_del)
def __call__(self, text):
return unidecode(unicodedata.normalize('NFD', text)).translate(self.translation_table)
name_char_vocab_size = len(ASCII_NAME_CHARS) + 1 # Plus EOS marker
# Transcode Unicode str ASCII without embelishments, diacritics (https://stackoverflow.com/a/518232/2809427)
asciify = Asciifier(include=ASCII_NAME_CHARS)
def find_files(path, pattern):
return Path(path).glob(pattern)
# all_letters = ''.join(set(ASCII_NAME_CHARS).union(set(" .,;'")))
char2i = {c: i for i, c in enumerate(ASCII_NAME_CHARS)}
# !curl -O https://download.pytorch.org/tutorial/data.zip; unzip data.zip
print(f'asciify("O’Néàl") => {asciify("O’Néàl")}')
categories = json.load(open('categories.json'))
n_categories = len(categories)
######################################################################
# Turning Names into Tensors
# --------------------------
#
# Now that we have all the names organized, we need to turn them into
# Tensors to make any use of them.
#
# To represent a single letter, we use a "one-hot vector" of size
# ``<1 x n_letters>``. A one-hot vector is filled with 0s except for a 1
# at index of the current letter, e.g. ``"b" = <0 1 0 0 0 ...>``.
#
# To make a word we join a bunch of those into a 2D matrix
# ``<line_length x 1 x n_letters>``.
#
# That extra 1 dimension is because PyTorch assumes everything is in
# batches - we're just using a batch size of 1 here.
#
# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(c):
return char2i[c]
# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def encode_one_hot_vec(letter):
tensor = torch.zeros(1, len(ASCII_NAME_CHARS))
tensor[0][letterToIndex(letter)] = 1
return tensor
# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def encode_one_hot_seq(line):
tensor = torch.zeros(len(line), 1, len(ASCII_NAME_CHARS))
for li, letter in enumerate(line):
tensor[li][0][letterToIndex(letter)] = 1
return tensor
class RNN(nn.Module):
def __init__(self, input_size, hidden_size, output_size):
super(RNN, self).__init__()
self.hidden_size = hidden_size
self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
self.i2o = nn.Linear(input_size + hidden_size, output_size)
self.softmax = nn.LogSoftmax(dim=1)
def forward(self, char_tens, hidden):
combined = torch.cat((char_tens, hidden), 1)
hidden = self.i2h(combined)
output = self.i2o(combined)
output = self.softmax(output)
return output, hidden
def initHidden(self):
return torch.zeros(1, self.hidden_size)
n_hidden = 128
rnn = RNN(len(ASCII_NAME_CHARS), n_hidden, n_categories)
input = encode_one_hot_vec('A')
hidden = torch.zeros(1, n_hidden)
output, next_hidden = rnn(input, hidden)
def categoryFromOutput(output):
top_n, top_i = output.topk(1)
category_i = top_i[0].item()
return categories[category_i], category_i
def output_from_str(s):
global rnn
input = encode_one_hot_seq(s)
hidden = torch.zeros(1, n_hidden)
output, next_hidden = rnn(input[0], hidden)
print(output)
return categoryFromOutput(output)
########################################
# load/save test for use on the huggingface spaces server
# torch.save(rnn.state_dict(), 'rnn_from_scratch_name_nationality.state_dict.pickle')
state_dict = torch.load('rnn_from_scratch_name_nationality.state_dict.pickle')
rnn.load_state_dict(state_dict)
def evaluate(line_tensor):
hidden = rnn.initHidden()
for i in range(line_tensor.size()[0]):
output, hidden = rnn(line_tensor[i], hidden)
return output
def predict(input_line, n_predictions=3):
print('\n> %s' % input_line)
with torch.no_grad():
output = evaluate(encode_one_hot_seq(input_line))
# Get top N categories
topv, topi = output.topk(n_predictions, 1, True)
predictions = []
for i in range(n_predictions):
value = topv[0][i].item()
category_index = topi[0][i].item()
print('(%.2f) %s' % (value, categories[category_index]))
predictions.append([value, categories[category_index]])
predict('Dovesky')
predict('Jackson')
predict('Satoshi')
# load/save test for use on the huggingface spaces server
########################################
def greet_nationality(name):
nationality = predict(name)
return f"Hello {name}!!\n Your name seems to be from {nationality}. Am I right?"
iface = gr.Interface(fn=greet_nationality, inputs="text", outputs="text")
iface.launch()