Tonic committed on
Commit
9626102
1 Parent(s): ba05a34

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -40
app.py CHANGED
@@ -3,42 +3,29 @@ import re
3
  from transformers import GPT2LMHeadModel, GPT2Tokenizer
4
  import torch
5
  import gradio as gr
6
- import difflib
7
  from concurrent.futures import ThreadPoolExecutor
8
  import os
9
 
10
- # OCR Correction Model
 
 
 
 
 
 
11
  model_name = "PleIAs/OCRonos-Vintage"
12
  device = "cuda" if torch.cuda.is_available() else "cpu"
13
-
14
- # Load pre-trained model and tokenizer
15
  model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
16
  tokenizer = GPT2Tokenizer.from_pretrained(model_name)
17
 
18
- # CSS for formatting
19
- css = """
20
- <style>
21
- .generation {
22
- margin-left: 2em;
23
- margin-right: 2em;
24
- font-size: 1.2em;
25
- }
26
- .inserted {
27
- background-color: #90EE90;
28
- }
29
- </style>
30
- """
31
-
32
def generate_html_diff(old_text, new_text):
    """Render a word-level diff of two strings as HTML.

    Unchanged words are emitted verbatim; words inserted in *new_text* are
    wrapped in a <span class="inserted"> tag (styled green by the page CSS).
    Deleted words and difflib's '?' hint lines are dropped from the output.
    """
    rendered = []
    for entry in difflib.Differ().compare(old_text.split(), new_text.split()):
        # Differ prefixes every token with a 2-char code: '  ', '- ', '+ ', '? '.
        if entry.startswith('  '):
            rendered.append(entry[2:])
        elif entry.startswith('+ '):
            rendered.append(f'<span class="inserted">{entry[2:]}</span>')
    return ' '.join(rendered)
42
 
43
  def split_text(text, max_tokens=400):
44
  tokens = tokenizer.tokenize(text)
@@ -86,18 +73,22 @@ def process_text(user_message):
86
  corrected_chunks.append(corrected_chunk)
87
 
88
  corrected_text = ' '.join(corrected_chunks)
89
- html_diff = generate_html_diff(user_message, corrected_text)
90
-
91
- ocr_result = f'<h2 style="text-align:center">OCR Correction</h2>\n<div class="generation">{html_diff}</div>'
92
- final_output = f"{css}{ocr_result}"
93
- return final_output
94
-
95
- # Define the Gradio interface
96
- with gr.Blocks(theme='JohnSmith9982/small_and_pretty') as demo:
97
- gr.HTML("""<h1 style="text-align:center">Vintage OCR corrector (CPU)</h1>""")
98
- text_input = gr.Textbox(label="Your (bad?) text", type="text", lines=5)
99
- process_button = gr.Button("Process Text")
100
- text_output = gr.HTML(label="Processed text")
 
 
 
 
101
  process_button.click(process_text, inputs=text_input, outputs=[text_output])
102
 
103
  if __name__ == "__main__":
 
3
  from transformers import GPT2LMHeadModel, GPT2Tokenizer
4
  import torch
5
  import gradio as gr
6
+ from difflib import Differ
7
  from concurrent.futures import ThreadPoolExecutor
8
  import os
9
 
10
+ description = """# 🙋🏻‍♂️Welcome to Tonic's On-Device📲⌚🎅🏻OCR Corrector (CPU)
11
+ 📲⌚🎅🏻OCRonos-Vintage is a small specialized model for OCR correction of cultural heritage archives pre-trained with llm.c. OCRonos-Vintage is only 124 million parameters. It can run easily on CPU or provide correction at scale on GPUs (>10k tokens/seconds) while providing a quality of correction comparable to GPT-4 or the llama version of OCRonos for English-speaking cultural archives.
12
+
13
+ ### Join us :
14
+ 🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [Build Tonic](https://git.tonic-ai.com/contribute)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
15
+ """
16
+
17
  model_name = "PleIAs/OCRonos-Vintage"
18
  device = "cuda" if torch.cuda.is_available() else "cpu"
19
+ # 🙋🏻‍♂️Welcome to Tonic's ⌚🎅🏻Vintage OCRonos Corrector (CPU)
 
20
  model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
21
  tokenizer = GPT2Tokenizer.from_pretrained(model_name)
22
 
23
def diff_texts(text1, text2):
    """Compare two texts word-by-word for a gr.HighlightedText widget.

    Returns a list of (word, category) pairs where category is '+' for an
    inserted word, '-' for a deleted word, and None for an unchanged one.
    """
    tagged = []
    for entry in Differ().compare(text1.split(), text2.split()):
        marker = entry[0]
        tagged.append((entry[2:], None if marker == " " else marker))
    return tagged
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  def split_text(text, max_tokens=400):
31
  tokens = tokenizer.tokenize(text)
 
73
  corrected_chunks.append(corrected_chunk)
74
 
75
  corrected_text = ' '.join(corrected_chunks)
76
+ return diff_texts(user_message, corrected_text)
77
+
78
# Gradio UI: raw OCR text in, word-level highlighted correction out.
with gr.Blocks(theme=gr.themes.Base()) as demo:
    # BUG FIX: the component class is gr.Markdown — gr.MarkDown raises
    # AttributeError at import time, so the app never started.
    gr.Markdown(description)
    text_input = gr.Textbox(
        label="↘️Enter 👁️OCR'ed Text Outputs Here",
        info="""Hi there, ;fémy name à`gis tonic 45and i like to ride my vpotz""",
        lines=5,
    )
    process_button = gr.Button("Correct using 📲⌚🎅🏻OCRonos")
    # HighlightedText consumes the (token, category) pairs from diff_texts;
    # '+' insertions render green, '-' deletions render red.
    text_output = gr.HighlightedText(
        label="📲⌚🎅🏻OCRonos Correction:",
        combine_adjacent=True,
        show_legend=True,
        color_map={"+": "green", "-": "red"}
    )
    process_button.click(process_text, inputs=text_input, outputs=[text_output])
93
 
94
  if __name__ == "__main__":