OCRonos-TextCorrect

Sleeping

App Files Files Community

Pclanglais committed on Aug 5

Commit

e98a756

•

1 Parent(s): e2b4df4

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -157

app.py CHANGED Viewed

@@ -1,28 +1,18 @@
-import spaces
 import transformers
 import re
-from transformers import AutoConfig, AutoTokenizer, AutoModel, AutoModelForCausalLM, pipeline
 import torch
 import gradio as gr
-import json
-import os
-import shutil
-import requests
-import pandas as pd
 import difflib
 from concurrent.futures import ThreadPoolExecutor
 # OCR Correction Model
-ocr_model_name = "PleIAs/OCRonos-Vintage"
-import torch
-from transformers import GPT2LMHeadModel, GPT2Tokenizer
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # Load pre-trained model and tokenizer
-model_name = "PleIAs/OCRonos-Vintage"
-model = GPT2LMHeadModel.from_pretrained(model_name)
 tokenizer = GPT2Tokenizer.from_pretrained(model_name)
 # CSS for formatting
@@ -33,78 +23,12 @@ css = """
     margin-right: 2em;
     font-size: 1.2em;
 }
-:target {
-    background-color: #CCF3DF;
-}
-.source {
-    float: left;
-    max-width: 17%;
-    margin-left: 2%;
-}
-.tooltip {
-    position: relative;
-    cursor: pointer;
-    font-variant-position: super;
-    color: #97999b;
-}
-.tooltip:hover::after {
-    content: attr(data-text);
-    position: absolute;
-    left: 0;
-    top: 120%;
-    white-space: pre-wrap;
-    width: 500px;
-    max-width: 500px;
-    z-index: 1;
-    background-color: #f9f9f9;
-    color: #000;
-    border: 1px solid #ddd;
-    border-radius: 5px;
-    padding: 5px;
-    display: block;
-    box-shadow: 0 4px 8px rgba(0,0,0,0.1);
-}
-.deleted {
-    background-color: #ffcccb;
-    text-decoration: line-through;
-}
 .inserted {
     background-color: #90EE90;
 }
-.manuscript {
-    display: flex;
-    margin-bottom: 10px;
-    align-items: baseline;
-}
-.annotation {
-    width: 15%;
-    padding-right: 20px;
-    color: grey !important;
-    font-style: italic;
-    text-align: right;
-}
-.content {
-    width: 80%;
-}
-h2 {
-    margin: 0;
-    font-size: 1.5em;
-}
-.title-content h2 {
-    font-weight: bold;
-}
-.bibliography-content {
-    color: darkgreen !important;
-    margin-top: -5px;
-}
-.paratext-content {
-    color: #a4a4a4 !important;
-    margin-top: -5px;
-}
 </style>
 """
-# Helper functions
 def generate_html_diff(old_text, new_text):
     d = difflib.Differ()
     diff = list(d.compare(old_text.split(), new_text.split()))
@@ -113,64 +37,31 @@ def generate_html_diff(old_text, new_text):
         if word.startswith(' '):
             html_diff.append(word[2:])
         elif word.startswith('+ '):
-            html_diff.append(f'<span style="background-color: #90EE90;">{word[2:]}</span>')
     return ' '.join(html_diff)
-def preprocess_text(text):
-    text = re.sub(r'<[^>]+>', '', text)
-    text = re.sub(r'\n', ' ', text)
-    text = re.sub(r'\s+', ' ', text)
-    return text.strip()
-def split_text(text, max_tokens=500):
-    parts = text.split("\n")
     chunks = []
-    current_chunk = ""
-    for part in parts:
-        if current_chunk:
-            temp_chunk = current_chunk + "\n" + part
-        else:
-            temp_chunk = part
-        num_tokens = len(tokenizer.tokenize(temp_chunk))
-        if num_tokens <= max_tokens:
-            current_chunk = temp_chunk
-        else:
-            if current_chunk:
-                chunks.append(current_chunk)
-            current_chunk = part
     if current_chunk:
-        chunks.append(current_chunk)
-    if len(chunks) == 1 and len(tokenizer.tokenize(chunks[0])) > max_tokens:
-        long_text = chunks[0]
-        chunks = []
-        while len(tokenizer.tokenize(long_text)) > max_tokens:
-            split_point = len(long_text) // 2
-            while split_point < len(long_text) and not re.match(r'\s', long_text[split_point]):
-                split_point += 1
-            if split_point >= len(long_text):
-                split_point = len(long_text) - 1
-            chunks.append(long_text[:split_point].strip())
-            long_text = long_text[split_point:].strip()
-        if long_text:
-            chunks.append(long_text)
     return chunks
-# Function to generate text
 def ocr_correction(prompt, max_new_tokens=600, num_threads=os.cpu_count()):
     prompt = f"""### Text ###\n{prompt}\n\n\n### Correction ###\n"""
     input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
-    # Set the number of threads for PyTorch
     torch.set_num_threads(num_threads)
-    # Generate text
     with ThreadPoolExecutor(max_workers=num_threads) as executor:
         future = executor.submit(
             model.generate,
@@ -183,41 +74,23 @@ def ocr_correction(prompt, max_new_tokens=600, num_threads=os.cpu_count()):
         )
         output = future.result()
-    # Decode and return the generated text
     result = tokenizer.decode(output[0], skip_special_tokens=True)
-    print(result)
-    result = result.split("### Correction ###")[1]
-    return result
-# OCR Correction Class
-class OCRCorrector:
-    def __init__(self, system_prompt="Le dialogue suivant est une conversation"):
-        self.system_prompt = system_prompt
-    def correct(self, user_message):
-        generated_text = ocr_correction(user_message)
-        html_diff = generate_html_diff(user_message, generated_text)
-        return generated_text, html_diff
-# Combined Processing Class
-class TextProcessor:
-    def __init__(self):
-        self.ocr_corrector = OCRCorrector()
-    @spaces.GPU(duration=120)
-    def process(self, user_message):
-        #OCR Correction
-        corrected_text, html_diff = self.ocr_corrector.correct(user_message)
-        # Combine results
-        ocr_result = f'<h2 style="text-align:center">OCR Correction</h2>\n<div class="generation">{html_diff}</div>'
-        final_output = f"{css}{ocr_result}"
-        return final_output
-# Create the TextProcessor instance
-text_processor = TextProcessor()
 # Define the Gradio interface
 with gr.Blocks(theme='JohnSmith9982/small_and_pretty') as demo:
@@ -225,7 +98,7 @@ with gr.Blocks(theme='JohnSmith9982/small_and_pretty') as demo:
     text_input = gr.Textbox(label="Your (bad?) text", type="text", lines=5)
     process_button = gr.Button("Process Text")
     text_output = gr.HTML(label="Processed text")
-    process_button.click(text_processor.process, inputs=text_input, outputs=[text_output])
 if __name__ == "__main__":
     demo.queue().launch()

 import transformers
 import re
+from transformers import GPT2LMHeadModel, GPT2Tokenizer
 import torch
 import gradio as gr
 import difflib
 from concurrent.futures import ThreadPoolExecutor
+import os
 # OCR Correction Model
+model_name = "PleIAs/OCRonos-Vintage"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # Load pre-trained model and tokenizer
+model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
 tokenizer = GPT2Tokenizer.from_pretrained(model_name)
 # CSS for formatting
     margin-right: 2em;
     font-size: 1.2em;
 }
 .inserted {
     background-color: #90EE90;
 }
 </style>
 """
 def generate_html_diff(old_text, new_text):
     d = difflib.Differ()
     diff = list(d.compare(old_text.split(), new_text.split()))
         if word.startswith(' '):
             html_diff.append(word[2:])
         elif word.startswith('+ '):
+            html_diff.append(f'<span class="inserted">{word[2:]}</span>')
     return ' '.join(html_diff)
def split_text(text, max_tokens=400):
    """Split ``text`` into chunks of at most ``max_tokens`` tokenizer tokens.

    The text is tokenized with the module-level ``tokenizer`` and the token
    sequence is cut into consecutive slices, each converted back to a string
    with ``tokenizer.convert_tokens_to_string``. Whenever possible a cut is
    moved back to the nearest token that begins a new word, so that a word is
    not torn across two chunks (chunks that are processed separately and
    re-joined would otherwise gain a break in the middle of that word). If a
    window contains no such boundary, it is cut at exactly ``max_tokens``.

    Args:
        text: The raw text to split.
        max_tokens: Upper bound on the number of tokens per chunk. Values
            below 1 are treated as 1.

    Returns:
        A list of chunk strings in their original order; an empty list when
        ``text`` yields no tokens.
    """
    # A non-positive limit would never let the window advance; clamp it so the
    # degenerate case still yields one token per chunk.
    limit = max(max_tokens, 1)

    # Byte-level BPE (GPT-2 style) encodes a leading space as U+0120 and a
    # newline as U+010A, so a token starting with either begins a new word.
    # NOTE(review): assumes `tokenizer` is a GPT-2 byte-level BPE tokenizer.
    word_start = ("\u0120", "\u010a")

    tokens = tokenizer.tokenize(text)
    total = len(tokens)
    chunks = []
    start = 0
    while total - start > limit:
        hard_cut = start + limit
        cut = hard_cut
        # Walk back until the token at `cut` (the first token of the next
        # chunk) begins a new word.
        while cut > start and not tokens[cut].startswith(word_start):
            cut -= 1
        if cut == start:
            # No word boundary inside the window: fall back to a hard cut.
            cut = hard_cut
        chunks.append(tokenizer.convert_tokens_to_string(tokens[start:cut]))
        start = cut
    if start < total:
        chunks.append(tokenizer.convert_tokens_to_string(tokens[start:]))
    return chunks
 def ocr_correction(prompt, max_new_tokens=600, num_threads=os.cpu_count()):
     prompt = f"""### Text ###\n{prompt}\n\n\n### Correction ###\n"""
     input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
     torch.set_num_threads(num_threads)
     with ThreadPoolExecutor(max_workers=num_threads) as executor:
         future = executor.submit(
             model.generate,
         )
         output = future.result()
     result = tokenizer.decode(output[0], skip_special_tokens=True)
+    return result.split("### Correction ###")[1].strip()
def process_text(user_message):
    """Correct ``user_message`` chunk by chunk and return it as an HTML page.

    The message is split with ``split_text``, each piece is passed through
    ``ocr_correction``, and the corrected pieces are joined with single
    spaces. The joined text is then diffed against the original message with
    ``generate_html_diff`` and wrapped in a heading and a ``generation``
    container, prefixed by the module-level ``css``.

    Args:
        user_message: The text entered by the user.

    Returns:
        A string containing the stylesheet followed by the rendered diff.
    """
    corrected_text = ' '.join(
        ocr_correction(piece) for piece in split_text(user_message)
    )
    html_diff = generate_html_diff(user_message, corrected_text)
    ocr_result = f'<h2 style="text-align:center">OCR Correction</h2>\n<div class="generation">{html_diff}</div>'
    return f"{css}{ocr_result}"
 # Define the Gradio interface
 with gr.Blocks(theme='JohnSmith9982/small_and_pretty') as demo:
     text_input = gr.Textbox(label="Your (bad?) text", type="text", lines=5)
     process_button = gr.Button("Process Text")
     text_output = gr.HTML(label="Processed text")
+    process_button.click(process_text, inputs=text_input, outputs=[text_output])
 if __name__ == "__main__":
     demo.queue().launch()