Pclanglais committed on
Commit
61dc098
1 Parent(s): dfbcb2e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -7
app.py CHANGED
@@ -10,6 +10,7 @@ import shutil
10
  import requests
11
  import pandas as pd
12
  import difflib
 
13
 
14
  # OCR Correction Model
15
  ocr_model_name = "PleIAs/OCRonos-Vintage"
@@ -162,22 +163,26 @@ def split_text(text, max_tokens=500):
162
 
163
 
164
  # Function to generate text
165
- @spaces.GPU
166
- def ocr_correction(prompt, max_new_tokens=500):
167
- model.to(device)
168
-
169
  prompt = f"""### Text ###\n{prompt}\n\n\n### Correction ###\n"""
170
  input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
171
 
 
 
 
172
  # Generate text
173
- output = model.generate(input_ids,
 
 
 
174
  max_new_tokens=max_new_tokens,
175
  pad_token_id=tokenizer.eos_token_id,
176
  top_k=50,
177
  num_return_sequences=1,
178
- do_sample=True,
179
- temperature=0.7
180
  )
 
 
181
  # Decode and return the generated text
182
  result = tokenizer.decode(output[0], skip_special_tokens=True)
183
  print(result)
 
10
  import requests
11
  import pandas as pd
12
  import difflib
13
+ from concurrent.futures import ThreadPoolExecutor
14
 
15
  # OCR Correction Model
16
  ocr_model_name = "PleIAs/OCRonos-Vintage"
 
163
 
164
 
165
  # Function to generate text
166
+ def ocr_correction(prompt, max_new_tokens=600, num_threads=os.cpu_count()):
 
 
 
167
  prompt = f"""### Text ###\n{prompt}\n\n\n### Correction ###\n"""
168
  input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
169
 
170
+ # Set the number of threads for PyTorch
171
+ torch.set_num_threads(num_threads)
172
+
173
  # Generate text
174
+ with ThreadPoolExecutor(max_workers=num_threads) as executor:
175
+ future = executor.submit(
176
+ model.generate,
177
+ input_ids,
178
  max_new_tokens=max_new_tokens,
179
  pad_token_id=tokenizer.eos_token_id,
180
  top_k=50,
181
  num_return_sequences=1,
182
+ do_sample=False
 
183
  )
184
+ output = future.result()
185
+
186
  # Decode and return the generated text
187
  result = tokenizer.decode(output[0], skip_special_tokens=True)
188
  print(result)