Liam Dyer committed on
Commit
9a1c39c
1 Parent(s): 7ca6619

feat: support filename input

Browse files
Files changed (1) hide show
  1. app.py +14 -7
app.py CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
2
  import spaces
3
  import subprocess
4
  import os
 
5
  import string
6
  import random
7
  from pypdf import PdfReader
@@ -57,7 +58,10 @@ def extract_metadata_from_pdf(reader):
57
  }
58
 
59
 
60
- def convert_pandoc(input_file):
 
 
 
61
  # Convert the file to markdown with pandoc
62
  output_file = f"{random_word(16)}.md"
63
  result = subprocess.call(
@@ -66,31 +70,34 @@ def convert_pandoc(input_file):
66
  if result != 0:
67
  raise ValueError("Error converting file to markdown with pandoc")
68
 
69
- # Read the file and delete
70
  with open(output_file, "r") as f:
71
  markdown = f.read()
72
  os.remove(output_file)
 
73
 
74
  return markdown
75
 
76
 
77
  @spaces.GPU
78
- def convert(input_file):
79
  plain_text_filetypes = [".txt", ".csv", ".tsv", ".md"]
80
  # Already a plain text file that wouldn't benefit from pandoc so return the content
81
- if any(input_file.endswith(ft) for ft in plain_text_filetypes):
82
  with open(input_file, "r") as f:
83
  return f.read(), {}
84
 
85
- if input_file.endswith(".pdf"):
86
  return convert_pdf(input_file)
87
 
88
- return convert_pandoc(input_file), {}
89
 
90
 
 
 
91
  gr.Interface(
92
  convert,
93
- inputs=gr.File(label="Upload File", type="filepath"),
94
  outputs=[
95
  gr.Text(label="Markdown"),
96
  gr.JSON(label="Metadata"),
 
2
  import spaces
3
  import subprocess
4
  import os
5
+ import shutil
6
  import string
7
  import random
8
  from pypdf import PdfReader
 
58
  }
59
 
60
 
61
+ def convert_pandoc(input_file, filename):
62
+ # Temporarily copy the file
63
+ shutil.copyfile(input_file, filename)
64
+
65
  # Convert the file to markdown with pandoc
66
  output_file = f"{random_word(16)}.md"
67
  result = subprocess.call(
 
70
  if result != 0:
71
  raise ValueError("Error converting file to markdown with pandoc")
72
 
73
+ # Read the file and delete temporary files
74
  with open(output_file, "r") as f:
75
  markdown = f.read()
76
  os.remove(output_file)
77
+ os.remove(filename)
78
 
79
  return markdown
80
 
81
 
82
  @spaces.GPU
83
+ def convert(input_file, filename):
84
  plain_text_filetypes = [".txt", ".csv", ".tsv", ".md"]
85
  # Already a plain text file that wouldn't benefit from pandoc so return the content
86
+ if any(filename.endswith(ft) for ft in plain_text_filetypes):
87
  with open(input_file, "r") as f:
88
  return f.read(), {}
89
 
90
+ if filename.endswith(".pdf"):
91
  return convert_pdf(input_file)
92
 
93
+ return convert_pandoc(input_file, filename), {}
94
 
95
 
96
+ # We accept a filename because the gradio JS interface strips the original filename,
97
+ # and its extension is critical for choosing the correct processing pipeline
98
  gr.Interface(
99
  convert,
100
+ inputs=[gr.File(label="Upload File", type="filepath"), gr.Text(label="Filename")],
101
  outputs=[
102
  gr.Text(label="Markdown"),
103
  gr.JSON(label="Metadata"),