varl42 committed on
Commit
8260580
1 Parent(s): 6f6ee6e

model and comments

Browse files

1. Modified how the model is loaded (see the "# Load BART model & tokenizer" section)
2. Added comments to the code

Files changed (1) hide show
  1. app.py +34 -8
app.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import gradio as gr
2
  import torch
3
  import PyPDF2
@@ -6,14 +8,18 @@ import numpy
6
  import scipy
7
  from gtts import gTTS
8
  from io import BytesIO
9
- from transformers import BartTokenizer
10
 
 
 
11
  def extract_text(pdf_file):
12
  pdfReader = PyPDF2.PdfReader(pdf_file)
13
  pageObj = pdfReader.pages[0]
14
  return pageObj.extract_text()
15
 
16
-
 
 
17
  def summarize_text(text):
18
  sentences = text.split(". ")
19
  for i, sentence in enumerate(sentences):
@@ -23,12 +29,26 @@ def summarize_text(text):
23
  break
24
  abstract = ". ".join(sentences[start:end+1])
25
 
26
- tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
27
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn", tokenizer=tokenizer)
28
- summary = summarizer(abstract, max_length=50, min_length=50,
29
- do_sample=False)
30
- return summary[0]['summary_text']
 
 
 
 
 
 
 
 
 
 
 
 
31
 
 
 
32
  def text_to_audio(text):
33
  tts = gTTS(text, lang='en')
34
  buffer = BytesIO()
@@ -36,12 +56,18 @@ def text_to_audio(text):
36
  buffer.seek(0)
37
  return buffer.read()
38
 
 
 
 
39
  def audio_pdf(pdf_file):
40
  text = extract_text(pdf_file)
41
  summary = summarize_text(text)
42
  audio = text_to_audio(summary)
43
  return summary, audio
44
 
 
 
 
45
  inputs = gr.File()
46
  summary_text = gr.Text()
47
  audio_summary = gr.Audio()
@@ -59,4 +85,4 @@ iface = gr.Interface(
59
  ]
60
  )
61
 
62
- iface.launch()
 
1
+ # Import libraries
2
+
3
  import gradio as gr
4
  import torch
5
  import PyPDF2
 
8
  import scipy
9
  from gtts import gTTS
10
  from io import BytesIO
11
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
12
 
13
+ # Function to extract text from PDF
14
+ # Returns the raw text of the first page only (pages[0]) of the given PDF file
15
  def extract_text(pdf_file):
16
  pdfReader = PyPDF2.PdfReader(pdf_file)
17
  pageObj = pdfReader.pages[0]
18
  return pageObj.extract_text()
19
 
20
+
21
+ # Function to summarize text
22
+ # Defines a function to summarize the extracted text using facebook/bart-large-cnn
23
  def summarize_text(text):
24
  sentences = text.split(". ")
25
  for i, sentence in enumerate(sentences):
 
29
  break
30
  abstract = ". ".join(sentences[start:end+1])
31
 
32
+ # Load BART model & tokenizer
33
+ tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
34
+ model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
35
+
36
+ # Tokenize abstract
37
+ inputs = tokenizer(abstract, return_tensors="pt", truncation=True)
38
+
39
+ # Generate summary
40
+ summary_ids = model.generate(inputs['input_ids'],
41
+ num_beams=3,
42
+ max_length=50,
43
+ min_length=30,
44
+ do_sample=False,
45
+ early_stopping=True)
46
+ summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
47
+
48
+ return summary
49
 
50
+ # Function to convert text to audio
51
+ # Defines a function to convert text to an audio file using Google Text-to-Speech
52
  def text_to_audio(text):
53
  tts = gTTS(text, lang='en')
54
  buffer = BytesIO()
 
56
  buffer.seek(0)
57
  return buffer.read()
58
 
59
+ ### Main function
60
+ ### The main function that ties everything together:
61
+ ### extracts text, summarizes, and converts to audio.
62
  def audio_pdf(pdf_file):
63
  text = extract_text(pdf_file)
64
  summary = summarize_text(text)
65
  audio = text_to_audio(summary)
66
  return summary, audio
67
 
68
+ # Define Gradio interface
69
+ # Gradio web interface with a file input, text output to display the summary
70
+ # and audio output to play the audio file; the interface is launched at the end of the file.
71
  inputs = gr.File()
72
  summary_text = gr.Text()
73
  audio_summary = gr.Audio()
 
85
  ]
86
  )
87
 
88
+ iface.launch() # Launch the interface