TuanScientist committed on
Commit
57949d9
1 Parent(s): af45611

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -23
app.py CHANGED
@@ -13,15 +13,16 @@ import underthesea
13
  senti_model = RobertaForSequenceClassification.from_pretrained("wonrax/phobert-base-vietnamese-sentiment")
14
  senti_tokenizer = AutoTokenizer.from_pretrained("wonrax/phobert-base-vietnamese-sentiment", use_fast=False)
15
 
16
-
17
- def segmentation(sentences):
18
  segmented_sentences = []
19
  for sentence in sentences:
20
- segmented_sentence = underthesea.word_tokenize(sentence)
21
- segmented_sentences.append(' '.join(segmented_sentence))
 
 
22
  return segmented_sentences
23
 
24
-
25
  def analyze(sentence):
26
  input_ids = torch.tensor([senti_tokenizer.encode(sentence)])
27
  with torch.no_grad():
@@ -29,22 +30,19 @@ def analyze(sentence):
29
  results = out.logits.softmax(dim=-1).tolist()
30
  return results[0]
31
 
32
-
33
  def read_file(docx):
34
  try:
35
  text = docx2txt.process(docx)
36
- lines = text.split('\n')
37
- lines = [line.strip() for line in lines]
38
- lines = [line for line in lines if line]
39
- return lines
40
  except Exception as e:
41
  print(f"Error reading file: {e}")
42
 
43
-
44
  def process_file(docx):
45
- # Read the file and segment the sentences
46
- sentences = read_file(docx)
47
- segmented_sentences = segmentation(sentences)
 
 
48
 
49
  # Analyze the sentiment of each sentence
50
  results = []
@@ -53,7 +51,7 @@ def process_file(docx):
53
 
54
  # Create a DataFrame from the results
55
  df = pd.DataFrame(results, columns=['Negative', 'Neutral', 'Positive'])
56
- df['Text'] = read_file(docx)
57
 
58
  # Generate the pie chart and excel file
59
  pie_chart_name = generate_pie_chart(df)
@@ -61,17 +59,16 @@ def process_file(docx):
61
 
62
  return excel_file_path, pie_chart_name
63
 
64
-
65
  def analyze_text(text, docx_file):
66
  if text:
67
- # Perform analysis on the text
68
- segmented_text = segmentation([text])
69
  results = []
70
  for sentence in segmented_text:
71
  results.append(analyze(sentence))
72
 
73
  df = pd.DataFrame(results, columns=['Negative', 'Neutral', 'Positive'])
74
- df['Text'] = [text]
75
  pie_chart_name = generate_pie_chart(df)
76
  excel_file_path = generate_excel_file(df)
77
  return excel_file_path, pie_chart_name
@@ -83,7 +80,6 @@ def analyze_text(text, docx_file):
83
  # No input provided
84
  return None
85
 
86
-
87
  def generate_pie_chart(df):
88
  # Calculate the average scores
89
  neg_avg = df['Negative'].mean()
@@ -101,14 +97,13 @@ def generate_pie_chart(df):
101
  plt.pie(avg_df['Score'], labels=avg_df['Sentiment'], colors=colors, autopct='%1.1f%%')
102
  plt.title('Average Scores by Sentiment')
103
 
104
- # Save the pie chart as an image file in the static folder
105
  pie_chart_name = 'pie_chart.png'
106
  plt.savefig(pie_chart_name)
107
  plt.close()
108
 
109
  return pie_chart_name
110
 
111
-
112
  def generate_excel_file(df):
113
  # Create a new workbook and worksheet
114
  wb = openpyxl.Workbook()
@@ -158,7 +153,6 @@ def generate_excel_file(df):
158
 
159
  return excel_file_path
160
 
161
-
162
  inputs = [
163
  gr.Textbox(label="Nhập Văn Bản bằng Tiếng Việt để trải nghiệm ngay"),
164
  gr.File(label="Chọn Tệp File Word(docx) Bạn Muốn Phân Tích")
@@ -179,3 +173,4 @@ interface = gr.Interface(
179
 
180
  if __name__ == "__main__":
181
  interface.launch()
 
 
13
  senti_model = RobertaForSequenceClassification.from_pretrained("wonrax/phobert-base-vietnamese-sentiment")
14
  senti_tokenizer = AutoTokenizer.from_pretrained("wonrax/phobert-base-vietnamese-sentiment", use_fast=False)
15
 
16
def segmentation(text):
    """Split *text* into sentences and word-segment each sentence.

    The text is cut on the '.' character; each non-blank piece is passed
    through ``underthesea.word_tokenize`` and its tokens are re-joined with
    single spaces.

    Args:
        text: The raw text to segment. ``None`` or an empty string is
            accepted and yields an empty list (``read_file`` returns
            ``None`` when the document cannot be read).

    Returns:
        A list with one word-segmented string per non-empty sentence, in
        the order the sentences appear in *text*.
    """
    # Guard against a missing document / empty textbox instead of failing
    # with AttributeError on ``None.split``.
    if not text:
        return []

    segmented_sentences = []
    for sentence in text.split('.'):
        sentence = sentence.strip()
        if sentence:  # ignore empty sentences (e.g. after a trailing '.')
            tokens = underthesea.word_tokenize(sentence)
            segmented_sentences.append(' '.join(tokens))
    return segmented_sentences
25
 
 
26
  def analyze(sentence):
27
  input_ids = torch.tensor([senti_tokenizer.encode(sentence)])
28
  with torch.no_grad():
 
30
  results = out.logits.softmax(dim=-1).tolist()
31
  return results[0]
32
 
 
33
def read_file(docx):
    """Extract the plain text of a Word document.

    Args:
        docx: Path (or file-like object) handed to ``docx2txt.process``.

    Returns:
        The extracted text. If the document cannot be read, the error is
        printed and an empty string is returned, so callers always receive
        a ``str`` rather than an implicit ``None``.
    """
    try:
        return docx2txt.process(docx)
    except Exception as e:
        # Best-effort: report the problem and fall back to "no text".
        print(f"Error reading file: {e}")
        return ""
39
 
 
40
  def process_file(docx):
41
+ # Read the file
42
+ text = read_file(docx)
43
+
44
+ # Segment the text into sentences
45
+ segmented_sentences = segmentation(text)
46
 
47
  # Analyze the sentiment of each sentence
48
  results = []
 
51
 
52
  # Create a DataFrame from the results
53
  df = pd.DataFrame(results, columns=['Negative', 'Neutral', 'Positive'])
54
+ df['Text'] = segmented_sentences
55
 
56
  # Generate the pie chart and excel file
57
  pie_chart_name = generate_pie_chart(df)
 
59
 
60
  return excel_file_path, pie_chart_name
61
 
 
62
  def analyze_text(text, docx_file):
63
  if text:
64
+ # Segment the text into sentences
65
+ segmented_text = segmentation(text)
66
  results = []
67
  for sentence in segmented_text:
68
  results.append(analyze(sentence))
69
 
70
  df = pd.DataFrame(results, columns=['Negative', 'Neutral', 'Positive'])
71
+ df['Text'] = segmented_text
72
  pie_chart_name = generate_pie_chart(df)
73
  excel_file_path = generate_excel_file(df)
74
  return excel_file_path, pie_chart_name
 
80
  # No input provided
81
  return None
82
 
 
83
  def generate_pie_chart(df):
84
  # Calculate the average scores
85
  neg_avg = df['Negative'].mean()
 
97
  plt.pie(avg_df['Score'], labels=avg_df['Sentiment'], colors=colors, autopct='%1.1f%%')
98
  plt.title('Average Scores by Sentiment')
99
 
100
+ # Save the pie chart as an image file
101
  pie_chart_name = 'pie_chart.png'
102
  plt.savefig(pie_chart_name)
103
  plt.close()
104
 
105
  return pie_chart_name
106
 
 
107
  def generate_excel_file(df):
108
  # Create a new workbook and worksheet
109
  wb = openpyxl.Workbook()
 
153
 
154
  return excel_file_path
155
 
 
156
  inputs = [
157
  gr.Textbox(label="Nhập Văn Bản bằng Tiếng Việt để trải nghiệm ngay"),
158
  gr.File(label="Chọn Tệp File Word(docx) Bạn Muốn Phân Tích")
 
173
 
174
  if __name__ == "__main__":
175
  interface.launch()
176
+