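# NOTE(review): "Spaces: Sleeping" appears to be status text captured from the
# hosting page together with this file; it is not part of the program.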
from transformers import RobertaForSequenceClassification, AutoTokenizer | |
import torch | |
import docx2txt | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
import openpyxl | |
from openpyxl.styles import Font, Color, PatternFill | |
from openpyxl.styles.colors import WHITE | |
import gradio as gr | |
import underthesea | |
import re | |
# Checkpoint identifier shared by the classifier and its tokenizer.
_SENTIMENT_MODEL_ID = "wonrax/phobert-base-vietnamese-sentiment"

# Both objects are loaded once, at import time, and reused by analyze().
senti_model = RobertaForSequenceClassification.from_pretrained(_SENTIMENT_MODEL_ID)
# use_fast=False asks for the Python ("slow") tokenizer implementation.
senti_tokenizer = AutoTokenizer.from_pretrained(_SENTIMENT_MODEL_ID, use_fast=False)
def segmentation(text):
    """Split raw text into sentences and word-segment each one.

    The text is cut at every period and newline. Each non-blank piece is
    word-segmented with underthesea and returned as a single string.

    Args:
        text: Raw input text. ``None`` and the empty string are accepted.

    Returns:
        A list with one word-segmented string per non-blank sentence, in
        document order. The list is empty when ``text`` is ``None``, empty,
        or made up only of separators and whitespace.
    """
    if not text:
        # read_file() yields None after a failed read; treat that like empty
        # input instead of letting re.split raise TypeError.
        return []

    segmented_sentences = []
    for sentence in re.split(r'[.\n]', text):
        sentence = sentence.strip()
        if not sentence:  # Ignore empty sentences
            continue
        # format="text" returns one string in which the syllables of a
        # multi-syllable word are joined with "_". Joining the token list
        # with spaces would erase those word boundaries again.
        segmented_sentences.append(
            underthesea.word_tokenize(sentence, format="text")
        )
    return segmented_sentences
def analyze(sentence, max_length=256):
    """Score one sentence with the sentiment classifier.

    Args:
        sentence: Text to classify.
        max_length: Upper bound on the number of token ids sent to the model;
            longer inputs are truncated instead of overflowing the model.
            NOTE(review): the default of 256 is assumed to match the
            checkpoint's maximum sequence length -- confirm against its config.

    Returns:
        A list of softmax probabilities, one per output class of the model.
        Callers in this file label them Negative, Neutral, Positive in that
        order.
    """
    token_ids = senti_tokenizer.encode(
        sentence, truncation=True, max_length=max_length
    )
    # The model expects a batch dimension, hence the extra list level.
    input_ids = torch.tensor([token_ids])
    with torch.no_grad():  # inference only, no gradients needed
        out = senti_model(input_ids)
    return out.logits.softmax(dim=-1)[0].tolist()
def read_file(docx):
    """Extract the text of a document via docx2txt.

    Args:
        docx: Path (or whatever docx2txt.process accepts) of the document.

    Returns:
        The extracted text, or ``None`` when extraction raised; in that case
        the error is printed to standard output.
    """
    try:
        contents = docx2txt.process(docx)
    except Exception as e:
        print(f"Error reading file: {e}")
        return None
    return contents
def process_file(docx):
    """Run the sentiment pipeline on a Word document.

    Args:
        docx: Path of the document to analyse.

    Returns:
        A tuple ``(excel_file_path, pie_chart_name)``. Both entries are
        ``None`` when the document could not be read or holds no sentences.
    """
    text = read_file(docx)
    if not text:
        # read_file() returns None after a failed read; nothing to segment.
        return None, None

    segmented_sentences = segmentation(text)
    if not segmented_sentences:
        # An empty frame would only produce NaN averages for the chart.
        return None, None

    # One row of class probabilities per sentence.
    results = [analyze(sentence) for sentence in segmented_sentences]
    df = pd.DataFrame(results, columns=['Negative', 'Neutral', 'Positive'])
    df['Text'] = segmented_sentences

    pie_chart_name = generate_pie_chart(df)
    excel_file_path = generate_excel_file(df)
    return excel_file_path, pie_chart_name
def analyze_text(text, docx_file):
    """Analyse typed text or, failing that, an uploaded document.

    Typed text takes precedence over the uploaded file.

    Args:
        text: Text entered by the user, or a falsy value when none was given.
        docx_file: Uploaded document, either an object exposing the path as
            ``.name`` or the path itself, or a falsy value when none was
            given.

    Returns:
        A tuple ``(excel_file_path, pie_chart_name)``. Both entries are
        ``None`` when no input was provided or the text holds no sentences,
        so the caller always receives one value per output component.
    """
    if text:
        segmented_text = segmentation(text)
        if not segmented_text:
            # Only separators/whitespace: an empty frame would give NaN
            # averages for the chart.
            return None, None

        results = [analyze(sentence) for sentence in segmented_text]
        df = pd.DataFrame(results, columns=['Negative', 'Neutral', 'Positive'])
        df['Text'] = segmented_text

        pie_chart_name = generate_pie_chart(df)
        excel_file_path = generate_excel_file(df)
        return excel_file_path, pie_chart_name

    if docx_file:
        # Accept both an upload object carrying the path in `.name` and a
        # plain path string.
        return process_file(getattr(docx_file, "name", docx_file))

    # No input provided: still return one value per output.
    return None, None
def generate_pie_chart(df):
    """Save a pie chart of the mean Negative/Neutral/Positive scores.

    Args:
        df: DataFrame with numeric 'Negative', 'Neutral' and 'Positive'
            columns.

    Returns:
        The file name of the saved image, 'pie_chart.png'.
    """
    sentiments = ['Negative', 'Neutral', 'Positive']
    # One colour per slice, in the same order as the sentiments.
    slice_colors = ['#BDBDBD', '#87CEFA', '#9ACD32']

    # Average every score column over all rows.
    mean_scores = [df[name].mean() for name in sentiments]

    plt.pie(mean_scores, labels=sentiments, colors=slice_colors, autopct='%1.1f%%')
    plt.title('Average Scores by Sentiment')

    pie_chart_name = 'pie_chart.png'
    plt.savefig(pie_chart_name)
    # Close the figure so the next call starts from a blank one.
    plt.close()
    return pie_chart_name
def generate_excel_file(df):
    """Write the per-sentence scores to a colour-coded Excel workbook.

    Row 1 holds bold column headers. Every row of ``df`` follows on its own
    worksheet row: the highest score cell and the text cell are filled with
    the colour of that row's dominant sentiment.

    Args:
        df: DataFrame with 'Negative', 'Neutral' and 'Positive' score columns
            and a 'Text' column.

    Returns:
        Path of the saved workbook, 'result.xlsx'.
    """
    sentiment_cols = ['Negative', 'Neutral', 'Positive']
    headers = sentiment_cols + ['Text']

    wb = openpyxl.Workbook()
    ws = wb.active

    # Header row.
    for col_num, header in enumerate(headers, 1):
        cell = ws.cell(row=1, column=col_num)
        cell.value = header
        cell.font = Font(bold=True)

    # Background colour per sentiment.
    fill_dict = {
        'Negative': PatternFill(start_color='BDBDBD', end_color='BDBDBD', fill_type='solid'),
        'Neutral': PatternFill(start_color='87CEFA', end_color='87CEFA', fill_type='solid'),
        'Positive': PatternFill(start_color='9ACD32', end_color='9ACD32', fill_type='solid')
    }

    # Worksheet rows are numbered by position (starting below the header),
    # not by index label, so a frame with a non-default index still lands on
    # consecutive rows.
    for row_num, (_, row_data) in enumerate(df.iterrows(), 2):
        scores = [row_data[col] for col in sentiment_cols]
        max_score = max(scores)
        sentiment = sentiment_cols[scores.index(max_score)]
        row_fill = fill_dict[sentiment]

        # Score cells, looked up by column name; highlight the winner.
        for col_num, col in enumerate(sentiment_cols, 1):
            cell = ws.cell(row=row_num, column=col_num)
            cell.value = row_data[col]
            if row_data[col] == max_score:
                cell.fill = row_fill

        # Text cell, always filled with the dominant sentiment's colour.
        text_cell = ws.cell(row=row_num, column=len(headers))
        text_cell.value = row_data['Text']
        text_cell.fill = row_fill
        # Pick the font colour from the sentiment name. openpyxl stores fill
        # colours as 8-digit aRGB, so comparing the stored value with the
        # 6-digit 'BDBDBD' never matched and the white font was never used.
        font_color = WHITE if sentiment == 'Negative' else Color('000000')
        text_cell.font = Font(color=font_color)

    # Save the workbook
    excel_file_path = 'result.xlsx'
    wb.save(excel_file_path)
    return excel_file_path
def analyze_from_text(text):
    """Analyse typed text only; no document is passed along."""
    return analyze_text(text=text, docx_file=None)
def analyze_from_file(docx_file):
    """Analyse an uploaded document only; no typed text is passed along."""
    return analyze_text(text=None, docx_file=docx_file)
# Input components, in the positional order of analyze_text(text, docx_file).
text_input = gr.Textbox(label="Nhập Văn Bản bằng Tiếng Việt để trải nghiệm ngay")
docx_input = gr.File(label="Chọn Tệp File Word(docx) Bạn Muốn Phân Tích")
inputs = [text_input, docx_input]

# Output components, in the order analyze_text returns its results:
# the Excel workbook first, then the pie chart image.
excel_output = gr.File(label="Kết Quả Phân Tích Excel")
chart_output = gr.Image(type="filepath", label="Biểu đồ")
outputs = [excel_output, chart_output]

# Single-function UI wired to analyze_text.
interface = gr.Interface(
    fn=analyze_text,
    inputs=inputs,
    outputs=outputs,
    title="Phân Tích Cảm xúc thông qua Hội Thoại bằng Tiếng Việt",
    allow_flagging="never",  # Disable flag button
)

# Start the web app only when the file is run as a script.
if __name__ == "__main__":
    interface.launch()