poem_analysis / analyze_poems.py
esocoder's picture
first commit
996aa19
raw
history blame
No virus
6.64 kB
import os
import streamlit as st
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from transformers import pipeline
import nltk
import numpy as np
from utils import read_poems_from_directory, emotion_labels_with_colors
# Sentence splitting further down relies on NLTK's 'punkt' tokenizer data.
nltk.download('punkt')

# (display name, checkpoint id) pairs for the emotion classifiers.
_CHECKPOINTS = (
    ("Model 1", "j-hartmann/emotion-english-distilroberta-base"),
    ("Model 2", "cardiffnlp/twitter-roberta-base-emotion"),
)

# One pipeline per checkpoint, keyed by display name. Every pipeline is asked
# for the scores of all labels rather than only the best one.
models = {
    display_name: pipeline('sentiment-analysis', model=checkpoint, return_all_scores=True)
    for display_name, checkpoint in _CHECKPOINTS
}

# Folder holding the poem text files; created at import time when missing.
poems_directory = "./poems"
os.makedirs(poems_directory, exist_ok=True)
def _analyze_emotions(poem, model):
    """Run *model* on every sentence of *poem*.

    The poem is split with ``nltk.sent_tokenize``; the returned list holds one
    raw classifier result per sentence, in order.
    """
    return [model(sentence) for sentence in nltk.sent_tokenize(poem)]


def _process_emotions(emotions):
    """Turn raw classifier results into per-sentence ``{label: score}`` dicts.

    Each item of *emotions* is expected to be a sequence whose first element
    is a list of dicts carrying ``'label'`` and ``'score'`` keys.

    Returns:
        A ``(emotion_scores, all_labels)`` tuple: the list of score dicts and
        the set of every label seen.
    """
    emotion_scores = [
        {emo['label']: emo['score'] for emo in line_emotions[0]}
        for line_emotions in emotions
    ]
    all_labels = set()
    for line_score in emotion_scores:
        all_labels.update(line_score)
    return emotion_scores, all_labels


def _plot_emotional_arc(processed_emotions, labels, model_name):
    """Draw one line per (poem, emotion) pair showing the score per sentence."""
    st.subheader(model_name)
    # Use an explicit Figure instead of the global pyplot state so it can be
    # closed after rendering; otherwise figures pile up on every rerun.
    fig, ax = plt.subplots(figsize=(15, 10))
    for i, emotions in enumerate(processed_emotions):
        for emotion in labels:
            emotion_arc = [line_emotions.get(emotion, 0) for line_emotions in emotions]
            # Fall back to black when no colour is configured for the label.
            color = emotion_labels_with_colors.get(emotion, 'black')
            ax.plot(emotion_arc, label=f'Poem {i+1} - {emotion}', color=color)
    ax.set_title(f'Emotional Arc of Each Poem ({model_name})')
    ax.set_xlabel('Line Number')
    ax.set_ylabel('Emotion Score')
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    st.pyplot(fig)
    plt.close(fig)


def _extract_features(emotion_data, labels):
    """Build one feature row per poem: mean and std of each label's scores.

    A poem without any sentences contributes ``0.0`` for both statistics so
    that no NaN reaches the scaler or the clustering step.
    """
    features = []
    for emotions in emotion_data:
        poem_features = []
        for label in labels:
            scores = [line_emotions.get(label, 0) for line_emotions in emotions]
            if scores:
                poem_features.extend([np.mean(scores), np.std(scores)])
            else:
                poem_features.extend([0.0, 0.0])
        features.append(poem_features)
    return features


def _show_clusters(processed_emotions, selected_labels, model_name):
    """Cluster the poems on their emotion features and display the result.

    *selected_labels* must be non-empty; the caller guarantees this.
    """
    features = _extract_features(processed_emotions, selected_labels)
    columns = []
    for label in selected_labels:
        columns.extend([f'{label}_mean', f'{label}_std'])
    df = pd.DataFrame(features, columns=columns)

    scaled_features = StandardScaler().fit_transform(df)

    # KMeans rejects n_clusters > n_samples, so a single poem gets one cluster.
    kmeans = KMeans(n_clusters=min(2, len(df)), random_state=42)
    kmeans.fit(scaled_features)
    df['Cluster'] = kmeans.labels_

    st.write(f"Poem Sentiment Features and Clusters ({model_name}):")
    st.dataframe(df)

    # Scatter of the first label's mean against its std, coloured by cluster.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(df.iloc[:, 0], df.iloc[:, 1], c=df['Cluster'], cmap='viridis', marker='o')
    ax.set_title(f'Clusters of Poem Emotional Arcs ({model_name})')
    ax.set_xlabel(f'{columns[0]}')
    ax.set_ylabel(f'{columns[1]}')
    st.pyplot(fig)
    plt.close(fig)


def analyze_poems_page():
    """Render the "Analyze Poems" Streamlit page.

    The sidebar offers a ``.txt`` upload (stored in ``poems_directory``), lists
    the stored poems and takes a comma-separated list of emotion labels. When
    the "Analyze Poems" button is pressed, every poem is scored sentence by
    sentence with each classifier in ``models``; for each classifier the
    emotional arcs are plotted and the poems are clustered on the mean and
    standard deviation of the selected labels' scores.

    NOTE(review): ``read_poems_from_directory`` is assumed to return a
    sequence of poem texts (strings) -- confirm against ``utils``.
    """
    st.header("Analyze Poems")

    # Sidebar for file upload and listing files
    st.sidebar.title("Upload New Poem")
    uploaded_file = st.sidebar.file_uploader("Choose a text file", type="txt")
    if uploaded_file is not None:
        # The file name is client-supplied; keep only its final component so
        # the upload cannot be written outside poems_directory.
        safe_name = os.path.basename(uploaded_file.name)
        with open(os.path.join(poems_directory, safe_name), "wb") as f:
            f.write(uploaded_file.getbuffer())
        st.sidebar.success(f"Uploaded {safe_name}")

    st.sidebar.title("Available Poems")
    poem_files = [f for f in os.listdir(poems_directory) if f.endswith(".txt")]
    st.sidebar.write("\n".join(poem_files))

    # Sidebar input for user-specified labels
    user_labels_input = st.sidebar.text_input(
        "Enter emotion labels (comma-separated)",
        "happiness,sadness,anger,fear,disgust,surprise,love,joy,anxiety,contentment,frustration,loneliness,excitement,guilt,shame,envy,jealousy,pride,gratitude,empathy,compassion,boredom,relief,curiosity,awe,confusion,nostalgia,hope,despair,embarrassment",
    )
    # Drop blanks and duplicates (keeping first-seen order); repeated labels
    # would otherwise yield duplicate DataFrame columns and plot lines.
    stripped_labels = (label.strip() for label in user_labels_input.split(","))
    user_labels = list(dict.fromkeys(label for label in stripped_labels if label))

    if not st.button("Analyze Poems"):
        return
    if not os.path.isdir(poems_directory):
        st.error("The specified path is not a valid directory.")
        return

    # Read poems from the specified directory
    poems = read_poems_from_directory(poems_directory)
    if not poems:
        st.warning("No text files found in the specified directory.")
        return

    # Analyze and plot for each model
    for model_name, model in models.items():
        processed_emotions = []
        all_labels = set()
        for poem in poems:
            processed, labels = _process_emotions(_analyze_emotions(poem, model))
            processed_emotions.append(processed)
            all_labels.update(labels)

        # Only labels that this model actually emits can be plotted/clustered.
        selected_labels = [label for label in user_labels if label in all_labels]
        if not selected_labels:
            # Without any feature column the scaler and the scatter plot fail.
            st.warning(
                f"None of the entered emotion labels are produced by {model_name}; "
                "skipping this model."
            )
            continue

        _plot_emotional_arc(processed_emotions, selected_labels, model_name)
        _show_clusters(processed_emotions, selected_labels, model_name)