Spaces:

EsoCode
/

poem_analysis

Sleeping

App Files Files Community

poem_analysis / analyze_poems.py

esocoder

first commit

996aa19 3 months ago

raw

history blame

6.64 kB

	import os
	import streamlit as st
	import matplotlib.pyplot as plt
	import pandas as pd
	from sklearn.cluster import KMeans
	from sklearn.preprocessing import StandardScaler
	from transformers import pipeline
	import nltk
	import numpy as np
	from utils import read_poems_from_directory, emotion_labels_with_colors

	# Download nltk data for tokenization.
	# 'punkt' is the resource older NLTK releases load in sent_tokenize; recent
	# releases look up 'punkt_tab' instead and raise LookupError without it, so
	# fetch both to work regardless of the installed NLTK version.
	nltk.download('punkt')
	nltk.download('punkt_tab')

	# Initialize emotion classifier pipelines (display name -> pipeline).
	# Both are asked for the score of every label, not just the top one.
	# NOTE(review): return_all_scores is reported as deprecated in recent
	# transformers releases in favour of top_k=None -- confirm the output nesting
	# stays the same before switching, since the analysis code indexes result[0].
	models = {
	    "Model 1": pipeline('sentiment-analysis', model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True),
	    "Model 2": pipeline('sentiment-analysis', model="cardiffnlp/twitter-roberta-base-emotion", return_all_scores=True)
	}

	# Directory that uploaded poems are written to and analysed from;
	# created at import time so listing it never fails on a fresh checkout.
	poems_directory = "./poems"
	os.makedirs(poems_directory, exist_ok=True)

	def _analyze_emotions(poem, model):
	    """Classify every sentence of *poem* with *model*.

	    The text is split with ``nltk.sent_tokenize``, so the units scored are
	    sentences rather than the physical lines of the poem.

	    Args:
	        poem: Text of one poem.
	        model: Callable classifier (one of the pipelines in ``models``).

	    Returns:
	        A list holding the raw classifier output for each sentence, in order.
	    """
	    return [model(sentence) for sentence in nltk.sent_tokenize(poem)]


	def _process_emotions(emotions):
	    """Turn raw classifier output into one ``{label: score}`` dict per sentence.

	    Args:
	        emotions: Output of ``_analyze_emotions``. The first element of every
	            entry must be a list of ``{'label': ..., 'score': ...}`` dicts.

	    Returns:
	        Tuple ``(emotion_scores, all_labels)``: the per-sentence score dicts
	        and the set of every label that occurred.
	    """
	    emotion_scores = [
	        {emo['label']: emo['score'] for emo in sentence_emotions[0]}
	        for sentence_emotions in emotions
	    ]
	    all_labels = set()
	    for sentence_scores in emotion_scores:
	        all_labels.update(sentence_scores)
	    return emotion_scores, all_labels


	def _extract_features(emotion_data, labels):
	    """Build the clustering feature vector of each poem.

	    Args:
	        emotion_data: One list of per-sentence score dicts per poem.
	        labels: Emotion labels to summarise, in column order.

	    Returns:
	        One list per poem containing ``mean, std`` of every label's scores
	        (missing labels count as 0).
	    """
	    features = []
	    for emotions in emotion_data:
	        poem_features = []
	        for label in labels:
	            scores = [sentence_scores.get(label, 0) for sentence_scores in emotions]
	            if scores:
	                poem_features.extend([float(np.mean(scores)), float(np.std(scores))])
	            else:
	                # A poem without any sentence would give NaN here, which
	                # KMeans rejects; treat it as "no emotion" instead.
	                poem_features.extend([0.0, 0.0])
	        features.append(poem_features)
	    return features


	def _plot_emotional_arc(processed_emotions, labels, model_name):
	    """Draw the score of every selected emotion across each poem's sentences.

	    Args:
	        processed_emotions: One list of per-sentence score dicts per poem.
	        labels: Emotion labels to plot.
	        model_name: Display name of the model, used in the title.
	    """
	    fig, ax = plt.subplots(figsize=(15, 10))
	    for i, emotions in enumerate(processed_emotions):
	        for emotion in labels:
	            emotion_arc = [sentence_scores.get(emotion, 0) for sentence_scores in emotions]
	            color = emotion_labels_with_colors.get(emotion, 'black')  # default to black if not found
	            ax.plot(emotion_arc, label=f'Poem {i+1} - {emotion}', color=color)
	    ax.set_title(f'Emotional Arc of Each Poem ({model_name})')
	    ax.set_xlabel('Line Number')
	    ax.set_ylabel('Emotion Score')
	    # Legend sits to the right of the axes so it does not cover the curves.
	    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
	    st.pyplot(fig)
	    # Streamlit re-runs the script on every interaction; unclosed figures
	    # would pile up in pyplot's global registry.
	    plt.close(fig)


	def _show_clusters(processed_emotions, labels, model_name):
	    """Cluster the poems on their emotion statistics and display the result.

	    Args:
	        processed_emotions: One list of per-sentence score dicts per poem.
	        labels: Non-empty list of emotion labels used as features.
	        model_name: Display name of the model, used in captions.
	    """
	    # Create a DataFrame to store the features
	    columns = [f'{label}_{stat}' for label in labels for stat in ('mean', 'std')]
	    df = pd.DataFrame(_extract_features(processed_emotions, labels), columns=columns)
	    if df.empty:
	        return

	    # Standardize the features
	    scaled_features = StandardScaler().fit_transform(df)

	    # Apply KMeans clustering. KMeans refuses more clusters than samples,
	    # so a single poem is placed in one cluster instead of crashing.
	    kmeans = KMeans(n_clusters=min(2, len(df)), random_state=42)
	    kmeans.fit(scaled_features)
	    df['Cluster'] = kmeans.labels_

	    # Display the DataFrame
	    st.write(f"Poem Sentiment Features and Clusters ({model_name}):")
	    st.dataframe(df)

	    # Visualize the clusters on the first label's mean (x) and std (y)
	    fig, ax = plt.subplots(figsize=(8, 6))
	    ax.scatter(df.iloc[:, 0], df.iloc[:, 1], c=df['Cluster'], cmap='viridis', marker='o')
	    ax.set_title(f'Clusters of Poem Emotional Arcs ({model_name})')
	    ax.set_xlabel(f'{columns[0]}')
	    ax.set_ylabel(f'{columns[1]}')
	    st.pyplot(fig)
	    plt.close(fig)


	def analyze_poems_page():
	    """Render the "Analyze Poems" Streamlit page.

	    The sidebar lets the user upload a ``.txt`` poem into ``poems_directory``,
	    lists the poems already stored there and takes a comma-separated list of
	    emotion labels. Pressing "Analyze Poems" runs every pipeline in ``models``
	    over all poems, plots the per-sentence score of each requested label the
	    model actually produces, and clusters the poems on the mean/std of those
	    scores.

	    Returns:
	        None. All output is rendered through Streamlit.
	    """
	    st.header("Analyze Poems")

	    # Sidebar for file upload and listing files
	    st.sidebar.title("Upload New Poem")
	    uploaded_file = st.sidebar.file_uploader("Choose a text file", type="txt")

	    if uploaded_file is not None:
	        # Keep only the final path component so a crafted file name
	        # (e.g. "../app.py") cannot be written outside poems_directory.
	        safe_name = os.path.basename(uploaded_file.name)
	        if safe_name:
	            with open(os.path.join(poems_directory, safe_name), "wb") as f:
	                f.write(uploaded_file.getbuffer())
	            st.sidebar.success(f"Uploaded {safe_name}")
	        else:
	            st.sidebar.error("The uploaded file has no usable name.")

	    st.sidebar.title("Available Poems")
	    if os.path.isdir(poems_directory):
	        poem_files = sorted(f for f in os.listdir(poems_directory) if f.endswith(".txt"))
	    else:
	        poem_files = []
	    # Plain text keeps one file per line; rendered as Markdown the single
	    # newlines collapse and underscores in file names turn into emphasis.
	    st.sidebar.text("\n".join(poem_files))

	    # Sidebar input for user-specified labels
	    user_labels_input = st.sidebar.text_input(
	        "Enter emotion labels (comma-separated)",
	        "happiness,sadness,anger,fear,disgust,surprise,love,joy,anxiety,contentment,frustration,loneliness,excitement,guilt,shame,envy,jealousy,pride,gratitude,empathy,compassion,boredom,relief,curiosity,awe,confusion,nostalgia,hope,despair,embarrassment",
	    )
	    # Drop blanks and duplicates (order preserved): a repeated label would
	    # otherwise produce duplicate DataFrame columns and duplicate curves.
	    user_labels = list(dict.fromkeys(
	        label.strip() for label in user_labels_input.split(",") if label.strip()
	    ))

	    if not st.button("Analyze Poems"):
	        return

	    if not os.path.isdir(poems_directory):
	        st.error("The specified path is not a valid directory.")
	        return

	    # Read poems from the specified directory
	    # NOTE(review): assumes the helper yields an iterable of poem texts -- confirm in utils.
	    poems = read_poems_from_directory(poems_directory)
	    if not poems:
	        st.warning("No text files found in the specified directory.")
	        return

	    # Analyze and plot for each model
	    for model_name, model in models.items():
	        processed_emotions = []
	        all_labels = set()
	        for poem in poems:
	            processed, labels = _process_emotions(_analyze_emotions(poem, model))
	            processed_emotions.append(processed)
	            all_labels.update(labels)

	        st.subheader(model_name)

	        # Only labels this model actually emits can be plotted or clustered.
	        selected_labels = [label for label in user_labels if label in all_labels]
	        if not selected_labels:
	            available = ", ".join(sorted(all_labels))
	            st.warning(
	                f"None of the requested labels are produced by {model_name}. "
	                f"Available labels: {available}"
	            )
	            continue

	        _plot_emotional_arc(processed_emotions, selected_labels, model_name)
	        _show_clusters(processed_emotions, selected_labels, model_name)