|
import streamlit as st |
|
import sparknlp |
|
|
|
from sparknlp.base import * |
|
from sparknlp.annotator import * |
|
from pyspark.ml import Pipeline |
|
from annotated_text import annotated_text |
|
|
|
|
|
# Streamlit page setup: full-width layout, sidebar visibility left to Streamlit.
st.set_page_config(layout="wide", initial_sidebar_state="auto")
|
|
|
|
|
# Inject page-wide CSS: styles the custom ".main-title" banner rendered below
# and the grey ".section" info boxes. unsafe_allow_html is required for raw
# <style> markup to be emitted instead of escaped.
st.markdown("""
    <style>
    .main-title {
        font-size: 36px;
        color: #4A90E2;
        font-weight: bold;
        text-align: center;
    }
    .section {
        background-color: #f9f9f9;
        padding: 10px;
        border-radius: 10px;
        margin-top: 10px;
    }
    .section p, .section ul {
        color: #666666;
    }
    </style>
""", unsafe_allow_html=True)
|
|
|
@st.cache_resource
def init_spark():
    """Start the Spark NLP session once; cached across Streamlit reruns."""
    session = sparknlp.start()
    return session
|
|
|
@st.cache_resource
def create_pipeline():
    """Assemble the IMDb sentiment pipeline (cached across reruns).

    Stages: raw text -> document -> tokens -> XLM-RoBERTa sequence
    classification, whose output annotation column is named "class".
    """
    assembler = DocumentAssembler()
    assembler.setInputCol('text')
    assembler.setOutputCol('document')

    word_splitter = Tokenizer()
    word_splitter.setInputCols(['document'])
    word_splitter.setOutputCol('token')

    # Pretrained English IMDb sentiment classifier (positive / negative).
    classifier = XlmRoBertaForSequenceClassification.pretrained(
        'xlm_roberta_base_sequence_classifier_imdb', 'en'
    )
    classifier.setInputCols(["document", "token"])
    classifier.setOutputCol("class")

    return Pipeline(stages=[assembler, word_splitter, classifier])
|
|
|
def fit_data(pipeline, data):
    """Fit *pipeline* on an empty frame and annotate *data* in-process.

    NOTE(review): reads the module-level `spark` session, which is created
    later in the script — assumed to be initialized before the first call.
    """
    placeholder_df = spark.createDataFrame([['']]).toDF('text')
    fitted_model = pipeline.fit(placeholder_df)
    # LightPipeline avoids a full Spark job for single-string inference.
    light = LightPipeline(fitted_model)
    return light.fullAnnotate(data)
|
|
|
def annotate(data): |
|
document, chunks, labels = data["Document"], data["NER Chunk"], data["NER Label"] |
|
annotated_words = [] |
|
for chunk, label in zip(chunks, labels): |
|
parts = document.split(chunk, 1) |
|
if parts[0]: |
|
annotated_words.append(parts[0]) |
|
annotated_words.append((chunk, label)) |
|
document = parts[1] |
|
if document: |
|
annotated_words.append(document) |
|
annotated_text(*annotated_words) |
|
|
|
# Catalog of demo tasks -> available pretrained models + a user-facing blurb.
# Drives the sidebar selectors and the page description below.
tasks_models_descriptions = {
    "Sequence Classification": {
        "models": ["xlm_roberta_base_sequence_classifier_imdb"],
        "description": "The 'xlm_roberta_base_sequence_classifier_imdb' model is specialized for sentiment analysis of movie reviews. It accurately classifies IMDb reviews as positive or negative, leveraging the multilingual capabilities of XLM-RoBERTa to analyze text content and sentiment across different languages."
    }
}
|
|
|
|
|
# Sidebar controls: pick a task first, then one of its pretrained models.
task = st.sidebar.selectbox("Choose the task", list(tasks_models_descriptions.keys()))
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    tasks_models_descriptions[task]["models"],
    help="For more info about the models visit: https://sparknlp.org/models",
)
|
|
|
|
|
# "Open in Colab" badge linking to the reference workshop notebook,
# rendered in the sidebar (raw HTML, so unsafe_allow_html is needed).
link = """
<a href="https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/357691d18373d6e8f13b5b1015137a398fd0a45f/Spark_NLP_Udemy_MOOC/Open_Source/17.01.Transformers-based_Embeddings.ipynb#L103">
    <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
</a>
"""
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)
|
|
|
|
|
# Page header and task description. The pipeline uses an XLM-RoBERTa model
# ('xlm_roberta_base_sequence_classifier_imdb'), so the title names that
# family; the previous title said "DeBERTa", which did not match the model.
title, sub_title = (f'XLM-RoBERTa for {task}', tasks_models_descriptions[task]["description"])
st.markdown(f'<div class="main-title">{title}</div>', unsafe_allow_html=True)
container = st.container(border=True)
container.write(sub_title)
|
|
|
|
|
# Canned example reviews per task (positive, negative, and mixed/neutral
# sentiment), offered in the "Select an example" dropdown below.
examples_mapping = {
    "Sequence Classification": [
        "This movie was absolutely fantastic! The storyline was gripping, the characters were well-developed, and the cinematography was stunning. I was on the edge of my seat the entire time.",
        "A heartwarming and beautiful film. The performances were top-notch, and the direction was flawless. This is easily one of the best movies I've seen this year.",
        "What a delightful surprise! The humor was spot on, and the plot was refreshingly original. The cast did an amazing job bringing the characters to life. Highly recommended!",
        "This was one of the worst movies I’ve ever seen. The plot was predictable, the acting was wooden, and the pacing was painfully slow. I couldn’t wait for it to end.",
        "A complete waste of time. The movie lacked any real substance or direction, and the dialogue was cringe-worthy. I wouldn’t recommend this to anyone.",
        "I had high hopes for this film, but it turned out to be a huge disappointment. The story was disjointed, and the special effects were laughably bad. Don’t bother watching this one.",
        "The movie was okay, but nothing special. It had a few good moments, but overall, it felt pretty average. Not something I would watch again, but it wasn’t terrible either.",
        "An average film with a decent plot. The acting was passable, but it didn't leave much of an impression on me. It's a movie you might watch once and forget about.",
        "This movie was neither good nor bad, just kind of there. It had some interesting ideas, but they weren’t executed very well. It’s a film you could take or leave."
    ]
}
|
|
|
# Input widgets: an example picker plus a free-text box. A non-empty custom
# sentence takes precedence over the selected example (handled below).
examples = examples_mapping[task]
selected_text = st.selectbox("Select an example", examples)
custom_input = st.text_input("Try it with your own Sentence!")
|
|
|
# Choose the text to analyze (custom input wins when non-empty) and render a
# scrollable preview. Narrowed the bare `except:` — it also swallowed
# SystemExit/KeyboardInterrupt, which should propagate; rendering failures
# still fall back to the selected example (best-effort preview).
try:
    text_to_analyze = custom_input if custom_input else selected_text
    st.subheader('Full example text')
    HTML_WRAPPER = """<div class="scroll entities" style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem; white-space:pre-wrap">{}</div>"""
    st.markdown(HTML_WRAPPER.format(text_to_analyze), unsafe_allow_html=True)
except Exception:
    text_to_analyze = selected_text
|
|
|
|
|
# Start Spark, build the (cached) pipeline, run inference on the chosen text,
# and display the predicted sentiment class.
spark = init_spark()
pipeline = create_pipeline()
output = fit_data(pipeline, text_to_analyze)

st.subheader("Prediction:")
predicted_label = output[0]['class'][0].result
st.markdown(f"Classified as : **{predicted_label}**")
|
|
|
|
|
|