import streamlit as st
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from annotated_text import annotated_text

# Page configuration
st.set_page_config(
    layout="wide",
    initial_sidebar_state="auto"
)

# CSS for styling (the stylesheet body was stripped from the source; left empty)
st.markdown("""
""", unsafe_allow_html=True)

@st.cache_resource
def init_spark():
    return sparknlp.start()

@st.cache_resource
def create_pipeline():
    document_assembler = DocumentAssembler() \
        .setInputCol('text') \
        .setOutputCol('document')

    tokenizer = Tokenizer() \
        .setInputCols(['document']) \
        .setOutputCol('token')

    sequence_classifier = XlmRoBertaForSequenceClassification.pretrained('xlm_roberta_base_sequence_classifier_imdb', 'en') \
        .setInputCols(["document", "token"]) \
        .setOutputCol("class")

    pipeline = Pipeline(stages=[document_assembler, tokenizer, sequence_classifier])
    return pipeline

def fit_data(pipeline, data):
    # Fit on an empty DataFrame so the pretrained stages load, then wrap the
    # fitted model in a LightPipeline for fast single-string annotation.
    # Relies on a module-level `spark` session, created before this is called.
    empty_df = spark.createDataFrame([['']]).toDF('text')
    pipeline_model = pipeline.fit(empty_df)
    model = LightPipeline(pipeline_model)
    result = model.fullAnnotate(data)
    return result

def annotate(data):
    # Renders NER-style chunk/label pairs with annotated_text. Note: this helper
    # expects NER output keys and is unused by the sequence-classification flow.
    document, chunks, labels = data["Document"], data["NER Chunk"], data["NER Label"]
    annotated_words = []
    for chunk, label in zip(chunks, labels):
        parts = document.split(chunk, 1)
        if parts[0]:
            annotated_words.append(parts[0])
        annotated_words.append((chunk, label))
        document = parts[1]
    if document:
        annotated_words.append(document)
    annotated_text(*annotated_words)

tasks_models_descriptions = {
    "Sequence Classification": {
        "models": ["xlm_roberta_base_sequence_classifier_imdb"],
        "description": "The 'xlm_roberta_base_sequence_classifier_imdb' model is specialized for sentiment analysis of movie reviews. It accurately classifies IMDb reviews as positive or negative, leveraging the multilingual capabilities of XLM-RoBERTa to analyze text content and sentiment across different languages."
    }
}

# Sidebar content
task = st.sidebar.selectbox("Choose the task", list(tasks_models_descriptions.keys()))
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    tasks_models_descriptions[task]["models"],
    help="For more info about the models visit: https://sparknlp.org/models"
)

# Reference notebook link in sidebar (the anchor markup was stripped from the source; left empty)
link = """
"""
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)

# Page content (the pipeline uses XLM-RoBERTa, not DeBERTa, so the title says so)
title, sub_title = (f'XLM-RoBERTa for {task}', tasks_models_descriptions[task]["description"])
# The source file is truncated mid-call here; the markup classes below follow
# the pattern used across similar Spark NLP demo apps and are an assumption.
st.markdown(f'<div class="main-title">{title}</div>', unsafe_allow_html=True)
st.markdown(f'<div class="section"><p>{sub_title}</p></div>', unsafe_allow_html=True)
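
# --- Minimal sketch of the remaining app flow (assumption): the original file
# --- ends above, so the sample text, the exact widgets, and the result markup
# --- here are illustrative, not the original author's code.
spark = init_spark()
pipeline = create_pipeline()

# Hypothetical example input; any review text works.
selected_text = st.text_area(
    "Enter a movie review to classify:",
    "I loved this film, the acting and the score were superb."
)

if selected_text:
    output = fit_data(pipeline, selected_text)
    # fullAnnotate returns a list of dicts keyed by output column; the sequence
    # classifier writes its predicted label to the 'class' column.
    st.subheader('Prediction:')
    st.write(output[0]['class'][0].result)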