|
import html

import pandas as pd
import sparknlp
import streamlit as st
from pyspark.ml import Pipeline
from sparknlp.annotator import AutoGGUFModel
from sparknlp.base import DocumentAssembler
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Page-wide stylesheet, injected as raw HTML so the class names it defines
# (for example "main-title") can be used by later st.markdown calls.
_PAGE_CSS = """
<style>
.main-title {
font-size: 36px;
color: #4A90E2;
font-weight: bold;
text-align: center;
}
.section {
background-color: #f9f9f9;
padding: 10px;
border-radius: 10px;
margin-top: 10px;
}
.section p, .section ul {
color: #666666;
}
.table {
width: 100%;
border-collapse: collapse;
margin-top: 20px;
}
.table th, .table td {
border: 1px solid #ddd;
padding: 8px;
text-align: left;
}
.table th {
background-color: #4A90E2;
color: white;
}
.table td {
background-color: #f2f2f2;
}
</style>
"""

st.markdown(_PAGE_CSS, unsafe_allow_html=True)
|
|
|
|
|
@st.cache_resource
def init_spark():
    """Start Spark NLP and return the object produced by ``sparknlp.start()``.

    Decorated with ``st.cache_resource`` so Streamlit reruns of this script
    reuse the already-started session instead of starting a new one.
    """
    session = sparknlp.start()
    return session
|
|
|
|
|
@st.cache_resource
def create_pipeline(model, prompt):
    """Build the two-stage text-generation pipeline for one model/prompt pair.

    Args:
        model: Name passed to ``AutoGGUFModel.pretrained``.
        prompt: System prompt set on the generator.

    Returns:
        An unfitted ``Pipeline`` whose stages are a ``DocumentAssembler``
        (``text`` -> ``document``) followed by the GGUF generator
        (``document`` -> ``completions``).

    The result is cached by ``st.cache_resource``, keyed on the arguments.
    """
    assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )

    generator = (
        AutoGGUFModel.pretrained(model)
        .setInputCols(["document"])
        .setOutputCol("completions")
        .setSystemPrompt(prompt)
        .setNPredict(-1)
        .setTemperature(0.2)
        .setTopP(0.9)
        .setRepeatPenalty(1.3)
    )

    return Pipeline().setStages([assembler, generator])
|
|
|
|
|
def fit_data(pipeline, text):
    """Run ``text`` through ``pipeline`` and return the transformed DataFrame.

    Args:
        pipeline: Object exposing ``fit(df)``; the fitted result must expose
            ``transform(df)``.
        text: The single input string; it becomes the only row of a
            one-column (``text``) DataFrame.

    Note:
        Reads the module-level ``spark`` name, so that global must be bound
        before this function is called.
    """
    single_row = spark.createDataFrame([[text]])
    input_df = single_row.toDF("text")
    fitted = pipeline.fit(input_df)
    return fitted.transform(input_df)
|
|
|
# Maps each task label shown in the "Task:" selectbox to the system prompt
# handed to the model. Prompts are wrapped with implicit string concatenation;
# each fragment except the last ends with a single space.
tasks_prompt_map = {
    "Summarization": (
        "You are a summarization assistant. "
        "Provide a concise and accurate summary of the given text, "
        "focusing on the main ideas and key points. "
        "Avoid unnecessary details and ensure clarity."
    ),
    "Text Completion": (
        "You are a creative and precise assistant. "
        "Complete the given text naturally and fluently, "
        "ensuring coherence with the provided context and maintaining the tone and style."
    ),
    "Translation": (
        "You are a professional translator. "
        "Translate the given text accurately and naturally, "
        "preserving its meaning, tone, and context. "
        "Ensure fluency and clarity in the target language."
    ),
    "Paraphrasing": (
        "You are a paraphrasing assistant. "
        "Rewrite the given text to convey the same meaning in a different way, "
        "ensuring clarity and grammatical accuracy without altering the original intent."
    ),
    "Question Answering": (
        "You are an expert question-answering assistant. "
        "Based on the provided context, give accurate and concise answers to the questions. "
        "Ensure your responses are clear and directly address the query."
    ),
    "Story Generation": (
        "You are a creative storyteller. "
        "Write an engaging story based on the given theme or prompt. "
        "Ensure the story has a clear beginning, middle, and end with compelling characters."
    ),
    "Dialogue Generation": (
        "You are a conversational assistant. "
        "Generate a realistic and engaging dialogue based on the given scenario or context. "
        "Ensure the conversation flows naturally and matches the tone or personalities described."
    ),
    "Code Generation": (
        "You are a coding assistant. "
        "Write clean, efficient, and error-free code to solve the given problem "
        "or implement the specified functionality. "
        "Adhere to best practices and include comments as needed."
    ),
    "Poetry Generation": (
        "You are a poet. "
        "Compose a creative and expressive poem based on the given theme or prompt. "
        "Ensure it has a clear tone and evokes emotion."
    ),
}
|
|
|
|
|
# Sample passages offered in the "Select an example" selectbox.
# Each entry is parenthesised so the comma between entries is unmistakable;
# inside an entry, adjacent literals (one sentence each) are concatenated.
examples = [
    (
        "Mount Tai is a mountain of historical and cultural significance located north of the city of Tai'an, in Shandong province, China. "
        "The tallest peak is the Jade Emperor Peak, which is commonly reported as being 1,545 meters tall, but is officially described by the PRC government as 1,532.7 meters tall. "
        "It is associated with sunrise, birth, and renewal, and is often regarded the foremost of the five. "
        "Mount Tai has been a place of worship for at least 3,000 years and served as one of the most important ceremonial centers of China during large portions of this period."
    ),
    (
        "The Guadeloupe amazon (Amazona violacea) is a hypothetical extinct species of parrot that is thought to have been endemic to the Lesser Antillean island region of Guadeloupe. "
        "Described by 17th- and 18th-century writers, it is thought to have been related to, or possibly the same as, the extant imperial amazon. "
        "A tibiotarsus and an ulna bone from the island of Marie-Galante may belong to the Guadeloupe amazon. "
        "According to contemporary descriptions, its head, neck and underparts were mainly violet or slate, mixed with green and black; the back was brownish green; and the wings were green, yellow and red. "
        "It had iridescent feathers, and was able to raise a \"ruff\" of feathers around its neck. "
        "It fed on fruits and nuts, and the male and female took turns sitting on the nest. "
        "French settlers ate the birds and destroyed their habitat. "
        "Rare by 1779, the species appears to have become extinct by the end of the 18th century."
    ),
    (
        "Pierre-Simon, marquis de Laplace (23 March 1749 – 5 March 1827) was a French scholar and polymath whose work was important to the development of engineering, mathematics, statistics, physics, astronomy, and philosophy. "
        "He summarized and extended the work of his predecessors in his five-volume Mécanique Céleste (Celestial Mechanics) (1799–1825). "
        "This work translated the geometric study of classical mechanics to one based on calculus, opening up a broader range of problems. "
        "In statistics, the Bayesian interpretation of probability was developed mainly by Laplace."
    ),
    (
        "John Snow (15 March 1813 – 16 June 1858) was an English physician and a leader in the development of anaesthesia and medical hygiene. "
        "He is considered one of the founders of modern epidemiology, in part because of his work in tracing the source of a cholera outbreak in Soho, London, in 1854, which he curtailed by removing the handle of a water pump. "
        "Snow's findings inspired the adoption of anaesthesia as well as fundamental changes in the water and waste systems of London, which led to similar changes in other cities, and a significant improvement in general public health around the world."
    ),
    (
        "The Mona Lisa is a half-length portrait painting by Italian artist Leonardo da Vinci. "
        "Considered an archetypal masterpiece of the Italian Renaissance, it has been described as \"the best known, the most visited, the most written about, the most sung about, the most parodied work of art in the world\". "
        "The painting's novel qualities include the subject's enigmatic expression, the monumentality of the composition, the subtle modelling of forms, and the atmospheric illusionism."
    ),
    (
        "Calculus, originally called infinitesimal calculus or \"the calculus of infinitesimals\", is the mathematical study of continuous change, in the same way that geometry is the study of shape and algebra is the study of generalizations of arithmetic operations. "
        "It has two major branches, differential calculus and integral calculus; the former concerns instantaneous rates of change, and the slopes of curves, while integral calculus concerns accumulation of quantities, and areas under or between curves. "
        "These two branches are related to each other by the fundamental theorem of calculus, and they make use of the fundamental notions of convergence of infinite sequences and infinite series to a well-defined limit.[1] "
        "Infinitesimal calculus was developed independently in the late 17th century by Isaac Newton and Gottfried Wilhelm Leibniz.[2][3] "
        "Today, calculus has widespread uses in science, engineering, and economics.[4] "
        "In mathematics education, calculus denotes courses of elementary mathematical analysis, which are mainly devoted to the study of functions and limits. "
        "The word calculus (plural calculi) is a Latin word, meaning originally \"small pebble\" (this meaning is kept in medicine – see Calculus (medicine)). "
        "Because such pebbles were used for calculation, the meaning of the word has evolved and today usually means a method of computation. "
        "It is therefore used for naming specific methods of calculation and related theories, such as propositional calculus, Ricci calculus, calculus of variations, lambda calculus, and process calculus."
    ),
]
|
|
|
# Model names offered in the "Model:" selectbox; the chosen one is passed
# to create_pipeline().
model_list = [
    "phi3.5_mini_4k_instruct_q4_gguf",
    "meta_llama_3_8b_instruct_iq3_m",
    "qwen2.5_3b_instruct_q3_k_l",
    "mistral_7b_instruct_v0.3_q3_k_l",
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Page heading: a large title styled by the "main-title" CSS class, followed
# by a centred grey subtitle.
# NOTE(review): the acronym expansion in `title` could not be confirmed
# against the GGUF specification; verify the wording before publishing.
title = "GGUF (General-purpose Graph Universal Format) in Spark NLP"
sub_title = "Showcasing the Power of AutoGGUFModel in Spark NLP for various Text Generation Tasks"

st.markdown(
    f'<div class="main-title">{title}</div>',
    unsafe_allow_html=True,
)
st.markdown(
    f'<div style="text-align: center; color: #666666;">{sub_title}</div>',
    unsafe_allow_html=True,
)
|
|
|
# Task and model pickers share one row, split into two columns.
col1, col2 = st.columns(2)
with col1:
    task = st.selectbox("Task:", tasks_prompt_map.keys())
with col2:
    model = st.selectbox("Model:", model_list)

selected_text = st.selectbox("Select an example", examples)
custom_input = st.text_input("Try it with your own sentence!")

# A non-empty custom sentence takes precedence over the selected example.
text_to_analyze = custom_input or selected_text
|
|
|
# Echo the text that will be sent to the model inside a bordered,
# horizontally scrollable box.
st.write('Text to analyze:')
HTML_WRAPPER = """<div class="scroll entities" style="overflow-x: auto;
border: 1px solid #e6e9ef; border-radius: 0.25rem;
padding: 1rem; margin-bottom: 2.5rem; white-space:pre-wrap">{}</div>"""
# The text may come straight from the user's text input and is rendered with
# unsafe_allow_html=True, so it is HTML-escaped first. Without escaping, any
# markup typed by the user would be injected into the page instead of being
# displayed literally.
st.markdown(
    HTML_WRAPPER.format(html.escape(text_to_analyze)),
    unsafe_allow_html=True,
)
|
|
|
|
|
# Run the selected model on the chosen text and show the raw collected rows.
# `spark` must be bound at module level before fit_data() is called, because
# fit_data() reads it as a global.
spark = init_spark()

system_prompt = tasks_prompt_map[task]
pipeline = create_pipeline(model, system_prompt)
output = fit_data(pipeline, text_to_analyze)

completion_rows = output.select("completions.result").collect()
st.write(completion_rows)