# installed pip packages
# pip install streamlit
# pip install beautifulsoup4
# pip install docx2txt
# pip install pypdf2
# pip install pdfplumber
import streamlit as st
# File Processing pkgs
from PIL import Image
import requests
from bs4 import BeautifulSoup
import json
import docx2txt
# import textract
from PyPDF2 import PdfFileReader
import pdfplumber
# ---- LOAD ASSETS ----
img_page_icon = Image.open("web_icon.jpeg")
# Find more emojis here: https://www.webfx.com/tools/emoji-cheat-sheet/
st.set_page_config(page_title="OdiaGenAI ", page_icon=img_page_icon, layout="wide")
# Load CSS file
def load_css(file_path):
    with open(file_path) as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
# Apply the stylesheet
load_css('styles.css')
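# The markdown calls below rely on custom classes ('text', 'title') that styles.css
# is assumed to define; the actual stylesheet is not shown here, but a minimal,
# hypothetical sketch of what it might contain is:
#   .title { text-align: center; }
#   .text  { color: #555555; }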
# ---- HEADER SECTION ----
with st.container():
    st.subheader("Hi, username :wave:")
    st.write("##")
    st.markdown("<h5 class='text'>OdiaGenAI is a collaborative initiative that conducts research on </h5>",
                unsafe_allow_html=True)
    st.markdown("<h5>Generative AI and LLM for the Odia Language.</h5>", unsafe_allow_html=True)
    # st.title("Odia Generative AI")
    st.markdown("<h1 class='title'>Odia Generative AI</h1>", unsafe_allow_html=True)
# ---- BODY SECTION ----
with st.container():
    st.subheader("Collecting monolingual data (Odia or any Indic Languages)")

    # ----- FUNCTIONS ----
    # Function to get the page count of a PDF using PyPDF2 (text extraction is commented out)
    def read_pdf(file):
        pdfReader = PdfFileReader(file)
        count = pdfReader.numPages
        # all_page_text = ""
        # for i in range(count):
        #     page = pdfReader.getPage(i)
        #     all_page_text += page.extractText()
        #
        # return all_page_text
        return count
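    # Note: PdfFileReader and numPages belong to the older PyPDF2 API; newer releases
    # (pypdf / PyPDF2 >= 3.0) rename them. A rough equivalent, assuming the newer
    # pypdf package is installed, would be:
    #   from pypdf import PdfReader
    #   reader = PdfReader(file)
    #   count = len(reader.pages)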
    # Function that runs when the Enter button is pressed
    def run_function(url, documents):
        news = ""
        # Check if the user has provided a URL
        if url:
            try:
                # Make a GET request to the URL and extract the text content
                response = requests.get(url)
                if response.status_code == 200:
                    text_content = response.text
                    soup = BeautifulSoup(text_content, 'html.parser')

                    # NOTE: the parsing below is tailored to a specific news-site layout
                    # (a JSON-LD <script> tag for the headline, an 'oi-article-lt' div for the body)
                    # Extract the <script> tag that holds the JSON-LD data, including the headline
                    heading = soup.find('script', type='application/ld+json')
                    # Extract the JSON data from the script tag
                    json_data_heading = heading.string
                    # Load the JSON data into a Python dictionary
                    data = json.loads(json_data_heading)
                    headline = data['headline']

                    # Locate the article body and find all <p> tags within it
                    body = soup.find('div', class_='oi-article-lt')
                    p_tags = body.find_all('p')
                    # Extract the text content from each <p> tag
                    paragraphs = [p.get_text(strip=True) for p in p_tags]
                    paragraphs = '\n'.join(paragraphs)

                    news = news + (headline + '\n\n' + paragraphs)

                    # Display the text content extracted from the URL
                    st.text_area("Extracted Text", value=news, height=200)
                else:
                    st.error("Error: Unable to fetch content from the provided URL.")
            except requests.exceptions.RequestException as e:
                st.error(f"Error: An exception occurred while fetching content from the URL: {e}")

        # Check if the user has uploaded one or more documents
        elif documents:
            for document in documents:
                document_details = {
                    "filename": document.name,
                    "filetype": document.type,
                    "filesize": document.size
                }
                st.write(document_details)

                # Extract content from a txt file
                if document.type == "text/plain":
                    # Read as bytes and decode to UTF-8
                    news += str(document.read(), "utf-8")

                # Extract content from a pdf file
                elif document.type == "application/pdf":
                    # using PyPDF2
                    # news += read_pdf(document)

                    # using pdfplumber
                    try:
                        with pdfplumber.open(document) as pdf:
                            all_text = ""
                            for page in pdf.pages:
                                text = page.extract_text()
                                # extract_text() may return None for image-only pages
                                if text:
                                    all_text += text + "\n"
                            news += all_text
                    except Exception:
                        st.error("Error: Unable to extract text from the PDF file.")

                # Extract content from a docx file
                else:
                    news += docx2txt.process(document)

            # Display the text content extracted from the uploaded files
            st.text_area("Extracted Text", value=news, height=200)
        else:
            st.error("Error: Please enter a URL or upload at least one document.")
    col1, col2, col3 = st.columns([0.6, 0.2, 0.2])
    with col1:
        url = st.text_input(label='', placeholder="Enter URL")
    with col2:
        documents = st.file_uploader("", type=["png", "jpg", "jpeg", "pdf", "txt", "docx"],
                                     accept_multiple_files=True)
    with col3:
        b = st.button("Enter")

    if b:
        run_function(url, documents)