# installed pip packages
# pip install streamlit
# pip install beautifulsoup4
# pip install docx2txt
# pip install pypdf2
# pip install pdfplumber
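# pip install pillow       # provides PIL, used below for the page icon
# pip install requests     # used below to fetch URL content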

import streamlit as st

# File Processing pkgs
from PIL import Image
import requests
from bs4 import BeautifulSoup
import json
import docx2txt
# import textract
from PyPDF2 import PdfReader  # PdfFileReader was removed in PyPDF2 3.x
import pdfplumber


# ---- LOAD ASSETS ----
img_page_icon = Image.open("web_icon.jpeg")

# Find more emojis here: https://www.webfx.com/tools/emoji-cheat-sheet/
st.set_page_config(page_title="OdiaGenAI", page_icon=img_page_icon, layout="wide")

# Helper to inject a local CSS file into the page
def load_css(file_path):
    with open(file_path) as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)

# Apply the custom styles
load_css('styles.css')
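
# styles.css (loaded above) is expected to define the custom classes used below, e.g. .text and .title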



# ---- HEADER SECTION ----
with st.container():
    st.subheader("Hi, username :wave:")
    st.write("##")
    st.markdown("<h5 class='text'>OdiaGenAI is a collaborative initiative that conducts research on </h5>",
                unsafe_allow_html=True)
    st.markdown("<h5>Generative AI and LLM for the Odia Language.</h5>", unsafe_allow_html=True)
    # st.title("Odia Generative AI")

    st.markdown("<h1 class='title'>Odia Generative AI</h1>", unsafe_allow_html=True)



# ---- BODY SECTION ----
with st.container():
    st.subheader("Collecting monolingual data (Odia or any Indic Languages)")

    # ----- FUNCTIONS -----
    # function to get the page count (and optionally the text) of a PDF using PyPDF2
    def read_pdf(file):
        pdf_reader = PdfReader(file)
        count = len(pdf_reader.pages)
        # all_page_text = ""
        # for page in pdf_reader.pages:
        #     all_page_text += page.extract_text() or ""
        #
        # return all_page_text
        return count
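
    # Note: read_pdf is currently unused; PDF uploads are handled by the pdfplumber branch below.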

    # callback for the Enter button: extract text from a URL or from uploaded documents
    def run_function(url, documents):
        news = ""
        # Check if the user has provided a URL
        if url:
            try:
                # Make a GET request to the URL (with a timeout) and extract the text content
                response = requests.get(url, timeout=10)
                if response.status_code == 200:
                    text_content = response.text

                    soup = BeautifulSoup(text_content, 'html.parser')

                    # Extract the headline from the JSON-LD <script> tag
                    # (this assumes the article page embeds its metadata as application/ld+json)
                    heading = soup.find('script', type='application/ld+json')

                    headline = ""
                    if heading and heading.string:
                        # Load the JSON data into a Python dictionary and read the headline
                        data = json.loads(heading.string)
                        headline = data.get('headline', '')

                    # The article body is site-specific: a <div class="oi-article-lt">
                    body = soup.find('div', class_='oi-article-lt')
                    paragraphs = ""
                    if body:
                        # Extract the text content from each <p> tag within the div
                        p_tags = body.find_all('p')
                        paragraphs = '\n'.join(p.get_text(strip=True) for p in p_tags)

                    news = news + headline + '\n\n' + paragraphs

                    # Display the extracted text content from url
                    st.text_area("Extracted Text", value=news, height=200)

                else:
                    st.error("Error: Unable to fetch content from the provided URL.")
            except requests.exceptions.RequestException as e:
                st.error(f"Error: An exception occurred while fetching content from the URL: {e}")

        # Check if the user has uploaded any documents
        # (file_uploader with accept_multiple_files=True returns a list, empty when nothing is uploaded)
        elif documents:
            for document in documents:
                document_details = {
                    "filename":document.name,
                    "filetype":document.type,
                    "filesize":document.size
                }
                st.write(document_details)

                # Extract content from the txt file
                if document.type == "text/plain":
                    # Read as bytes
                    news += str(document.read(), "utf-8")

                # Extract content from the pdf file
                elif document.type == "application/pdf":
                    # using PyPDF2 (page count only)
                    # news += read_pdf(document)

                    # using pdfplumber
                    try:
                        with pdfplumber.open(document) as pdf:
                            all_text = ""
                            for page in pdf.pages:
                                # extract_text() can return None for image-only pages
                                text = page.extract_text() or ""
                                all_text += text + "\n"
                            news += all_text
                    except Exception as e:
                        st.error(f"Error: Unable to extract text from {document.name}: {e}")

                # Extract content from the docx file
                elif document.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
                    news += docx2txt.process(document)

                # Other uploads (e.g. images) are not handled here
                else:
                    st.warning(f"Unsupported file type: {document.type}")

            # Display the extracted text content from file
            st.text_area("Extracted Text", value=news, height=200)
        else:
            st.error("Error: Please enter a URL or upload at least one document.")



    col1, col2, col3 = st.columns([0.6, 0.2, 0.2])
    with col1:
        # Collapsed label avoids the empty-label warning in recent Streamlit versions
        url = st.text_input(label="URL", placeholder="Enter URL", label_visibility="collapsed")

    with col2:
        documents = st.file_uploader("Upload documents", type=["png", "jpg", "jpeg", "pdf", "txt", "docx"],
                                     accept_multiple_files=True, label_visibility="collapsed")

    with col3:
        b = st.button("Enter")

    if b:
        run_function(url, documents)
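

# To run the app locally (assuming this script is saved as app.py,
# with web_icon.jpeg and styles.css in the same directory):
#   streamlit run app.py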