import io
import json
import os
import tempfile

import gradio as gr
import numpy as np
import openai
import requests
import streamlit as st
import streamlit.components.v1 as components
import whisper

# Text to run claim extraction on. It stays empty unless the transcription
# section further down replaces it with the contents of the "Input Text:" box.
userinput = ""

def chunk_text(text, chunk_size=2000):
    """Split *text* into consecutive slices of at most *chunk_size* items.

    Slices are taken in order with no overlap, so concatenating the result
    reproduces *text*. Only the final slice may be shorter than *chunk_size*.

    Args:
        text: The string (or other sliceable sequence) to split.
        chunk_size: Maximum length of each slice; must be positive.

    Returns:
        A list of slices; empty when *text* is empty.

    Raises:
        ValueError: If *chunk_size* is zero or negative. Without this guard
            the start offset would never advance past ``len(text)`` and the
            loop would never terminate.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

# Seed per-session defaults. Each guard only fires when the key is missing,
# so values already stored in the session are left untouched.
# 'learning_objectives' is initialised here but is not read anywhere in the
# visible part of this file.
if 'learning_objectives' not in st.session_state:
    st.session_state.learning_objectives = ""

# Load the Whisper "base" model only when the session does not hold one yet,
# then reuse the stored instance.
if 'whisper_model' not in st.session_state:
    st.session_state.whisper_model = whisper.load_model("base")

st.title("Patent Claims Extraction")

# The OpenAI key is typed in by the user; type="password" masks it in the UI.
api_key = st.text_input("Enter your OpenAI API Key:", type="password")

audio_file = st.file_uploader("Upload an audio file", type=["mp3", "wav", "ogg"])

# Raw bytes of the upload, or None when nothing has been uploaded.
# getvalue() returns the whole buffer regardless of the current read position,
# whereas read() returns only what is left after any earlier read.
audio_data = audio_file.getvalue() if audio_file is not None else None

# Transcribe the uploaded audio with Whisper and let the user edit the result.
# NOTE(review): nothing visible in this file creates a widget or state entry
# keyed 'submit_button', so this branch looks unreachable as written — confirm
# the intended trigger.
if 'submit_button' in st.session_state:
    model = st.session_state.whisper_model

    # Default so the text area below still renders when no audio was uploaded.
    transcript = ""

    if audio_data:
        # Play back straight from the uploaded bytes, labelled with the
        # upload's own MIME type instead of always claiming WAV.
        st.audio(audio_data, format=audio_file.type or "audio/wav")

        # Whisper needs a path on disk. Keep the upload's extension and close
        # the file before transcribing so it can be reopened by path.
        suffix = os.path.splitext(audio_file.name)[1] or ".wav"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_audio:
            tmp_audio.write(audio_data)
            audio_file_path = tmp_audio.name

        try:
            st.info("Transcribing...")
            result = model.transcribe(audio_file_path)
        finally:
            # delete=False means nobody else will clean this file up.
            os.remove(audio_file_path)

        transcript = result['text']
        # Report success only once the transcription has actually finished.
        st.success("Transcription complete")

        with st.expander("See transcript"):
            st.markdown(transcript)

    userinput = st.text_area("Input Text:", transcript)

# Chat model used for the extraction requests.
model_choice = st.selectbox(
    "Select the model you want to use:",
    ["gpt-3.5-turbo-0301", "gpt-3.5-turbo-0613", "gpt-3.5-turbo", "gpt-4-0314", "gpt-4-0613", "gpt-4"],
)

# Role/instruction prompt describing the extractor persona.
# NOTE(review): the wording "You will freeform text" appears to be missing a
# verb (e.g. "receive"); left unchanged here because it is a runtime string.
context = "You are a patent claims identifier and extractor. You will freeform text, identify any claims contained therein that may be patentable. You identify, extract, print such claims, briefly explain why each claim is patentable."

# Configure the OpenAI client only once a key has been entered.
if api_key:
    openai.api_key = api_key

st.write("### Patentable Claims:")

claims_extraction = ""

# Single placeholder that is overwritten with progress and the final result.
learning_status_placeholder = st.empty()
disable_button_bool = False

# The button is only rendered once there is text and an API key (short-circuit).
# Its widget key must differ from the session-state slot that stores the
# result: Streamlit refuses assignments to st.session_state[<widget key>] after
# that widget has been created.
if userinput and api_key and st.button(
    "Extract Claims", key="claims_extraction_button", disabled=disable_button_bool
):
    # --- Step 1: ask the chat model for claims, one request per input chunk.
    input_chunks = chunk_text(userinput)

    extracted_parts = []
    for chunk_number, chunk in enumerate(input_chunks, start=1):
        # enumerate() gives the correct position even when two chunks are equal.
        learning_status_placeholder.text(f"Extracting Patentable Claims for chunk {chunk_number}...")

        claims_extraction_response = openai.ChatCompletion.create(
            model=model_choice,
            messages=[
                # The persona prompt was defined but never sent; pass it as the
                # system message so it takes effect.
                {"role": "system", "content": context},
                {"role": "user", "content": f"Extract any patentable claims from the following: \n {chunk}. \n Extract each claim. Briefly explain why you extracted this word phrase. Exclude any additional commentary."},
            ],
        )

        claims_extraction = claims_extraction_response['choices'][0]['message']['content']
        extracted_parts.append(claims_extraction.strip())

    # Separate per-chunk answers with a newline so the last claim of one chunk
    # is not glued to the first claim of the next.
    all_extracted_claims = "\n".join(extracted_parts)

    st.session_state.claims_extraction = all_extracted_claims

    learning_status_placeholder.text(f"Patentable Claims Extracted!\n{all_extracted_claims.strip()}")

    # --- Step 2: summarise the extracted claims with Legal-BERT.
    # Imported lazily: these libraries are only needed after a click.
    from transformers import AutoConfig, AutoTokenizer, AutoModel
    from summarizer import Summarizer

    model_name = 'nlpaueb/legal-bert-base-uncased'

    # Build the summarizer once per session and reuse it on later clicks,
    # the same way the Whisper model is kept in session state.
    if 'legal_summarizer' not in st.session_state:
        custom_config = AutoConfig.from_pretrained(model_name)
        custom_config.output_hidden_states = True
        custom_tokenizer = AutoTokenizer.from_pretrained(model_name)
        custom_model = AutoModel.from_pretrained(model_name, config=custom_config)
        st.session_state.legal_summarizer = Summarizer(
            custom_model=custom_model, custom_tokenizer=custom_tokenizer
        )
    bert_legal_model = st.session_state.legal_summarizer
    print('Using model {}\n'.format(model_name))

    claims_extracted = st.session_state.claims_extraction

    # Summarise in fixed-size character windows, reusing the file's own chunker.
    chunk_size = 350
    chunks = chunk_text(claims_extracted, chunk_size)

    summaries = [
        bert_legal_model(chunk, min_length=8, ratio=0.05)
        for chunk in chunks
    ]

    for summary_number, summary in enumerate(summaries, start=1):
        st.write(f"### Summary {summary_number}")
        st.write(summary)