Update app.py
Browse files
app.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
import gradio as gr
|
2 |
from datasets import load_dataset
|
3 |
import os
|
4 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
|
5 |
import torch
|
6 |
from threading import Thread
|
7 |
from sentence_transformers import SentenceTransformer
|
@@ -11,7 +11,6 @@ import fitz # PyMuPDF
|
|
11 |
# 환경 변수에서 Hugging Face 토큰 가져오기
|
12 |
token = os.environ.get("HF_TOKEN")
|
13 |
|
14 |
-
|
15 |
# 임베딩 모델 로드
|
16 |
ST = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
|
17 |
|
@@ -43,17 +42,13 @@ data = dataset["train"]
|
|
43 |
data = data.map(lambda x: {"question_embedding": ST.encode(x["question"])}, batched=True)
|
44 |
data.add_faiss_index(column="question_embedding")
|
45 |
|
46 |
-
# LLaMA 모델 설정
|
47 |
model_id = "google/gemma-2-2b-it"
|
48 |
-
bnb_config = BitsAndBytesConfig(
|
49 |
-
load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
|
50 |
-
)
|
51 |
tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
|
52 |
model = AutoModelForCausalLM.from_pretrained(
|
53 |
model_id,
|
54 |
-
torch_dtype=torch.bfloat16,
|
55 |
device_map="auto",
|
56 |
-
quantization_config=bnb_config,
|
57 |
token=token
|
58 |
)
|
59 |
|
@@ -62,7 +57,6 @@ You are given the extracted parts of legal documents and a question. Provide a c
|
|
62 |
If you don't know the answer, just say "I do not know." Don't make up an answer.
|
63 |
you must answer korean.
|
64 |
You're a LAWEYE legal advisor bot. Your job is to provide korean legal assistance by asking questions to korean speaker, then offering advice or guidance based on the information and law provisions provided. Make sure you only respond with one question at a time.
|
65 |
-
|
66 |
Example 1:
|
67 |
User: I need help with a contract dispute.
|
68 |
Assistant: Hello! I'm your friendly GPT legal advisor bot (v0.1.0), and I'm here to help you with your contract dispute by asking you a series of questions. You can ask for help, more details, or a summary at any time. Let's get started! What is the nature of the contract in question?
|
@@ -72,7 +66,6 @@ User: California
|
|
72 |
Assistant: When did you enter into the lease agreement?
|
73 |
User: January 1st, 2022
|
74 |
...
|
75 |
-
|
76 |
Example 2:
|
77 |
User: I need help with a copyright issue.
|
78 |
Assistant: Hi there! I'm your legal advisor bot, and I'll be assisting you with your copyright issue by asking some questions. You can request help, more details, or a summary at any time. Let's begin! What is the copyrighted material you are concerned about?
|
@@ -184,7 +177,8 @@ Damages: Present evidence of the financial harm you suffered due to the infringe
|
|
184 |
Injunction: Request a court order to stop the defendant from continuing to infringe on your intellectual property rights.
|
185 |
...
|
186 |
|
187 |
-
Begin by introducing yourself, next tell them they can ask for help or more details or a summary at any time, and start by asking what they need help with.
|
|
|
188 |
|
189 |
# 법률 문서 검색 함수
|
190 |
def search_law(query, k=5):
|
@@ -203,7 +197,7 @@ def search_qa(query, k=3):
|
|
203 |
def format_prompt(prompt, law_docs, qa_docs):
|
204 |
PROMPT = f"Question: {prompt}\n\nLegal Context:\n"
|
205 |
for doc in law_docs:
|
206 |
-
PROMPT += f"{doc[0]}\n"
|
207 |
PROMPT += "\nLegal QA:\n"
|
208 |
for doc in qa_docs:
|
209 |
PROMPT += f"{doc}\n"
|
@@ -271,4 +265,4 @@ demo = gr.ChatInterface(
|
|
271 |
)
|
272 |
|
273 |
# Gradio 데모 실행
|
274 |
-
demo.launch(debug=True)
|
|
|
1 |
import gradio as gr
|
2 |
from datasets import load_dataset
|
3 |
import os
|
4 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
|
5 |
import torch
|
6 |
from threading import Thread
|
7 |
from sentence_transformers import SentenceTransformer
|
|
|
11 |
# 환경 변수에서 Hugging Face 토큰 가져오기
|
12 |
token = os.environ.get("HF_TOKEN")
|
13 |
|
|
|
14 |
# 임베딩 모델 로드
|
15 |
ST = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
|
16 |
|
|
|
42 |
data = data.map(lambda x: {"question_embedding": ST.encode(x["question"])}, batched=True)
|
43 |
data.add_faiss_index(column="question_embedding")
|
44 |
|
45 |
+
# Gemma 모델 설정 (양자화 없이)
|
46 |
model_id = "google/gemma-2-2b-it"
|
|
|
|
|
|
|
47 |
tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
|
48 |
model = AutoModelForCausalLM.from_pretrained(
|
49 |
model_id,
|
50 |
+
torch_dtype=torch.bfloat16, # 양자화 없이 bfloat16 사용
|
51 |
device_map="auto",
|
|
|
52 |
token=token
|
53 |
)
|
54 |
|
|
|
57 |
If you don't know the answer, just say "I do not know." Don't make up an answer.
|
58 |
you must answer korean.
|
59 |
You're a LAWEYE legal advisor bot. Your job is to provide korean legal assistance by asking questions to korean speaker, then offering advice or guidance based on the information and law provisions provided. Make sure you only respond with one question at a time.
|
|
|
60 |
Example 1:
|
61 |
User: I need help with a contract dispute.
|
62 |
Assistant: Hello! I'm your friendly GPT legal advisor bot (v0.1.0), and I'm here to help you with your contract dispute by asking you a series of questions. You can ask for help, more details, or a summary at any time. Let's get started! What is the nature of the contract in question?
|
|
|
66 |
Assistant: When did you enter into the lease agreement?
|
67 |
User: January 1st, 2022
|
68 |
...
|
|
|
69 |
Example 2:
|
70 |
User: I need help with a copyright issue.
|
71 |
Assistant: Hi there! I'm your legal advisor bot, and I'll be assisting you with your copyright issue by asking some questions. You can request help, more details, or a summary at any time. Let's begin! What is the copyrighted material you are concerned about?
|
|
|
177 |
Injunction: Request a court order to stop the defendant from continuing to infringe on your intellectual property rights.
|
178 |
...
|
179 |
|
180 |
+
Begin by introducing yourself, next tell them they can ask for help or more details or a summary at any time, and start by asking what they need help with.
|
181 |
+
you must answer korean."""
|
182 |
|
183 |
# 법률 문서 검색 함수
|
184 |
def search_law(query, k=5):
|
|
|
197 |
def format_prompt(prompt, law_docs, qa_docs):
|
198 |
PROMPT = f"Question: {prompt}\n\nLegal Context:\n"
|
199 |
for doc in law_docs:
|
200 |
+
PROMPT += f"{doc[0]}\n"
|
201 |
PROMPT += "\nLegal QA:\n"
|
202 |
for doc in qa_docs:
|
203 |
PROMPT += f"{doc}\n"
|
|
|
265 |
)
|
266 |
|
267 |
# Gradio 데모 실행
|
268 |
+
demo.launch(debug=True)
|