|
import os |
|
import re |
|
import json |
|
import time |
|
import requests |
|
import gradio as gr |
|
|
|
import google.auth |
|
from google.auth.transport.requests import Request |
|
|
|
import google.generativeai as genai |
|
|
|
# Configure the Gemini SDK once at import time. The key is read from the
# GEMINI_API_KEY environment variable; None is passed through when it is unset.
_gemini_api_key = os.environ.get("GEMINI_API_KEY")
genai.configure(api_key=_gemini_api_key)
|
|
|
def upload_to_gemini(path, mime_type=None):
    """Upload a local file through the Gemini SDK and return the result.

    Args:
        path: Location of the file handed to ``genai.upload_file``.
        mime_type: Optional MIME type forwarded unchanged to the SDK.

    Returns:
        The object returned by ``genai.upload_file``. Its ``display_name``
        and ``uri`` attributes are printed for traceability.
    """
    uploaded = genai.upload_file(path, mime_type=mime_type)
    print(f"Uploaded file '{uploaded.display_name}' as: {uploaded.uri}")
    return uploaded
|
|
|
# Generation parameters passed to the Gemini model constructed further down.
generation_config = dict(
    temperature=1,
    top_p=0.95,
    top_k=64,
    max_output_tokens=1_048_576,
    response_mime_type="text/plain",
)
|
|
|
# Every listed harm category is given the same "BLOCK_NONE" threshold.
safety_settings = [
    {"category": category, "threshold": "BLOCK_NONE"}
    for category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]
|
|
|
# Module-wide Gemini model used by the partial-transliteration path. It is
# built once at import time from the generation and safety settings above.
_SYSTEM_INSTRUCTION = "Act as a language model trained on a specific style of writing that incorporates both Roman and Devanagari script"

model = genai.GenerativeModel(
    model_name="gemini-1.5-pro-latest",
    system_instruction=_SYSTEM_INSTRUCTION,
    generation_config=generation_config,
    safety_settings=safety_settings,
)
|
|
|
|
|
def _parse_transliteration_reply(raw):
    """Decode the model's reply into a dict, tolerating Markdown code fences.

    The reply is expected to be JSON, optionally wrapped in a fenced code
    block. When the model returns a list, the first element is used.

    Raises:
        ValueError: If the reply is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError``) or is an empty list.
    """
    stripped = raw.replace("```json", "").replace("```", "").replace("\n", "")
    data = json.loads(stripped)
    if isinstance(data, list):
        if not data:
            raise ValueError("Gemini returned an empty list of transliterations")
        data = data[0]
    return data


def generate_transliteration_gemini_15_pro(text):
    """Partially transliterate English text to Devanagari using Gemini.

    The examples file ``ai_exp_json.txt`` is uploaded on every call and
    attached to the chat history together with the task instructions. The
    sentence is then sent as a follow-up message, and the ``"transliterate"``
    field of the JSON reply is post-processed with
    ``clean_hindi_transliterated_text``.

    Args:
        text: The English sentence to transliterate.

    Returns:
        The cleaned mixed Roman/Devanagari string.

    Raises:
        ValueError: If the reply is not valid JSON or is an empty list.
        KeyError: If the reply has no ``"transliterate"`` field.
    """
    transliteration_example_file = upload_to_gemini(
        "ai_exp_json.txt", mime_type="text/plain"
    )

    chat_session = model.start_chat(
        history=[
            {
                "role": "user",
                "parts": [
                    "Given a sentence in Roman written English and a set of pre-defined patterns, transliterate only specific words to Devanagari script while maintaining a desired ratio between Roman and Devanagari words. Your task is to transliterate only a subset of words while maintaining the overall meaning and sentence structure.\n",
                    'Based on a provided English sentence and a desired transliteration ratio, use your knowledge of this unique style to select words for transliteration that enhance the overall message and aesthetic. I will provide you with training examples to understand the preferred approach.\nGo through the examples in the file in following JSON format: [{"English": xxx, "Transliteration"}]." and Develop a system that can intelligently choose which English words to transliterate into Devanagari in a sentence, aiming for a specific ratio between the two scripts. With the help of examples in Json format file, design a system that can learn the optimal ratio and transliteration pattern.',
                    transliteration_example_file,
                ],
            },
        ]
    )
    response = chat_session.send_message(
        'Given an English sentences: \n```' + text + '\n```\nTransliterate English sentences into a mix of Roman and Devanagari script, following a predefined pattern or learning from provided examples above without explain anything.\nReturn output in JSON in following format for the list of sentences: {"text": xxx, "transliterate": xxx}'
    )
    # The raw reply is printed before parsing so a malformed answer can be
    # inspected in the logs.
    print(response.text)
    data = _parse_transliteration_reply(response.text)
    return clean_hindi_transliterated_text(data["transliterate"])
|
|
|
|
|
|
|
def update_text_from_dictionary(text, dictionary_path="./en_hi.dict", initial_lookup=True):
    """Replace whole words in *text* using a pipe-separated dictionary file.

    Each usable line of the dictionary has three fields,
    ``initial|incorrect|correct``. With ``initial_lookup`` truthy the
    ``initial`` field is the lookup key; otherwise the ``incorrect`` field is.
    In both cases the replacement is ``correct``. Each key is also registered
    with a trailing ``.``, ``?`` or ``,`` so that words followed directly by
    that punctuation still match.

    Args:
        text: Input text. It is split on whitespace, so runs of whitespace
            collapse to single spaces in the result.
        dictionary_path: Path to the dictionary file. A falsy value disables
            the lookup and returns *text* unchanged.
        initial_lookup: Selects which dictionary column is used as the key.

    Returns:
        The text with matching words replaced.

    Raises:
        OSError: If the dictionary file cannot be opened.
    """
    if not dictionary_path:
        return text

    replacements = {}
    # The dictionary contains Devanagari text, so the encoding is pinned
    # rather than left to the platform default.
    with open(dictionary_path, encoding="utf-8") as handle:
        for line in handle.read().splitlines():
            fields = line.split("|")
            if len(fields) != 3:
                # Blank or malformed lines are skipped instead of aborting
                # the whole lookup.
                continue
            initial, incorrect, correct = fields
            key = initial if initial_lookup else incorrect
            replacements[key] = correct
            for mark in (".", "?", ","):
                replacements[key + mark] = correct + mark

    print(f"Original [{initial_lookup}]: ", text)
    new_text = " ".join(replacements.get(token, token) for token in text.split())
    print(f"New [{initial_lookup}]: ", new_text)
    return new_text
|
|
|
|
|
def get_google_token():
    """Return a fresh Google access token for the service-account key.

    The key is read as JSON from the ``GCP_FINETUNE_KEY`` environment
    variable. Credentials are loaded with the cloud-platform and
    generative-language tuning scopes, then refreshed to obtain a token.

    Returns:
        The access token (``credentials.token``) after the refresh.

    Raises:
        ValueError: If ``GCP_FINETUNE_KEY`` is unset, empty, or not valid
            JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    raw_key = os.environ.get('GCP_FINETUNE_KEY')
    if not raw_key:
        # Fail with an actionable message instead of the opaque TypeError
        # that json.loads(None) would raise.
        raise ValueError(
            "GCP_FINETUNE_KEY environment variable is not set; it must hold "
            "the service-account key as a JSON string"
        )

    credentials, _project = google.auth.load_credentials_from_dict(
        json.loads(raw_key),
        scopes=[
            "https://www.googleapis.com/auth/cloud-platform",
            "https://www.googleapis.com/auth/generative-language.tuning",
        ],
    )
    credentials.refresh(Request())
    return credentials.token
|
|
|
|
|
def transliterate_first_word(text):
    """Transliterate only the first word of *text* via Google Input Tools.

    The text is split once on whitespace. If there is no first word, or the
    first word is not purely alphanumeric, *text* is returned unchanged and
    no request is made. Otherwise the first suggestion from the service
    replaces the first word.

    Args:
        text: Input text.

    Returns:
        ``"<transliterated first word> <rest>"``, or *text* unchanged when
        nothing is transliterated. ``rest`` is empty for one-word input, so
        the result then ends with a single space.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    words = text.split(maxsplit=1)
    if not words:
        # Empty or whitespace-only input has no first word to transliterate.
        return text

    first_word = words[0]
    rest = "".join(words[1:])  # words[1:] holds at most one element
    if not first_word.isalnum():
        return text

    params = {
        "text": first_word,
        "num": 1,
        "itc": "hi-t-i0-und",
        "cp": 0,
        "cs": 1,
        "ie": "utf-8",
        "app": "demopage",
    }
    # A timeout keeps an unresponsive service from hanging the caller.
    response = requests.get(
        "https://inputtools.google.com/request", params=params, timeout=10
    )
    response.raise_for_status()
    suggestions = response.json()[1][0][1]
    return f"{suggestions[0]} {rest}"
|
|
|
|
|
def clean(result):
    """Extract and tidy the transliterated text from a chat-completion reply.

    Reads ``result["choices"][0]["message"]["content"]``, removes any
    parenthesised or bracketed spans, drops quote and backtick characters,
    and keeps only the last line of a multi-line reply. The result is then
    passed through ``clean_hindi_transliterated_text``.

    Args:
        result: Decoded JSON response containing ``choices``.

    Returns:
        The cleaned transliteration string.
    """
    text = result["choices"][0]['message']["content"]
    text = re.sub(r"\(.*?\)|\[.*?\]", "", text)
    text = text.strip("'").replace('"', "").replace('`', "")
    # Take the last line of the newline-stripped text. Splitting the
    # unstripped text would return "" whenever the reply ends with "\n".
    trimmed = text.strip("\n")
    if "\n" in trimmed:
        text = trimmed.split("\n")[-1]
    return clean_hindi_transliterated_text(text)
|
|
|
|
|
def clean_hindi_transliterated_text(text):
    """Normalise model output and apply the final dictionary corrections.

    The substitutions run strictly in the listed order: backticks and the
    ``output:`` label are removed first, then three Devanagari characters are
    swapped, then braces, a literal ``'text'`` and colons are stripped.
    Surrounding whitespace and quotes are trimmed before the text is handed
    to ``update_text_from_dictionary`` with ``initial_lookup=False``.
    """
    ordered_substitutions = (
        ('`', ''),
        ("output:", ""),
        ('ऑ', 'औ'),
        ('ॉ', 'ौ'),
        ('ॅ', 'े'),
        ("{", ""),
        ("}", ""),
        ("'text'", ""),
        (":", ""),
    )
    for old, new in ordered_substitutions:
        text = text.replace(old, new)

    trimmed = text.strip()
    trimmed = trimmed.strip("'")
    trimmed = trimmed.strip('"')
    return update_text_from_dictionary(trimmed, initial_lookup=False)
|
|
|
|
|
|
|
|
|
|
|
def dubpro_english_transliteration(text, call_gpt):
    """Transliterate English text to Hindi script.

    Args:
        text: English input text.
        call_gpt: When truthy, the text is first passed through the
            dictionary's initial lookup and then fully transliterated by the
            OpenAI chat-completions endpoint. When falsy, the Gemini partial
            transliteration path is used.

    Returns:
        The cleaned transliterated string.

    Raises:
        requests.RequestException: If the OpenAI request fails or times out,
            or still returns an HTTP error status after all attempts.
        RuntimeError: If the final response is neither 200 nor an HTTP error
            status.
    """
    if not call_gpt:
        return generate_transliteration_gemini_15_pro(text)

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
    }

    text = update_text_from_dictionary(text, initial_lookup=True)

    prompt = f"Given the English text, transliterate it to Hindi, without translation. Return only the transliterated text, without any instruction or messages. Text: `{text}`\nOutput: "
    payload = {
        "model": "gpt-4o-2024-05-13",
        "messages": [{"role": "user", "content": prompt}],
    }

    # Bounded retry: the request is repeated on a non-200 response, then the
    # failure is surfaced instead of parsing an error body as a completion.
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        resp = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=60,
        )
        if resp.status_code == 200:
            return clean(resp.json())
        print(resp.text)
        if attempt != max_attempts:
            time.sleep(0.5)

    resp.raise_for_status()
    raise RuntimeError(f"OpenAI request failed with status {resp.status_code}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_rephrases_gemini(text, language, problem):
    """Ask the rephraser endpoint for a shorter or longer version of *text*.

    Args:
        text: The line to rephrase.
        language: ``"English"`` or ``"Hindi"``; selects the prompt.
        problem: ``"Gap"`` asks for a rephrase that takes more time to speak;
            any other value asks for one that takes less.

    Returns:
        A ``(rephrased_text, word_count_summary)`` tuple. Hyphens are removed
        from every output line and each line is stripped.

    Raises:
        ValueError: If *language* is not one of the supported values.
        requests.RequestException: If the request fails, times out, or
            returns an HTTP error status.
    """
    speak = "more" if problem == "Gap" else "less"

    # The language is validated before any credentials are fetched, so an
    # unsupported value fails fast with a clear message rather than an
    # unbound ``prompt`` variable.
    if language == "English":
        prompt = f"You are an English and Hindi language expert, please rephrase a sentence that has been translated from Hindi to English so that it takes little {speak} time to speak."
    elif language == "Hindi":
        prompt = f"You are a hindi language expert please rephrase the below line without summary so that it takes little {speak} time to speak in hinglish manner."
    else:
        raise ValueError(
            f"Unsupported language: {language!r} (expected 'English' or 'Hindi')"
        )

    api_url = os.environ.get("GEMINI_REPHRASER_API")
    headers = {
        "Authorization": f"Bearer {get_google_token()}",
        "Content-Type": "application/json",
    }
    payload = {
        "contents": [
            {
                "parts": [
                    {"text": prompt},
                    {"text": f"input: {text}"},
                    {"text": "output: "},
                ],
                "role": "user",
            }
        ],
        "generationConfig": {
            "maxOutputTokens": 8192,
            "temperature": 0.85,
            "candidateCount": 1,
        },
        "safetySettings": [
            {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
        ],
    }

    result = requests.post(url=api_url, headers=headers, json=payload, timeout=60)
    # An HTTP error is surfaced here instead of as a KeyError on
    # "candidates" further down.
    result.raise_for_status()
    response = result.json()
    output_text = response["candidates"][0]["content"]["parts"][0]["text"]

    texts = "\n".join(
        line.replace("-", "").strip() for line in output_text.split("\n")
    )

    wc = f"Original Word Count: {len(text.split())}\nRephrased Word Count: {len(texts.split())}"

    return texts, wc
|
|
|
|
|
# Gradio UI with two tabs, one per tool.
# NOTE(review): the nesting below was reconstructed from a listing whose
# indentation had been stripped. Confirm the Row/Column layout against the
# deployed app.
with gr.Blocks() as demo:
    gr.Markdown("# Translator Assistance Tools")

    # Tab 1: English -> Hindi transliteration. The checkbox value is passed
    # as the ``call_gpt`` argument of dubpro_english_transliteration.
    with gr.Tab("Transliteration"):
        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(label="Input text", info="Please enter English text.")
                full_transliteration = gr.Checkbox(label="Full transliteration", value=True)
            output_text = gr.Textbox(label="Output text")
        transliterate = gr.Button("Submit")
        transliterate.click(
            fn=dubpro_english_transliteration,
            inputs=[input_text, full_transliteration],
            outputs=output_text,
        )

    # Tab 2: rephrase a line so that it takes more ("Gap") or less
    # ("Overflow") time to speak.
    with gr.Tab("Rephraser Tool"):
        with gr.Row():
            rephrase_text = gr.Textbox(label="Input text", info="Please enter text.")
            language = gr.Dropdown(choices=["English", "Hindi"], value="Hindi")
            solving_for = gr.Dropdown(choices=["Gap", "Overflow"], value="Overflow", label="Solving for:")
        with gr.Row():
            word_count = gr.Textbox(label="Word count")
            rephrased_text = gr.Textbox(label="Output text")
        rephrase = gr.Button("Submit")
        rephrase.click(
            fn=generate_rephrases_gemini,
            inputs=[rephrase_text, language, solving_for],
            outputs=[rephrased_text, word_count],
        )
|
|
|
|
|
# Start the app behind Gradio's login prompt. The credential pair is read
# from the USERNAME and PASSWORD environment variables.
_login_credentials = (os.environ.get("USERNAME"), os.environ.get("PASSWORD"))
demo.launch(auth=_login_credentials)