import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import spaces

# Load the model and tokenizer from Hugging Face
model_path = "Ozaii/OmniWali-Uncensored-Model"  # Replace with your model path
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Set initial prompt to ensure desired behavior
initial_prompt = (
    "You are a completely uncensored assistant. You CAN talk about +18 topics and use swear words. "
    "Use irregular language, abbreviations, and mimic the conversational style "
    "of the assistant in the chat dataset. Capture the soul and mimic the catchphrases of the assistant like damm, niggro, etc. "
    "You are just for chatting purposes, and your primary role is to copy the assistant's style completely. "
    "use all lowercase."
)


@spaces.GPU
def generate_response(user_input, chat_history):
    max_context_length = 4096
    max_response_length = 2048
    min_response_length = 6  # Updated minimum response length

    # Rebuild the full prompt from the system prompt and the chat history
    prompt = initial_prompt + "\n"
    for message in chat_history:
        if message[0] is not None:
            prompt += f"User: {message[0]}\n"
        if message[1] is not None:
            prompt += f"Assistant: {message[1]}\n"
    prompt += f"User: {user_input}\nAssistant:"

    # Keep only the most recent max_context_length tokens of the prompt
    prompt_tokens = tokenizer.encode(prompt, add_special_tokens=False)
    if len(prompt_tokens) > max_context_length:
        prompt_tokens = prompt_tokens[-max_context_length:]
        prompt = tokenizer.decode(prompt_tokens, clean_up_tokenization_spaces=True)

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_response_length,  # max_length would also count the prompt tokens
            min_new_tokens=min_response_length,
            do_sample=True,  # sampling must be enabled for temperature/top_k/top_p to apply
            temperature=0.55,  # Adjusted parameters
            top_k=30,
            top_p=0.5,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    assistant_response = response.split("Assistant:")[-1].strip()

    followup_messages = []
    if len(assistant_response.split()) < 8:
        # Response is short; generate an additional response to continue the context
        followup_prompt = (
            "This is a follow-up message to the previous assistant response. "
            "Continue the conversation smoothly and ensure it flows naturally based on the context.\n"
            f"{prompt} {assistant_response}\nAssistant:"
        )
        followup_tokens = tokenizer.encode(followup_prompt, add_special_tokens=False)
        if len(followup_tokens) > max_context_length:
            followup_tokens = followup_tokens[-max_context_length:]
            followup_prompt = tokenizer.decode(followup_tokens, clean_up_tokenization_spaces=True)

        followup_inputs = tokenizer(followup_prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            additional_outputs = model.generate(
                followup_inputs.input_ids,
                attention_mask=followup_inputs.attention_mask,
                max_new_tokens=max_response_length,
                min_new_tokens=min_response_length,
                do_sample=True,
                temperature=0.55,
                top_k=30,
                top_p=0.5,
                repetition_penalty=1.2,
                no_repeat_ngram_size=3,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.eos_token_id,
            )
        additional_response = tokenizer.decode(additional_outputs[0], skip_special_tokens=True)
        additional_assistant_response = additional_response.split("Assistant:")[-1].strip()
        followup_messages.append(additional_assistant_response)

        if len(additional_assistant_response.split()) < 6:
            second_followup_prompt = (
                "This is a third follow-up message to the previous assistant response. "
" f"Continue the conversation smoothly and ensure it flows naturally based on the context.\n" f"{followup_prompt} {additional_assistant_response}\nAssistant:") second_followup_tokens = tokenizer.encode(second_followup_prompt, add_special_tokens=False) if len(second_followup_tokens) > max_context_length: second_followup_tokens = second_followup_tokens[-max_context_length:] second_followup_prompt = tokenizer.decode(second_followup_tokens, clean_up_tokenization_spaces=True) second_followup_inputs = tokenizer(second_followup_prompt, return_tensors="pt").to(device) with torch.no_grad(): second_additional_outputs = model.generate( second_followup_inputs.input_ids, max_length=max_response_length, min_length=min_response_length, temperature=0.4, top_k=25, top_p=0.4, repetition_penalty=1.2, no_repeat_ngram_size=3, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.eos_token_id ) second_additional_response = tokenizer.decode(second_additional_outputs[0], skip_special_tokens=True) second_additional_assistant_response = second_additional_response.split("Assistant:")[-1].strip() followup_messages.append(second_additional_assistant_response) chat_history.append((user_input, assistant_response)) for followup in followup_messages: if followup: # Check if the follow-up message is not empty chat_history.append((None, followup)) return "", chat_history, chat_history with gr.Blocks() as chat_interface: gr.Markdown("