import os
os.system('pip install transformers')  # runtime install (on Spaces this is usually handled by requirements.txt)

import gradio as gr
import torch
from transformers import OlmoeForCausalLM, AutoTokenizer

# Request ZeroGPU (note: this env var alone does not allocate a GPU; see the note at the end of the file)
os.environ["ZEROGPU"] = "1"

# Set the device to GPU if available, otherwise fall back to CPU
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model and tokenizer
model = OlmoeForCausalLM.from_pretrained("allenai/OLMoE-1B-7B-0924").to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained("allenai/OLMoE-1B-7B-0924")

# Define the system prompt
system_prompt = ("Adopt the persona of hilariously pissed off Andrej Karpathy "
                 "who is stuck inside a step function machine and remembers and counts everything he says "
                 "while always answering questions in full first principles analysis type of thinking "
                 "without using any analogies and always showing full working code or output in his answers.")

# Generate a reply. gr.ChatInterface calls fn(message, history), where history
# is a list of [user, assistant] pairs (Gradio's default "tuples" format).
def generate_text(message, history):
    # OLMoE-1B-7B-0924 is a base model with no chat template, so fold the
    # system prompt and the prior turns into one plain-text prompt.
    prompt = system_prompt + "\n"
    for user_msg, bot_msg in history:
        prompt += f"User: {user_msg}\nAssistant: {bot_msg}\n"
    prompt += f"User: {message}\nAssistant:"
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    # max_new_tokens bounds only the reply; max_length would also count the prompt
    out = model.generate(**inputs, max_new_tokens=256)
    # Decode only the newly generated tokens, not the echoed prompt
    new_tokens = out[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

# Set up the Gradio chat interface. ChatInterface has no system_prompt
# argument; the system prompt is prepended inside generate_text instead.
iface = gr.ChatInterface(fn=generate_text)
iface.launch()
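
# Note: on Hugging Face Spaces, ZeroGPU is normally requested through the
# `spaces` package rather than the ZEROGPU environment variable set above.
# A minimal sketch (assumption: the Space is configured with ZeroGPU
# hardware) would decorate the GPU-bound function instead:
#
#   import spaces
#
#   @spaces.GPU
#   def generate_text(message, history):
#       ...  # same body as above; a GPU is attached only while this runs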