import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # Force CPU execution

from datasets import load_dataset
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.losses import sparse_categorical_crossentropy
import gradio as gr

# Load dataset
dataset = load_dataset("Cosmos-AI/Cosmos-dataset")

# Convert dataset to a pandas DataFrame; the 'train' split contains both questions and answers
dataset_df = pd.DataFrame(dataset['train'])

# Prepare data
questions = dataset_df['Question'].astype(str).tolist()
answers = dataset_df['Answer'].astype(str).tolist()

# Tokenize input data
tokenizer = Tokenizer(lower=False, oov_token="", filters='\t\n')
tokenizer.fit_on_texts(questions + answers)
word_index = tokenizer.word_index
index_word = {index: word for word, index in word_index.items()}  # Reverse lookup for decoding
num_words = len(word_index) + 1  # +1 for the padding index 0

# Convert text sequences to numerical sequences
question_sequences = tokenizer.texts_to_sequences(questions)
answer_sequences = tokenizer.texts_to_sequences(answers)

# Pad sequences to a uniform length
max_sequence_length = max(len(seq) for seq in question_sequences + answer_sequences)
print("MAX SEQUENCE LENGTH:", max_sequence_length)
question_sequences = pad_sequences(question_sequences, maxlen=max_sequence_length, padding='post')
answer_sequences = pad_sequences(answer_sequences, maxlen=max_sequence_length, padding='post')

# No one-hot encoding of the answers is needed: sparse_categorical_crossentropy
# consumes the integer target sequences directly, which also avoids allocating a
# huge (samples, timesteps, vocabulary) array in memory.

# Define model
model = Sequential([
    Embedding(num_words, 64),
    LSTM(64, return_sequences=True),
    Dense(num_words, activation='softmax')
])

# Compile model; integer targets pair with the sparse loss
model.compile(loss=sparse_categorical_crossentropy, optimizer='adam', metrics=['accuracy'])

# Train model; each epoch iterates over the full dataset
model.fit(question_sequences, answer_sequences, epochs=1000, batch_size=32)

# Generate a response for a single input string
def generate_response(input_text):
    # Tokenize and pad the input text
    input_sequence = tokenizer.texts_to_sequences([input_text])
    input_sequence = pad_sequences(input_sequence, maxlen=max_sequence_length, padding='post')

    # Predict a probability distribution over the vocabulary at every timestep
    predicted_sequence = model.predict(input_sequence)

    # Greedy decoding: take the most likely word at each timestep
    response_words = []
    for timestep in predicted_sequence[0]:
        predicted_word_index = int(np.argmax(timestep))
        predicted_word = index_word.get(predicted_word_index)
        if predicted_word is None:  # Index 0 (padding) has no word entry
            continue
        if predicted_word == 'eos':  # 'eos' marks the end of the sequence
            break
        response_words.append(predicted_word)
    return " ".join(response_words)

# Interface (gr.inputs / gr.outputs are deprecated; pass components directly)
gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Input Text"),
    outputs=gr.Textbox(label="Output Text"),
    title="Conversation Model",
    description="Enter your question and get a response.",
).launch()
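
# Optional sketch (an assumption, not part of the original script): persist the
# trained model and tokenizer so the already-imported load_model can restore
# them without retraining. The file names are arbitrary placeholders, and the
# ".keras" format assumes a reasonably recent Keras version.
#
# model.save("conversation_model.keras")
# with open("tokenizer.json", "w", encoding="utf-8") as f:
#     f.write(tokenizer.to_json())
# model = load_model("conversation_model.keras")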