from flask import Flask, request, Response import logging from llama_cpp import Llama import threading from huggingface_hub import snapshot_download SYSTEM_PROMPT = "Ты — русскоязычный автоматический ассистент. Ты максимально точно и отвечаешь на запросы пользователя, используя русский язык." SYSTEM_TOKEN = 1788 USER_TOKEN = 1404 BOT_TOKEN = 9225 LINEBREAK_TOKEN = 13 ROLE_TOKENS = { "user": USER_TOKEN, "bot": BOT_TOKEN, "system": SYSTEM_TOKEN } # Create a lock object lock = threading.Lock() app = Flask(__name__) # Configure Flask logging app.logger.setLevel(logging.DEBUG) # Set the desired logging level # Initialize the model when the application starts #model_path = "../models/model-q4_K.gguf" # Replace with the actual model path #model_name = "model/ggml-model-q4_K.gguf" #repo_name = "IlyaGusev/saiga2_13b_gguf" #model_name = "model-q4_K.gguf" repo_name = "IlyaGusev/saiga2_70b_gguf" model_name = "ggml-model-q4_1.gguf" snapshot_download(repo_id=repo_name, local_dir=".", allow_patterns=model_name) def get_message_tokens(model, role, content): message_tokens = model.tokenize(content.encode("utf-8")) message_tokens.insert(1, ROLE_TOKENS[role]) message_tokens.insert(2, LINEBREAK_TOKEN) message_tokens.append(model.token_eos()) return message_tokens def get_system_tokens(model): system_message = { "role": "system", "content": SYSTEM_PROMPT } return get_message_tokens(model, **system_message) def get_system_tokens_for_preprompt(model, preprompt): system_message = { "role": "system", "content": preprompt } return get_message_tokens(model, **system_message) app.logger.info('Evaluating system tokens start') #system_tokens = get_system_tokens(model) #model.eval(system_tokens) app.logger.info('Evaluating system tokens end') stop_generation = False def generate_tokens(model, generator): global stop_generation app.logger.info('generate_tokens started') #with lock: try: for token in generator: if token == model.token_eos() or stop_generation: stop_generation = False app.logger.info('Abort generating') yield b'' # End of chunk break token_str = model.detokenize([token])#.decode("utf-8", errors="ignore") yield token_str except Exception as e: app.logger.info('generator exception') yield b'' # End of chunk @app.route('/stop_generation', methods=['GET']) def handler_stop_generation(): global stop_generation stop_generation = True return Response('Stopped', content_type='text/plain') @app.route('/', methods=['GET', 'PUT', 'DELETE', 'PATCH']) def generate_unknown_response(): app.logger.info('unknown method: '+request.method) try: request_payload = request.get_json() app.logger.info('payload: '+request.get_json()) except Exception as e: app.logger.info('payload empty') return Response('What do you want?', content_type='text/plain') @app.route('/search_request', methods=['POST']) def generate_search_request(): global stop_generation stop_generation = False data = request.get_json() app.logger.info(data) user_query = data.get("query", "") preprompt = data.get("preprompt", "Ты — русскоязычный автоматический ассистент для написании запросов для поисковых систем на русском языке. Отвечай на сообщения пользователя только текстом поискового запроса, релевантным запросу пользователя. Если запрос пользователя уже хорош, используй его в качестве результата.") parameters = data.get("parameters", {}) # Extract parameters from the request temperature = 0.01 truncate = parameters.get("truncate", 1000) max_new_tokens = parameters.get("max_new_tokens", 1024) top_p = 0.8 repetition_penalty = parameters.get("repetition_penalty", 1.2) top_k = 20 return_full_text = parameters.get("return_full_text", False) model = Llama( model_path=model_name, n_ctx=2000, n_parts=1, #n_batch=100, logits_all=True, #n_threads=12, verbose=True, n_gpu_layers=20, n_gqa=8 #must be set for 70b models ) tokens = get_system_tokens_for_preprompt(model, preprompt) tokens.append(LINEBREAK_TOKEN) tokens = get_message_tokens(model=model, role="user", content=user_query[:200]) + [model.token_bos(), BOT_TOKEN, LINEBREAK_TOKEN] generator = model.generate( tokens, top_k=top_k, top_p=top_p, temp=temperature, repeat_penalty=repetition_penalty ) # Use Response to stream tokens return Response(generate_tokens(model, generator), content_type='text/plain', status=200, direct_passthrough=True) @app.route('/', methods=['POST']) def generate_response(): global stop_generation stop_generation = False data = request.get_json() app.logger.info(data) messages = data.get("messages", []) preprompt = data.get("preprompt", "") parameters = data.get("parameters", {}) # Extract parameters from the request temperature = 0.02#parameters.get("temperature", 0.01) truncate = parameters.get("truncate", 1000) max_new_tokens = parameters.get("max_new_tokens", 1024) top_p = 80#parameters.get("top_p", 0.85) repetition_penalty = parameters.get("repetition_penalty", 1.2) top_k = 25#parameters.get("top_k", 30) return_full_text = parameters.get("return_full_text", False) model = Llama( model_path=model_name, n_ctx=2000, n_parts=1, #n_batch=100, logits_all=True, #n_threads=12, verbose=True, n_gpu_layers=20, n_gqa=8 #must be set for 70b models ) # Generate the response #system_tokens = get_system_tokens(model) #tokens = system_tokens #if preprompt != "": # tokens = get_system_tokens_for_preprompt(model, preprompt) #else: tokens = get_system_tokens(model) tokens.append(LINEBREAK_TOKEN) #model.eval(tokens) tokens = [] for message in messages:#[:-1]: if message.get("from") == "assistant": message_tokens = get_message_tokens(model=model, role="bot", content=message.get("content", "")) else: message_tokens = get_message_tokens(model=model, role="user", content=message.get("content", "")) tokens.extend(message_tokens) #LINEBREAK_TOKEN) #app.logger.info('model.eval start') #model.eval(tokens) #app.logger.info('model.eval end') #last_message = messages[-1] #if last_message.get("from") == "assistant": # last_message_tokens = get_message_tokens(model=model, role="bot", content=last_message.get("content", "")) #else: # last_message_tokens = get_message_tokens(model=model, role="user", content=last_message.get("content", "")) tokens.extend([model.token_bos(), BOT_TOKEN, LINEBREAK_TOKEN]) app.logger.info('Prompt:') app.logger.info(model.detokenize(tokens).decode("utf-8", errors="ignore")) app.logger.info('Generate started') generator = model.generate( tokens, top_k=top_k, top_p=top_p, temp=temperature, repeat_penalty=repetition_penalty ) app.logger.info('Generator created') # Use Response to stream tokens return Response(generate_tokens(model, generator), content_type='text/plain', status=200, direct_passthrough=True) if __name__ == "__main__": app.run(host="0.0.0.0", port=7860, debug=False, threaded=True)