File size: 6,838 Bytes
e3a7b6f
 
 
a17dc9a
 
e3a7b6f
 
 
d6a8693
e3a7b6f
d6a8693
e3a7b6f
 
d6a8693
e3a7b6f
 
d6a8693
 
 
 
 
e3a7b6f
a17dc9a
e3a7b6f
 
 
d6a8693
4a07537
 
d6a8693
4a07537
d6a8693
e3a7b6f
 
d6a8693
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e3a7b6f
 
 
 
 
 
d6a8693
e3a7b6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d6a8693
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1608585
 
 
d6a8693
 
 
 
1608585
 
e3a7b6f
6fc515c
d6a8693
 
 
e3a7b6f
 
 
 
 
d6a8693
 
 
e3a7b6f
 
 
d6a8693
 
e3a7b6f
 
 
 
 
 
 
 
 
a17dc9a
d6a8693
a17dc9a
 
 
5e2b717
e3a7b6f
 
 
 
 
 
 
e6dda1e
d6a8693
a17dc9a
 
 
d6a8693
a17dc9a
 
 
 
 
 
e3a7b6f
a17dc9a
e3a7b6f
 
 
 
 
a17dc9a
e3a7b6f
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from llama_cpp import Llama
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import uvicorn
from dotenv import load_dotenv
from difflib import SequenceMatcher
import re

# Load environment variables
load_dotenv()

# Initialize the FastAPI application
app = FastAPI()

# Diccionario global para almacenar los modelos
# Module-wide shared state; the 'models' entry holds the loaded model objects
# and starts out empty.
global_data = dict(models=[])

# Configuración de los modelos
# (repo_id, filename) pairs, one per GGUF model file to load.
_MODEL_SPECS = (
    ("Ffftdtd5dtft/gpt2-xl-Q2_K-GGUF", "gpt2-xl-q2_k.gguf"),
    ("Ffftdtd5dtft/Meta-Llama-3.1-8B-Instruct-Q2_K-GGUF", "meta-llama-3.1-8b-instruct-q2_k.gguf"),
    ("Ffftdtd5dtft/gemma-2-9b-it-Q2_K-GGUF", "gemma-2-9b-it-q2_k.gguf"),
    ("Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "gemma-2-27b-q2_k.gguf"),
    ("Ffftdtd5dtft/Phi-3-mini-128k-instruct-Q2_K-GGUF", "phi-3-mini-128k-instruct-q2_k.gguf"),
    ("Ffftdtd5dtft/Meta-Llama-3.1-8B-Q2_K-GGUF", "meta-llama-3.1-8b-q2_k.gguf"),
    ("Ffftdtd5dtft/Qwen2-7B-Instruct-Q2_K-GGUF", "qwen2-7b-instruct-q2_k.gguf"),
    ("Ffftdtd5dtft/starcoder2-3b-Q2_K-GGUF", "starcoder2-3b-q2_k.gguf"),
    ("Ffftdtd5dtft/Qwen2-1.5B-Instruct-Q2_K-GGUF", "qwen2-1.5b-instruct-q2_k.gguf"),
)

# Each entry is a dict with the 'repo_id' and 'filename' keys that
# ModelManager.load_model reads.
model_configs = [
    {"repo_id": repo, "filename": gguf_file}
    for repo, gguf_file in _MODEL_SPECS
]

# Clase para gestionar modelos
class ModelManager:
    """Load the models described by the module-level ``model_configs`` list."""

    def __init__(self):
        # Models returned by the most recent load_all_models() call.
        self.models = []

    def load_model(self, model_config):
        """Load a single model.

        Args:
            model_config: Mapping with 'repo_id' and 'filename' keys.

        Returns:
            The object returned by ``Llama.from_pretrained`` for that config.
        """
        print(f"Cargando modelo {model_config['repo_id']}...")
        return Llama.from_pretrained(
            repo_id=model_config['repo_id'],
            filename=model_config['filename'],
        )

    def load_all_models(self):
        """Load every configured model concurrently, one worker per config.

        A model that fails to load is reported and skipped; it does not
        abort the remaining loads.

        Returns:
            List of successfully loaded models, in completion order (which
            may differ from the order of ``model_configs``). The same list
            is also stored on ``self.models``.
        """
        print("Iniciando carga de modelos...")
        models = []
        # ThreadPoolExecutor raises ValueError for max_workers=0, so only
        # start the pool when there is something to load.
        if model_configs:
            with ThreadPoolExecutor(max_workers=len(model_configs)) as executor:
                # Remember which config each future belongs to: as_completed
                # yields in completion order, so position in `models` says
                # nothing about which config produced a given result.
                future_to_config = {
                    executor.submit(self.load_model, config): config
                    for config in model_configs
                }
                for future in tqdm(
                    as_completed(future_to_config),
                    total=len(future_to_config),
                    desc="Cargando modelos",
                    unit="modelo",
                ):
                    config = future_to_config[future]
                    try:
                        model = future.result()
                    except Exception as e:
                        print(f"Error al cargar el modelo {config['repo_id']}: {e}")
                    else:
                        models.append(model)
                        print(f"Modelo cargado exitosamente: {config['repo_id']}")
        # Report the real outcome instead of claiming every model loaded.
        print(f"Carga finalizada: {len(models)} de {len(model_configs)} modelos cargados.")
        self.models = models
        return models

# Instantiate ModelManager and load the models.
# NOTE: these statements run at import time, so importing this module starts
# loading every entry in model_configs before anything below is defined.
model_manager = ModelManager()
global_data['models'] = model_manager.load_all_models()

# Modelo global para la solicitud de chat
class ChatRequest(BaseModel):
    """Request body accepted by the POST /generate_chat endpoint."""
    # User prompt; sent to each model as a single "user" chat message.
    message: str
    # Sampling parameters, forwarded unchanged to create_chat_completion.
    top_k: int = 50
    top_p: float = 0.95
    temperature: float = 0.7

# Función para generar respuestas de chat
def generate_chat_response(request, llm):
    """Ask one model for a chat completion of the request's message.

    Args:
        request: Object exposing ``message``, ``top_k``, ``top_p`` and
            ``temperature`` attributes.
        llm: Model object exposing ``create_chat_completion``.

    Returns:
        A dict with two keys: 'response' is the model's reply text, or
        ``"Error: <message>"`` if anything failed; 'literal' is the
        normalized user input, or None when the failure happened before
        the input could be normalized.
    """
    # Bind before the try block so the except handler can always reference
    # it, even when normalization itself is what raised.
    user_input = None
    try:
        user_input = normalize_input(request.message)
        response = llm.create_chat_completion(
            messages=[{"role": "user", "content": user_input}],
            top_k=request.top_k,
            top_p=request.top_p,
            temperature=request.temperature
        )
        reply = response['choices'][0]['message']['content']
        return {"response": reply, "literal": user_input}
    except Exception as e:
        # Best effort: report the failure as the response text rather than
        # raising, so one failing model does not abort the whole request.
        return {"response": f"Error: {str(e)}", "literal": user_input}

def normalize_input(input_text):
    """Return *input_text* with leading and trailing whitespace removed."""
    trimmed = input_text.strip()
    return trimmed

def remove_duplicates(text):
    """Clean repeated prompt echoes and duplicate lines out of *text*.

    Steps, in order: collapse back-to-back repeats of two known prompt
    echoes, delete every '[/INST]' marker, keep only the first occurrence
    of each line, and strip surrounding whitespace from the result.
    """
    # (pattern, replacement) pairs for the known repeated prompt echoes.
    collapse_rules = (
        (r'(Hello there, how are you\? \[/INST\]){2,}', 'Hello there, how are you? [/INST]'),
        (r'(How are you\? \[/INST\]){2,}', 'How are you? [/INST]'),
    )
    for pattern, replacement in collapse_rules:
        text = re.sub(pattern, replacement, text)

    # Drop the [/INST] marker wherever it remains.
    without_marker = text.replace('[/INST]', '')

    # Keep the first occurrence of each line, preserving order.
    already_seen = set()
    kept_lines = []
    for line in without_marker.split('\n'):
        if line in already_seen:
            continue
        already_seen.add(line)
        kept_lines.append(line)
    return '\n'.join(kept_lines).strip()

def remove_repetitive_responses(responses):
    """Normalize each response and drop repeats.

    Every response is passed through remove_duplicates; the result keeps
    the first occurrence of each normalized text, in input order.
    """
    cleaned = (remove_duplicates(item) for item in responses)
    # dict.fromkeys keeps first-seen order while discarding later repeats.
    return list(dict.fromkeys(cleaned))

def select_best_response(responses):
    """Pick one response out of the candidates produced by the models.

    The candidates are normalized and de-duplicated, ordered by
    filter_by_coherence, and the final choice is made by
    filter_by_similarity.

    Args:
        responses: List of response strings.

    Returns:
        The selected response string, or "" when *responses* is empty.
    """
    print("Filtrando respuestas...")
    # Nothing to choose from: return an empty result instead of letting the
    # selection step fail on an empty list.
    if not responses:
        return ""
    responses = remove_repetitive_responses(responses)
    responses = [remove_duplicates(response) for response in responses]
    # De-duplicate while preserving order. A set would scramble the order,
    # and since the later length sorts are stable, that would make ties
    # between equal-length responses resolve differently from run to run.
    unique_responses = list(dict.fromkeys(responses))
    coherent_responses = filter_by_coherence(unique_responses)
    best_response = filter_by_similarity(coherent_responses)
    return best_response

def filter_by_coherence(responses):
    """Sort *responses* in place, longest first, and return the same list.

    Length is the only criterion applied; responses of equal length keep
    their relative order.
    """
    print("Ordenando respuestas por coherencia...")
    # Ascending sort on negated length gives longest-first with stable ties.
    responses.sort(key=lambda text: -len(text))
    return responses

def filter_by_similarity(responses):
    """Choose a response based on length and similarity to the longest one.

    Candidates are ranked longest first. Walking down the ranking, the
    first candidate whose SequenceMatcher ratio against the longest
    response is below 0.9 is returned; if every other candidate is at
    least that similar, the longest response itself is returned.

    NOTE(review): this prefers a shorter, dissimilar response over the
    longest one whenever such a response exists. Confirm that is the
    intended selection rule.

    Args:
        responses: Iterable of response strings. It is not modified.

    Returns:
        The chosen response string, or "" when *responses* is empty.
    """
    print("Filtrando respuestas por similitud...")
    # Sort a copy so the caller's list is left untouched.
    ranked = sorted(responses, key=len, reverse=True)
    if not ranked:
        return ""
    longest = ranked[0]
    for candidate in ranked[1:]:
        if SequenceMatcher(None, longest, candidate).ratio() < 0.9:
            return candidate
    return longest

def worker_function(llm, request, progress_bar):
    """Generate one model's reply and advance the shared progress bar.

    Returns whatever generate_chat_response returns for this model.
    """
    print(f"Generando respuesta con el modelo {llm}...")
    result = generate_chat_response(request=request, llm=llm)
    # Count this model as done whether the reply is real or an error text.
    progress_bar.update(1)
    return result

@app.post("/generate_chat")
async def generate_chat(request: ChatRequest):
    """Query every loaded model with the message and return the chosen reply.

    Args:
        request: Chat request carrying the message and sampling parameters.

    Returns:
        A dict with 'best_response' (the reply chosen by
        select_best_response) and 'all_responses' (each model's raw reply,
        in completion order).

    Raises:
        HTTPException: 400 if the message is empty or whitespace only;
            503 if no models are loaded; 500 if no model produced a
            response.
    """
    if not request.message.strip():
        raise HTTPException(status_code=400, detail="The message cannot be empty.")

    models = global_data['models']
    # ThreadPoolExecutor rejects max_workers=0, so fail with a clear status
    # instead of an unhandled ValueError when nothing is loaded.
    if not models:
        raise HTTPException(status_code=503, detail="No models are loaded.")

    print(f"Procesando solicitud: {request.message}")

    responses = []
    num_models = len(models)

    # NOTE(review): waiting on the thread pool here is a blocking call inside
    # an async handler, so the event loop is held until all models finish.
    with tqdm(total=num_models, desc="Generando respuestas", unit="modelo") as progress_bar:
        with ThreadPoolExecutor(max_workers=num_models) as executor:
            futures = [executor.submit(worker_function, llm, request, progress_bar) for llm in models]
            for future in as_completed(futures):
                try:
                    response = future.result()
                    responses.append(response['response'])
                except Exception as exc:
                    print(f"Error en la generación de respuesta: {exc}")

    # Every worker failed: there is nothing to select from.
    if not responses:
        raise HTTPException(status_code=500, detail="No model produced a response.")

    best_response = select_best_response(responses)

    print(f"Mejor respuesta seleccionada: {best_response}")

    return {
        "best_response": best_response,
        "all_responses": responses
    }

# When executed as a script, serve the app with uvicorn on all network
# interfaces (0.0.0.0), port 7860.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)