CUDA out of memory

```python
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import asyncio
from concurrent.futures import ThreadPoolExecutor
import os
import gc
import logging

# Clear the GPU memory cache at start-up
torch.cuda.empty_cache()

# Set environment variable to manage memory fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'
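# Note: max_split_size_mb only changes how the caching allocator splits its
# blocks; it helps with fragmentation-related OOMs but does not reduce the
# model's actual memory footprint.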
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class SimilarityMatching:
    def __init__(self, statement_one, statement_two):
        self.statement_one = statement_one
        self.statement_two = statement_two
        model_name = "/mnt/improvehealth/Meta-Llama-3.1-8B-Instruct"
        if os.getenv('ENV') == 'local':
            model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
        # Load the Meta-Llama model and tokenizer once for efficiency
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to('cuda' if torch.cuda.is_available() else 'cpu')
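        # from_pretrained loads weights in float32 unless torch_dtype is set,
        # so the 8B-parameter model needs roughly 32 GB on the GPU at this
        # point, before .half() below; that alone can OOM a 22 GB card.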
        self.model.eval()
        if torch.cuda.is_available():
            self.model.half()
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.executor = ThreadPoolExecutor()

    def preprocess_text(self, text):
        return text.strip().lower() if text else ""

    def get_embedding(self, text):
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(self.model.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Mean-pool the final hidden states into a single sentence embedding
        sentence_embedding = outputs.last_hidden_state.mean(dim=1)
        return sentence_embedding.cpu().numpy()
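    # torch.no_grad() plus truncation to 512 tokens keeps per-call activation
    # memory bounded; the half-precision weights dominate GPU usage here.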

    async def get_embedding_async(self, text):
        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(self.executor, self.get_embedding, text)

    def compute_similarity(self, embedding1, embedding2):
        return cosine_similarity(embedding1, embedding2)[0][0]

    async def are_statements_similar(self, threshold=0.85):
        self.clear_memory()
        preprocessed_statement1 = self.preprocess_text(self.statement_one)
        preprocessed_statement2 = self.preprocess_text(self.statement_two)
        if not preprocessed_statement1 or not preprocessed_statement2:
            return {"status": "OUTREACH"}
        embedding1, embedding2 = await asyncio.gather(
            self.get_embedding_async(preprocessed_statement1),
            self.get_embedding_async(preprocessed_statement2)
        )
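        # asyncio.gather runs both forward passes concurrently on the thread
        # pool, so their activation memory peaks at the same time; creating
        # the executor with max_workers=1 would serialize them if needed.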
        similarity_score = self.compute_similarity(embedding1, embedding2)
        logger.info(f"similarity score: {similarity_score}")
        if similarity_score >= threshold:
            return {"status": "NOT_ELIGIBLE"}
        else:
            return {"status": "ELIGIBLE"}

    def clear_memory(self):
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()

    def __del__(self):
        self.executor.shutdown(wait=True)
```
I am using the Meta-Llama-3.1-8B-Instruct model and have already set PYTORCH_CUDA_ALLOC_CONF to manage memory fragmentation, but I still hit CUDA out of memory. What more do I need to do to control GPU usage?

GPU memory: 22 GB
Instance: g5.xlarge (NVIDIA A10G GPU)
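
Would loading the model directly in half precision avoid that float32 spike? A minimal sketch of what I have in mind, assuming a transformers version recent enough to accept torch_dtype and device_map (device_map needs the accelerate package installed):

```python
import torch
from transformers import AutoModel, AutoTokenizer

model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load the weights as float16 from the start instead of float32 then .half():
# peak usage is roughly 16 GB instead of 32 GB, which fits a 22 GB GPU.
model = AutoModel.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # skip the float32 intermediate copy
    device_map="auto",          # place weights straight onto the GPU
)
model.eval()
```

If 16 GB of weights still leaves too little headroom, 8-bit or 4-bit quantization through bitsandbytes (passing quantization_config=BitsAndBytesConfig(load_in_8bit=True) or load_in_4bit=True to from_pretrained) should cut the footprint further, though I have not checked how much it affects embedding quality for this similarity task.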