CUDA out of memory

```python
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import asyncio
from concurrent.futures import ThreadPoolExecutor
import os
import gc
import logging

# Clear the GPU memory cache at start-up
torch.cuda.empty_cache()

# Set environment variable to manage memory fragmentation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'
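# Note: max_split_size_mb only changes how the caching allocator splits its
# blocks; it helps with fragmentation-related OOMs but does not reduce the
# model's actual memory footprint.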
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class SimilarityMatching:
    def __init__(self, statement_one, statement_two):
        self.statement_one = statement_one
        self.statement_two = statement_two
        model_name = "/mnt/improvehealth/Meta-Llama-3.1-8B-Instruct"
        if os.getenv('ENV') == 'local':
            model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
        # Load the Meta-Llama model and tokenizer once for efficiency
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to('cuda' if torch.cuda.is_available() else 'cpu')
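        # from_pretrained loads weights in float32 unless torch_dtype is set,
        # so the 8B-parameter model needs roughly 32 GB on the GPU at this
        # point, before .half() below; that alone can OOM a 22 GB card.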
        self.model.eval()
        if torch.cuda.is_available():
            self.model.half()
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.executor = ThreadPoolExecutor()

    def preprocess_text(self, text):
        return text.strip().lower() if text else ""

    def get_embedding(self, text):
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(self.model.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Mean-pool the final hidden states into a single sentence embedding
        sentence_embedding = outputs.last_hidden_state.mean(dim=1)
        return sentence_embedding.cpu().numpy()
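    # torch.no_grad() plus truncation to 512 tokens keeps per-call activation
    # memory bounded; the half-precision weights dominate GPU usage here.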

    async def get_embedding_async(self, text):
        loop = asyncio.get_event_loop()
        return await loop.run_in_executor(self.executor, self.get_embedding, text)

    def compute_similarity(self, embedding1, embedding2):
        return cosine_similarity(embedding1, embedding2)[0][0]

    async def are_statements_similar(self, threshold=0.85):
        self.clear_memory()
        preprocessed_statement1 = self.preprocess_text(self.statement_one)
        preprocessed_statement2 = self.preprocess_text(self.statement_two)
        if not preprocessed_statement1 or not preprocessed_statement2:
            return {"status": "OUTREACH"}
        embedding1, embedding2 = await asyncio.gather(
            self.get_embedding_async(preprocessed_statement1),
            self.get_embedding_async(preprocessed_statement2)
        )
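        # asyncio.gather runs both forward passes concurrently on the thread
        # pool, so their activation memory peaks at the same time; creating
        # the executor with max_workers=1 would serialize them if needed.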
        similarity_score = self.compute_similarity(embedding1, embedding2)
        logger.info(f"similarity score: {similarity_score}")
        if similarity_score >= threshold:
            return {"status": "NOT_ELIGIBLE"}
        else:
            return {"status": "ELIGIBLE"}

    def clear_memory(self):
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()

    def __del__(self):
        self.executor.shutdown(wait=True)
```
I am using the Meta-Llama-3.1-8B-Instruct model and have already set PYTORCH_CUDA_ALLOC_CONF to manage memory fragmentation, but I still hit CUDA out of memory. What more do I need to do to control GPU usage?

GPU memory: 22 GB
Instance: g5.xlarge (NVIDIA A10G GPU)
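
Would loading the model directly in half precision avoid that float32 spike? A minimal sketch of what I have in mind, assuming a transformers version recent enough to accept torch_dtype and device_map (device_map needs the accelerate package installed):

```python
import torch
from transformers import AutoModel, AutoTokenizer

model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load the weights as float16 from the start instead of float32 then .half():
# peak usage is roughly 16 GB instead of 32 GB, which fits a 22 GB GPU.
model = AutoModel.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # skip the float32 intermediate copy
    device_map="auto",          # place weights straight onto the GPU
)
model.eval()
```

If 16 GB of weights still leaves too little headroom, 8-bit or 4-bit quantization through bitsandbytes (passing quantization_config=BitsAndBytesConfig(load_in_8bit=True) or load_in_4bit=True to from_pretrained) should cut the footprint further, though I have not checked how much it affects embedding quality for this similarity task.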