import logging

import torch
import torch.nn as nn

from src.bert import BERT

logger = logging.getLogger(__name__)


class CustomBERTModel(nn.Module):
    """Pre-trained BERT encoder followed by a single linear output layer.

    The encoder weights are loaded from a checkpoint at construction time;
    the linear head (``self.fc``) is freshly initialised.
    """

    def __init__(
        self,
        vocab_size: int,
        output_dim: int,
        pre_trained_model_path,
        *,
        hidden_size: int = 768,
        n_layers: int = 4,
        attn_heads: int = 8,
        dropout: float = 0.1,
    ) -> None:
        """Build the encoder, load its pre-trained weights and add the head.

        Args:
            vocab_size: Vocabulary size forwarded to ``BERT``.
            output_dim: Number of output features of the linear head.
            pre_trained_model_path: Location of the saved ``state_dict``,
                passed unchanged to ``torch.load``.
            hidden_size: Encoder hidden width; also the input width of the
                linear head. Must match the checkpoint.
            n_layers: Number of encoder layers forwarded to ``BERT``.
            attn_heads: Number of attention heads forwarded to ``BERT``.
            dropout: Dropout probability forwarded to ``BERT``.

        Raises:
            TypeError: If the loaded checkpoint is not a ``dict``.
        """
        super().__init__()

        self.bert = BERT(
            vocab_size=vocab_size,
            hidden=hidden_size,
            n_layers=n_layers,
            attn_heads=attn_heads,
            dropout=dropout,
        )

        # Load the pre-trained model's state_dict onto the CPU first so the
        # checkpoint can be read on machines without the original device.
        # NOTE(security): torch.load unpickles the file, which can execute
        # arbitrary code. Only load checkpoints from trusted sources; consider
        # passing weights_only=True on PyTorch versions that support it.
        checkpoint = torch.load(
            pre_trained_model_path, map_location=torch.device('cpu')
        )
        if not isinstance(checkpoint, dict):
            raise TypeError(
                f"Expected state_dict, got {type(checkpoint)} instead."
            )
        self.bert.load_state_dict(checkpoint)

        # Fully connected layer whose input size matches the BERT hidden size.
        self.fc = nn.Linear(hidden_size, output_dim)

    def forward(
        self, sequence: torch.Tensor, segment_info: torch.Tensor
    ) -> torch.Tensor:
        """Encode the inputs and return the output of the linear head.

        Both inputs are moved to the device of the model's parameters before
        the encoder is called.

        Args:
            sequence: Input tensor passed as the first argument to ``BERT``.
            segment_info: Input tensor passed as the second argument to
                ``BERT``.

        Returns:
            The result of applying ``self.fc`` to ``x[:, 0]`` of the encoder
            output.
        """
        device = next(self.parameters()).device
        sequence = sequence.to(device)
        segment_info = segment_info.to(sequence.device)

        x = self.bert(sequence, segment_info)
        # Lazy %-formatting: nothing is formatted unless DEBUG is enabled.
        logger.debug("BERT output shape: %s", x.shape)

        # Extract CLS token embeddings.
        # NOTE(review): assumes the encoder returns (batch, seq_len, hidden)
        # with the CLS token at position 0 -- confirm against src.bert.BERT.
        cls_embeddings = x[:, 0]
        logger.debug("CLS Embeddings shape: %s", cls_embeddings.shape)

        logits = self.fc(cls_embeddings)
        return logits