Aleph-Alpha
/

Pharia-1-LLM-7B-control-hf

@@ -764,9 +764,28 @@ class PhariaForCausalLM(PhariaPreTrainedModel):
         hidden_states = outputs[0]
         logits = self.lm_head(hidden_states)
         return CausalLMOutputWithPast(
-            loss=0.0,
             logits=logits,
             past_key_values=outputs.past_key_values,
             hidden_states=outputs.hidden_states,

         hidden_states = outputs[0]
         logits = self.lm_head(hidden_states)
+        # Placeholder returned when the model is not in training mode; a real
+        # loss is only computed in the training branch below.
+        loss = 0.0
+        if self.training:
+            # Training without targets cannot produce a loss, so fail loudly.
+            if labels is None:
+                raise ValueError(
+                    "You have to specify the `labels` tensor when training the model."
+                )
+            # Causal language modeling: the logits at position t are scored
+            # against the token at position t + 1, so drop the last logit and
+            # the first label to align the two sequences.
+            shift_logits = logits[..., :-1, :].contiguous()
+            # Targets come from the `labels` variable checked above. `outputs`
+            # is the object the hidden states and cache are read from, so it
+            # must not be indexed with 'labels'.
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten to 2-D logits / 1-D targets, the layout CrossEntropyLoss takes.
+            shift_logits = shift_logits.view(-1, shift_logits.size(-1))
+            shift_labels = shift_labels.view(-1)
+            # Positions whose target equals 1 are excluded from the loss.
+            loss_fct = torch.nn.CrossEntropyLoss(ignore_index=1)  # Pad token ID for Pharia is 1
+            loss = loss_fct(shift_logits, shift_labels)
         return CausalLMOutputWithPast(
+            loss=loss,
             logits=logits,
             past_key_values=outputs.past_key_values,
             hidden_states=outputs.hidden_states,