runningSnail committed on
Commit abd40c7
1 Parent(s): a33ba53

support model registration

Files changed (5)
  1. README.md +18 -7
  2. __init__.py +0 -0
  3. inference_example.py +65 -0
  4. inference_example2.py +62 -0
  5. modeling_dolphin.py +1 -1
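The commit title refers to the registration pattern that each example below performs before calling `from_pretrained`: the custom `dolphin` model type is registered with the transformers Auto* factories so that `AutoModelForCausalLM` resolves it to the locally defined classes. A minimal sketch of that pattern (import paths assume the local-clone layout used in inference_example.py):

```python
from transformers import AutoConfig, AutoModelForCausalLM

# Local-clone layout; the pip package exposes the same classes as
# dolphin.configuration_dolphin / dolphin.modeling_dolphin instead.
from configuration_dolphin import DolphinConfig
from modeling_dolphin import DolphinForCausalLM

# Map the "dolphin" model_type string to the custom config class, then map
# that config class to the custom model class, so the Auto* factories can
# build the model from the hub checkpoint.
AutoConfig.register("dolphin", DolphinConfig)
AutoModelForCausalLM.register(DolphinConfig, DolphinForCausalLM)
```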
README.md CHANGED
@@ -42,11 +42,24 @@ Dolphin employs a decoder-decoder framework with two main components:
  ![Model Architecture](modelstructure.jpg)
 
  ## Running the Model
+ Method 1: download this repository and run the following commands:
+ ```bash
+ git lfs install
+ git clone https://huggingface.co/NexaAIDev/Dolphin
+ python inference_example.py
+ ```
 
+ Method 2: install the `dolphin` package
+ ```
+ pip install nexaai-dolphin
+ ```
+ Then run the following commands:
  ```python
- from transformers import AutoTokenizer
- from configuration_dolphin import DolphinForCausalLM
- import time
+ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+ import torch
+ from dolphin.configuration_dolphin import DolphinConfig
+ from dolphin.modeling_dolphin import DolphinForCausalLM
+
 
  def inference_instruct(mycontext, question, device="cuda:0"):
      import time
@@ -90,13 +103,11 @@ def inference_instruct(mycontext, question, device="cuda:0"):
 
 
  if __name__ == "__main__":
-     # Register your configuration and model
+     device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
      AutoConfig.register("dolphin", DolphinConfig)
      AutoModelForCausalLM.register(DolphinConfig, DolphinForCausalLM)
-     device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
-
      # Load the tokenizer and model
-     tokenizer = AutoTokenizer.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True)
+     tokenizer = AutoTokenizer.from_pretrained('NexaAIDev/Dolphin')
      model = AutoModelForCausalLM.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True, torch_dtype=torch.bfloat16, device_map=device_name)
 
      # Run inference example
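The two README methods imply different import paths for the same modules: Method 1 runs inside the cloned repository, where `configuration_dolphin.py` and `modeling_dolphin.py` are importable as top-level files (what inference_example.py below does), while Method 2 imports them from the installed `nexaai-dolphin` package under the `dolphin` namespace (as in inference_example2.py). A small sketch that tolerates either layout; this is an editor's illustration, not part of the commit:

```python
# Prefer the installed package (Method 2); fall back to the local clone (Method 1),
# which requires running from, or adding to sys.path, the cloned repository.
try:
    from dolphin.configuration_dolphin import DolphinConfig
    from dolphin.modeling_dolphin import DolphinForCausalLM
except ImportError:
    from configuration_dolphin import DolphinConfig
    from modeling_dolphin import DolphinForCausalLM
```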
__init__.py ADDED
File without changes
inference_example.py ADDED
@@ -0,0 +1,65 @@
+ import sys
+ import os
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+ from configuration_dolphin import DolphinConfig
+ from modeling_dolphin import DolphinForCausalLM
+ from transformers import (AutoTokenizer, AutoModelForCausalLM, AutoConfig)
+ import torch
+
+ def inference_instruct(mycontext, question, device="cuda:0"):
+     import time
+     MEMORY_SIZE = 32
+     start_time = time.time()
+     generated_token_ids = []
+     prompt = f" <context>{question}"
+     text_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<context>")]
+     input_ids = (
+         torch.tensor(
+             text_chunks[0] + [-1] * MEMORY_SIZE + text_chunks[1], dtype=torch.long
+         )
+         .unsqueeze(0)
+         .to(device)
+     )
+     # Process the context
+     context_tokenized = tokenizer(
+         mycontext + "".join([f"[memory_{i}]" for i in range(MEMORY_SIZE)]),
+         return_tensors="pt",
+     )
+     context_tokenized = {k: v.to(device) for k, v in context_tokenized.items()}
+     context_token_count = (context_tokenized["input_ids"]).shape[1] - MEMORY_SIZE
+     # We conduct an inference process
+     for i in range(context_token_count):
+         next_token = (
+             model(
+                 input_ids,
+                 context_input_ids=context_tokenized["input_ids"],
+                 context_attention_mask=context_tokenized["attention_mask"],
+             )
+             .logits[:, -1]
+             .argmax(-1)
+         )
+         if next_token.item() == 151643:
+             break
+         generated_token_ids.append(next_token.item())
+         input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=-1)
+     result = tokenizer.decode(generated_token_ids)
+     print(f"Time taken: {time.time() - start_time}")
+     return result
+
+
+ if __name__ == "__main__":
+     # Register your configuration and model
+     AutoConfig.register("dolphin", DolphinConfig)
+     AutoModelForCausalLM.register(DolphinConfig, DolphinForCausalLM)
+     device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
+
+     # Load the tokenizer and model
+     tokenizer = AutoTokenizer.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True)
+     model = AutoModelForCausalLM.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True, torch_dtype=torch.bfloat16, device_map=device_name)
+
+     # Run inference example
+     mycontext = "Nexa AI is a Cupertino-based company founded in May 2023 that researches and develops models and tools for on-device AI applications. The company is founded by Alex and Zack. The company is known for its Octopus-series models, which rival large-scale language models in capabilities such as function-calling, multimodality, and action-planning, while remaining efficient and compact for edge device deployment. Nexa AI's mission is to advance on-device AI in collaboration with the global developer community. To this end, the company has created an on-device model hub for users to find, share, and collaborate on open-source AI models optimized for edge devices, as well as an SDK for developers to run and deploy AI models locally"
+     question = "Who founded Nexa AI?"
+     # Pass the context and the correct device string
+     result = inference_instruct(mycontext, question, device=device_name)
+     print("Result:", result)
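For readers of the script above: the user prompt is split on the literal `<context>` marker, and `MEMORY_SIZE` placeholder ids (`-1`) are spliced between the two halves, while the context string is tokenized with `[memory_0]`…`[memory_31]` tokens appended; the decoder apparently fills those placeholder positions from the encoded context. A self-contained illustration of just the prompt assembly, using stand-in token ids instead of the real tokenizer:

```python
import torch

MEMORY_SIZE = 32  # same memory slot count as inference_example.py

def build_query_ids(prefix_ids, question_ids, memory_size=MEMORY_SIZE):
    """Splice `memory_size` placeholder ids (-1) between the prompt halves,
    mirroring how inference_example.py builds `input_ids` around "<context>"."""
    return torch.tensor(
        prefix_ids + [-1] * memory_size + question_ids, dtype=torch.long
    ).unsqueeze(0)

# Stand-in ids; the real script gets these from tokenizer(" ") and tokenizer(question).
query = build_query_ids([101], [7, 8, 9, 10])
print(query.shape)  # torch.Size([1, 37]): 5 prompt tokens + 32 memory slots
```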
inference_example2.py ADDED
@@ -0,0 +1,62 @@
+ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+ import torch
+ # !pip install nexaai-dolphin
+ from dolphin.configuration_dolphin import DolphinConfig
+ from dolphin.modeling_dolphin import DolphinForCausalLM
+
+
+ def inference_instruct(mycontext, question, device="cuda:0"):
+     import time
+     MEMORY_SIZE = 32
+     start_time = time.time()
+     generated_token_ids = []
+     prompt = f" <context>{question}"
+     text_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<context>")]
+     input_ids = (
+         torch.tensor(
+             text_chunks[0] + [-1] * MEMORY_SIZE + text_chunks[1], dtype=torch.long
+         )
+         .unsqueeze(0)
+         .to(device)
+     )
+     # Process the context
+     context_tokenized = tokenizer(
+         mycontext + "".join([f"[memory_{i}]" for i in range(MEMORY_SIZE)]),
+         return_tensors="pt",
+     )
+     context_tokenized = {k: v.to(device) for k, v in context_tokenized.items()}
+     context_token_count = (context_tokenized["input_ids"]).shape[1] - MEMORY_SIZE
+     # We conduct an inference process
+     for i in range(context_token_count):
+         next_token = (
+             model(
+                 input_ids,
+                 context_input_ids=context_tokenized["input_ids"],
+                 context_attention_mask=context_tokenized["attention_mask"],
+             )
+             .logits[:, -1]
+             .argmax(-1)
+         )
+         if next_token.item() == 151643:
+             break
+         generated_token_ids.append(next_token.item())
+         input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=-1)
+     result = tokenizer.decode(generated_token_ids)
+     print(f"Time taken: {time.time() - start_time}")
+     return result
+
+
+ if __name__ == "__main__":
+     device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
+     AutoConfig.register("dolphin", DolphinConfig)
+     AutoModelForCausalLM.register(DolphinConfig, DolphinForCausalLM)
+     # Load the tokenizer and model
+     tokenizer = AutoTokenizer.from_pretrained('NexaAIDev/Dolphin')
+     model = AutoModelForCausalLM.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True, torch_dtype=torch.bfloat16, device_map=device_name)
+
+     # Run inference example
+     mycontext = "Nexa AI is a Cupertino-based company founded in May 2023 that researches and develops models and tools for on-device AI applications. The company is founded by Alex and Zack. The company is known for its Octopus-series models, which rival large-scale language models in capabilities such as function-calling, multimodality, and action-planning, while remaining efficient and compact for edge device deployment. Nexa AI's mission is to advance on-device AI in collaboration with the global developer community. To this end, the company has created an on-device model hub for users to find, share, and collaborate on open-source AI models optimized for edge devices, as well as an SDK for developers to run and deploy AI models locally"
+     question = "Who founded Nexa AI?"
+     # Pass the context and the correct device string
+     result = inference_instruct(mycontext, question, device=device_name)
+     print("Result:", result)
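Both example scripts stop decoding on the hard-coded token id 151643, which for Qwen-family tokenizers is typically the `<|endoftext|>` id. A hedged alternative is to read the stop id from the tokenizer itself, assuming the Dolphin tokenizer sets a standard `eos_token_id` (worth verifying against 151643 before relying on it):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("NexaAIDev/Dolphin")

# Use the tokenizer's own EOS id instead of the literal 151643 in the loop above.
# This assumes the checkpoint's special tokens define eos_token_id as expected.
stop_id = tokenizer.eos_token_id
print("stop id:", stop_id)
```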
modeling_dolphin.py CHANGED
@@ -22,7 +22,7 @@ from typing import List, Optional, Tuple, Union
  import warnings
  from dataclasses import dataclass
  from torch.nn import CrossEntropyLoss
- from .configuration_dolphin import encoder_config_dict, DolphinConfig
+ from configuration_dolphin import encoder_config_dict, DolphinConfig
 
  CONTEXT_EMB = 896 # Qwen 0.7B has dimension of 896
  HIDDEN_EMB = 3584 # Qwen 7B has dimension of 3584
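The switch from a relative to an absolute import lets modeling_dolphin.py be imported as a stand-alone top-level module, but it then relies on `configuration_dolphin` being resolvable on `sys.path`; inference_example.py handles this by appending the script's own directory. A sketch of the same bootstrap when importing from outside the repository (the clone path below is hypothetical):

```python
import sys

# Hypothetical path to a local clone of https://huggingface.co/NexaAIDev/Dolphin
repo_dir = "/path/to/Dolphin"
sys.path.append(repo_dir)

# With the repo on sys.path, the absolute import inside modeling_dolphin.py
# (`from configuration_dolphin import ...`) resolves correctly.
from modeling_dolphin import DolphinForCausalLM  # noqa: E402
```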