import json
from copy import deepcopy
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Literal, Optional, Type, Union

import torch
from typing_extensions import Self

import tsai_gpt.model
from tsai_gpt.utils import find_multiple


@dataclass
class Config:
    name: str = ""
    hf_config: dict = field(default_factory=dict)
    block_size: int = 4096
    vocab_size: int = 50254
    padding_multiple: int = 512
    padded_vocab_size: Optional[int] = None
    n_layer: int = 16
    n_head: int = 32
    n_embd: int = 4096
    rotary_percentage: float = 0.25
    parallel_residual: bool = True
    bias: bool = True
    lm_head_bias: bool = False
    # `n_query_groups` controls multi-query / grouped-query attention:
    # `n_query_groups == n_head` is standard multi-head attention, `1` is
    # multi-query attention, and anything in between is grouped-query attention
    # (it must divide `n_head`, see `__post_init__`)
    n_query_groups: Optional[int] = None
    shared_attention_norm: bool = False
    _norm_class: Literal["LayerNorm", "RMSNorm"] = "LayerNorm"
    norm_eps: float = 1e-5
    _mlp_class: Literal["GptNeoxMLP", "LLaMAMLP"] = "GptNeoxMLP"
    gelu_approximate: str = "none"
    intermediate_size: Optional[int] = None
    rope_condense_ratio: int = 1
    rope_base: int = 10000

    def __post_init__(self):
        if not self.name:
            self.name = self.hf_config.get("name", self.name)

        assert self.n_embd % self.n_head == 0
        self.head_size = self.n_embd // self.n_head

        # pad the vocabulary to a multiple of `padding_multiple` for efficiency
        if self.padded_vocab_size is None:
            self.padded_vocab_size = find_multiple(self.vocab_size, self.padding_multiple)
        else:
            # the vocab size cannot exceed the padded vocab size
            self.vocab_size = min(self.vocab_size, self.padded_vocab_size)

        # compute the number of query groups for multi-query / grouped-query attention
        if self.n_query_groups is not None:
            assert self.n_head % self.n_query_groups == 0
        else:
            self.n_query_groups = self.n_head

        # there is no sensible default value for LLaMA-style MLPs
        if self.intermediate_size is None:
            if self._mlp_class == "LLaMAMLP":
                raise ValueError("The config needs to set the `intermediate_size`")
            self.intermediate_size = 4 * self.n_embd

        self.rope_n_elem = int(self.rotary_percentage * self.head_size)

    @classmethod
    def from_name(cls, name: str, **kwargs: Any) -> Self:
        if name not in name_to_config:
            # fall back to searching the Hugging Face names registered below
            conf_dict = next(config for config in configs if name == config["hf_config"]["name"])
        else:
            conf_dict = name_to_config[name]

        conf_dict = conf_dict.copy()
        if "condense_ratio" in kwargs:
            # legacy alias for `rope_condense_ratio`
            kwargs["rope_condense_ratio"] = kwargs.pop("condense_ratio")
        conf_dict.update(kwargs)
        return cls(**conf_dict)

    @classmethod
    def from_json(cls, path: Union[str, Path], **kwargs: Any) -> Self:
        with open(path, encoding="utf-8") as fp:
            json_kwargs = json.load(fp)
        # remap legacy keys onto the current field names
        if "condense_ratio" in json_kwargs:
            json_kwargs["rope_condense_ratio"] = json_kwargs.pop("condense_ratio")
        if "condense_ratio" in kwargs:
            kwargs["rope_condense_ratio"] = kwargs.pop("condense_ratio")
        if "org" in json_kwargs:
            json_kwargs["hf_config"] = {"name": json_kwargs["name"], "org": json_kwargs.pop("org")}
        if "org" in kwargs:
            kwargs["hf_config"] = {"name": kwargs.get("name", json_kwargs["name"]), "org": kwargs.pop("org")}
        json_kwargs.update(kwargs)
        return cls(**json_kwargs)
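
    # A hypothetical example of a file accepted by `from_json` (a sketch, not a
    # file shipped with this repo); the legacy `org` and `condense_ratio` keys
    # are remapped by the method above:
    #
    #   {"name": "pythia-70m", "org": "EleutherAI", "condense_ratio": 4}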

    @property
    def mlp_class(self) -> Type:
        # `_mlp_class` is stored as a string (not the class itself) so the
        # config stays JSON-serializable; resolve it lazily here
        return getattr(tsai_gpt.model, self._mlp_class)

    @property
    def norm_class(self) -> Type:
        # `_norm_class` is stored as a string for the same reason
        if self._norm_class == "RMSNorm":
            from tsai_gpt.rmsnorm import RMSNorm

            return RMSNorm
        return getattr(torch.nn, self._norm_class)
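

# A minimal usage sketch (illustrative, not part of the original module):
# presets are looked up by name and keyword arguments override preset values,
# while `norm_class`/`mlp_class` resolve the string fields to actual classes.
#
#   config = Config.from_name("pythia-70m", block_size=1024)
#   norm = config.norm_class(config.n_embd, eps=config.norm_eps)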


# Stability AI StableLM
configs = [
    dict(name="stablelm-base-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-base-alpha-3b")),
    dict(
        name="stablelm-base-alpha-7b",
        hf_config=dict(org="stabilityai", name="stablelm-base-alpha-7b"),
        n_head=48,
        n_embd=6144,
        padding_multiple=256,
    ),
    dict(name="stablelm-tuned-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-3b"), n_head=32),
    dict(
        name="stablelm-tuned-alpha-7b",
        hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-7b"),
        n_head=48,
        n_embd=6144,
        padding_multiple=256,
    ),
]

# EleutherAI Pythia
pythia = [
    dict(
        name="pythia-70m",
        hf_config=dict(org="EleutherAI", name="pythia-70m"),
        block_size=2048,
        n_layer=6,
        n_embd=512,
        n_head=8,
        padding_multiple=128,
    ),
    dict(
        name="pythia-160m",
        hf_config=dict(org="EleutherAI", name="pythia-160m"),
        block_size=2048,
        n_layer=12,
        n_embd=768,
        n_head=12,
        padding_multiple=128,
    ),
    dict(
        name="pythia-410m",
        hf_config=dict(org="EleutherAI", name="pythia-410m"),
        block_size=2048,
        n_layer=24,
        n_embd=1024,
        n_head=16,
        padding_multiple=128,
    ),
    dict(
        name="pythia-1b",
        hf_config=dict(org="EleutherAI", name="pythia-1b"),
        block_size=2048,
        n_embd=2048,
        n_head=8,
        padding_multiple=128,
    ),
    dict(
        name="pythia-1.4b",
        hf_config=dict(org="EleutherAI", name="pythia-1.4b"),
        block_size=2048,
        n_layer=24,
        n_embd=2048,
        n_head=16,
        padding_multiple=128,
    ),
    dict(
        name="pythia-2.8b",
        hf_config=dict(org="EleutherAI", name="pythia-2.8b"),
        block_size=2048,
        n_layer=32,
        n_embd=2560,
        padding_multiple=128,
    ),
    dict(
        name="pythia-6.9b",
        hf_config=dict(org="EleutherAI", name="pythia-6.9b"),
        block_size=2048,
        n_layer=32,
        padding_multiple=256,
    ),
    dict(
        name="pythia-12b",
        hf_config=dict(org="EleutherAI", name="pythia-12b"),
        block_size=2048,
        n_layer=36,
        n_embd=5120,
        n_head=40,
    ),
]
configs.extend(pythia)

# The deduped checkpoints share the base architecture; only the names differ.
# `deepcopy` (rather than a shallow `dict.copy()`) is required here: the nested
# `hf_config` dict would otherwise be shared and mutated in place, corrupting
# the original entries. The template loops below need it for the same reason.
for c in pythia:
    copy = deepcopy(c)
    copy["name"] = f"{c['name']}-deduped"
    copy["hf_config"]["name"] = f"{c['hf_config']['name']}-deduped"
    configs.append(copy)


# togethercomputer RedPajama-INCITE
redpajama_incite = [
    dict(
        name="RedPajama-INCITE-{}-3B-v1",
        hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-3B-v1"),
        block_size=2048,
        n_layer=32,
        n_embd=2560,
        padding_multiple=256,
        rotary_percentage=1.0,
        parallel_residual=False,
    ),
    dict(
        name="RedPajama-INCITE-7B-{}",
        hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-7B-{}"),
        block_size=2048,
        n_layer=32,
        padding_multiple=256,
        rotary_percentage=1.0,
        parallel_residual=False,
    ),
    dict(
        name="RedPajama-INCITE-{}-7B-v0.1",
        hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-7B-v0.1"),
        block_size=2048,
        n_layer=32,
        padding_multiple=256,
        rotary_percentage=1.0,
        parallel_residual=False,
    ),
]
for c in redpajama_incite:
    for kind in ("Base", "Chat", "Instruct"):
        copy = deepcopy(c)
        copy["name"] = c["name"].format(kind)
        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
        configs.append(copy)


# TII UAE Falcon
falcon = [
    dict(
        name="falcon-7b{}",
        hf_config=dict(org="tiiuae", name="falcon-7b{}"),
        block_size=2048,
        vocab_size=65024,
        padded_vocab_size=65024,
        n_layer=32,
        n_head=71,
        n_embd=4544,
        rotary_percentage=1.0,
        n_query_groups=1,
        bias=False,
        shared_attention_norm=True,
    ),
    dict(
        name="falcon-40b{}",
        hf_config=dict(org="tiiuae", name="falcon-40b{}"),
        block_size=2048,
        vocab_size=65024,
        padded_vocab_size=65024,
        n_layer=60,
        n_head=128,
        n_embd=8192,
        rotary_percentage=1.0,
        n_query_groups=8,
        bias=False,
    ),
]
for c in falcon:
    for kind in ("", "-instruct"):
        copy = deepcopy(c)
        copy["name"] = c["name"].format(kind)
        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
        configs.append(copy)

falcon180b = dict(
    name="falcon-180B{}",
    hf_config=dict(org="tiiuae", name="falcon-180B{}"),
    block_size=2048,
    vocab_size=65024,
    padded_vocab_size=65024,
    n_layer=80,
    n_head=232,
    n_embd=14848,
    rotary_percentage=1.0,
    n_query_groups=8,
    bias=False,
)

for kind in ("", "-chat"):
    copy = deepcopy(falcon180b)
    copy["name"] = falcon180b["name"].format(kind)
    copy["hf_config"]["name"] = falcon180b["hf_config"]["name"].format(kind)
    configs.append(copy)


# OpenLM Research Open LLaMA
open_LLaMA = [
    dict(
        name="open_llama_3b",
        hf_config=dict(org="openlm-research", name="open_llama_3b"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=26,
        n_embd=3200,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=8640,
    ),
    dict(
        name="open_llama_7b",
        hf_config=dict(org="openlm-research", name="open_llama_7b"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
    ),
    dict(
        name="open_llama_13b",
        hf_config=dict(org="openlm-research", name="open_llama_13b"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
]
configs.extend(open_LLaMA)


# LMSYS Vicuna
vicuna = [
    dict(
        name="vicuna-7b-v1.3",
        hf_config=dict(org="lmsys", name="vicuna-7b-v1.3"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
    ),
    dict(
        name="vicuna-13b-v1.3",
        hf_config=dict(org="lmsys", name="vicuna-13b-v1.3"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    dict(
        name="vicuna-33b-v1.3",
        hf_config=dict(org="lmsys", name="vicuna-33b-v1.3"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=60,
        n_head=52,
        n_embd=6656,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=17920,
    ),
    dict(
        name="vicuna-7b-v1.5",
        hf_config=dict(org="lmsys", name="vicuna-7b-v1.5"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
    ),
    dict(
        name="vicuna-7b-v1.5-16k",
        hf_config=dict(org="lmsys", name="vicuna-7b-v1.5-16k"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
        rope_condense_ratio=4,
    ),
    dict(
        name="vicuna-13b-v1.5",
        hf_config=dict(org="lmsys", name="vicuna-13b-v1.5"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    dict(
        name="vicuna-13b-v1.5-16k",
        hf_config=dict(org="lmsys", name="vicuna-13b-v1.5-16k"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
        rope_condense_ratio=4,
    ),
]
configs.extend(vicuna)


# LMSYS LongChat
long_chat = [
    dict(
        name="longchat-7b-16k",
        hf_config=dict(org="lmsys", name="longchat-7b-16k"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
        rope_condense_ratio=8,
    ),
    dict(
        name="longchat-13b-16k",
        hf_config=dict(org="lmsys", name="longchat-13b-16k"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
        rope_condense_ratio=8,
    ),
]
configs.extend(long_chat)


# NousResearch Nous-Hermes
nous_research = [
    dict(
        name="Nous-Hermes-llama-2-7b",
        hf_config=dict(org="NousResearch", name="Nous-Hermes-llama-2-7b"),
        padded_vocab_size=32000,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
    ),
    dict(
        name="Nous-Hermes-13b",
        hf_config=dict(org="NousResearch", name="Nous-Hermes-13b"),
        block_size=2048,
        vocab_size=32000,
        padded_vocab_size=32001,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    dict(
        name="Nous-Hermes-Llama2-13b",
        hf_config=dict(org="NousResearch", name="Nous-Hermes-Llama2-13b"),
        vocab_size=32000,
        padded_vocab_size=32032,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
]
configs.extend(nous_research)


# Meta AI Llama 2
llama_2 = [
    dict(
        name="Llama-2-7b{}-hf",
        hf_config=dict(org="meta-llama", name="Llama-2-7b{}-hf"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
    ),
    dict(
        name="Llama-2-13b{}-hf",
        hf_config=dict(org="meta-llama", name="Llama-2-13b{}-hf"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    dict(
        name="Llama-2-70b{}-hf",
        hf_config=dict(org="meta-llama", name="Llama-2-70b{}-hf"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=80,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=28672,
    ),
]
for c in llama_2:
    for kind in ("", "-chat"):
        copy = deepcopy(c)
        copy["name"] = c["name"].format(kind)
        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
        configs.append(copy)


# Stability AI FreeWilly2
freewilly_2 = [
    dict(
        name="FreeWilly2",
        hf_config=dict(org="stabilityai", name="FreeWilly2"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=80,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=28672,
    )
]
configs.extend(freewilly_2)


# Meta AI Code Llama
code_llama = [
    dict(
        name="CodeLlama-7b-hf",
        hf_config=dict(org="codellama", name="CodeLlama-7b-hf"),
        block_size=16384,
        vocab_size=32016,
        padding_multiple=16,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
        rope_base=1000000,
    ),
    dict(
        name="CodeLlama-13b-hf",
        hf_config=dict(org="codellama", name="CodeLlama-13b-hf"),
        block_size=16384,
        vocab_size=32016,
        padding_multiple=16,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
        rope_base=1000000,
    ),
    dict(
        name="CodeLlama-34b-hf",
        hf_config=dict(org="codellama", name="CodeLlama-34b-hf"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=48,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=22016,
        rope_base=1000000,
    ),
    dict(
        name="CodeLlama-7b-Python-hf",
        hf_config=dict(org="codellama", name="CodeLlama-7b-Python-hf"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
        rope_base=1000000,
    ),
    dict(
        name="CodeLlama-13b-Python-hf",
        hf_config=dict(org="codellama", name="CodeLlama-13b-Python-hf"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
        rope_base=1000000,
    ),
    dict(
        name="CodeLlama-34b-Python-hf",
        hf_config=dict(org="codellama", name="CodeLlama-34b-Python-hf"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=48,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=22016,
        rope_base=1000000,
    ),
    dict(
        name="CodeLlama-7b-Instruct-hf",
        hf_config=dict(org="codellama", name="CodeLlama-7b-Instruct-hf"),
        block_size=16384,
        vocab_size=32016,
        padding_multiple=16,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
        rope_base=1000000,
    ),
    dict(
        name="CodeLlama-13b-Instruct-hf",
        hf_config=dict(org="codellama", name="CodeLlama-13b-Instruct-hf"),
        block_size=16384,  # like the other Code Llama checkpoints, a 16k context
        vocab_size=32016,
        padding_multiple=16,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
        rope_base=1000000,
    ),
    dict(
        name="CodeLlama-34b-Instruct-hf",
        hf_config=dict(org="codellama", name="CodeLlama-34b-Instruct-hf"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=48,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=22016,
        rope_base=1000000,
    ),
]
configs.extend(code_llama)


# garage-bAInd Platypus
platypus = [
    dict(
        name="Platypus-30B",
        hf_config=dict(org="garage-bAInd", name="Platypus-30B"),
        block_size=2048,
        padded_vocab_size=32000,
        n_layer=60,
        n_head=52,
        n_embd=6656,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-06,
        _mlp_class="LLaMAMLP",
        intermediate_size=17920,
    ),
    dict(
        name="Platypus2-7B",
        hf_config=dict(org="garage-bAInd", name="Platypus2-7B"),
        padded_vocab_size=32000,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
    ),
    dict(
        name="Platypus2-13B",
        hf_config=dict(org="garage-bAInd", name="Platypus2-13B"),
        padded_vocab_size=32000,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    dict(
        name="Platypus2-70B",
        hf_config=dict(org="garage-bAInd", name="Platypus2-70B"),
        padded_vocab_size=32000,
        n_layer=80,
        n_head=64,
        n_embd=8192,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=28672,
    ),
    dict(
        name="Camel-Platypus2-13B",
        hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-13B"),
        padded_vocab_size=32000,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    dict(
        name="Camel-Platypus2-70B",
        hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-70B"),
        padded_vocab_size=32000,
        n_layer=80,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=28672,
    ),
    dict(
        name="Stable-Platypus2-13B",
        hf_config=dict(org="garage-bAInd", name="Stable-Platypus2-13B"),
        padded_vocab_size=32000,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    dict(
        name="Platypus2-70B-instruct",
        hf_config=dict(org="garage-bAInd", name="Platypus2-70B-instruct"),
        padded_vocab_size=32000,
        n_layer=80,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=28672,
    ),
]
configs.extend(platypus)


# Stability AI StableCode
stablecode = [
    dict(
        name="stablecode-completion-alpha-3b",
        hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b"),
        block_size=16384,
        vocab_size=49152,
        n_layer=32,
        n_embd=2560,
    ),
    dict(
        name="stablecode-completion-alpha-3b-4k",
        hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b-4k"),
        vocab_size=49152,
        n_layer=32,
        n_embd=2560,
    ),
    dict(
        name="stablecode-instruct-alpha-3b",
        hf_config=dict(org="stabilityai", name="stablecode-instruct-alpha-3b"),
        vocab_size=49152,
        n_layer=32,
        n_embd=2560,
    ),
]
configs.extend(stablecode)


# togethercomputer LLaMA-2-7B-32K
together_llama2_32k = [
    dict(
        name="LLaMA-2-7B-32K",
        hf_config=dict(org="togethercomputer", name="LLaMA-2-7B-32K"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
        rope_condense_ratio=8,
    )
]
configs.extend(together_llama2_32k)


# Microsoft Phi
phi = [
    dict(
        name="phi-1_5",
        hf_config=dict(org="microsoft", name="phi-1_5"),
        vocab_size=50257,
        padded_vocab_size=51200,
        block_size=2048,
        n_embd=2048,
        n_layer=24,
        rotary_percentage=0.5,
        shared_attention_norm=True,
        lm_head_bias=True,
        gelu_approximate="tanh",
    )
]
configs.extend(phi)


# Mistral AI Mistral
mistral = [
    dict(
        name="Mistral-7B-{}v0.1",
        hf_config=dict(org="mistralai", name="Mistral-7B-{}v0.1"),
        padded_vocab_size=32000,
        block_size=4096,
        n_layer=32,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=14336,
    )
]
for c in mistral:
    for kind in ("", "Instruct-"):
        copy = deepcopy(c)
        copy["name"] = c["name"].format(kind)
        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
        configs.append(copy)


# TinyLlama
tiny_llama = [
    dict(
        name="tiny-llama-1.1b",
        hf_config=dict(org="PY007", name="TinyLlama-1.1B-intermediate-step-480k-1T"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=22,
        n_head=32,
        n_embd=2048,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-5,
        _mlp_class="LLaMAMLP",
        intermediate_size=5632,
        n_query_groups=4,
    ),
    # a scaled-down variant (fewer layers, smaller embedding, shorter context)
    # that reuses the TinyLlama Hugging Face checkpoint name
    dict(
        name="tiny-llama-new",
        hf_config=dict(org="PY007", name="TinyLlama-1.1B-intermediate-step-480k-1T"),
        block_size=768,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=18,
        n_head=32,
        n_embd=1024,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-5,
        _mlp_class="LLaMAMLP",
        intermediate_size=5632,
        n_query_groups=4,
    ),
]
configs.extend(tiny_llama)


name_to_config = {config["name"]: config for config in configs}
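

# A small smoke test (an illustrative sketch, not part of the original module).
# `pythia-70m` is defined above with vocab_size=50254 and padding_multiple=128,
# so the padded vocab size rounds up to find_multiple(50254, 128) == 50304.
if __name__ == "__main__":
    config = Config.from_name("pythia-70m", condense_ratio=4)  # legacy alias remapped by `from_name`
    assert config.padded_vocab_size == 50304
    assert config.rope_condense_ratio == 4
    # the deduped variants were registered under their own names above
    assert "pythia-70m-deduped" in name_to_config
    print(config)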