jweb committed
Commit 101d6a0
1 Parent(s): ff2242a

second commit

README.md CHANGED
@@ -1,3 +1,143 @@
- ---
- license: mit
- ---
+ ---
+ language: ja
+ thumbnail: https://github.com/ycat3/japanese-pretrained-models/blob/master/jweb.png
+ tags:
+ - ja
+ - japanese
+ - gpt2
+ - text-generation
+ - lm
+ - nlp
+ - rust
+ - rust-bert
+
+ license: mit
+
+ datasets:
+ - cc100
+ - wikipedia
+ - AozoraBunko
+
+ widget:
+ - text: "夏目漱石は、"
+ ---
+
+ # japanese-soseki-gpt2-1b
+
+ ![jweb-icon](./jweb.png)
+
+ This repository provides a 1.3B-parameter finetuned Japanese GPT-2 model.
+ The model was finetuned by [jweb](https://jweb.asia/), starting from a model trained by [rinna Co., Ltd.](https://corp.rinna.co.jp/).
+ Both PyTorch (pytorch_model.bin) and Rust (rust_model.ot) weights are provided.
+
+ # How to use the model
+
+ *NOTE:* Use `T5Tokenizer` to instantiate the tokenizer.
+
+ python
+ ~~~~
+ import torch
+ from transformers import T5Tokenizer, AutoModelForCausalLM
+
+ tokenizer = T5Tokenizer.from_pretrained("jweb/japanese-soseki-gpt2-1b")
+ model = AutoModelForCausalLM.from_pretrained("jweb/japanese-soseki-gpt2-1b")
+
+ if torch.cuda.is_available():
+     model = model.to("cuda")
+
+ text = "夏目漱石は、"
+ token_ids = tokenizer.encode(text, add_special_tokens=False, return_tensors="pt")
+
+ with torch.no_grad():
+     output_ids = model.generate(
+         token_ids.to(model.device),
+         max_length=128,
+         min_length=40,
+         do_sample=True,
+         repetition_penalty=1.6,
+         early_stopping=True,
+         num_beams=5,
+         temperature=1.0,
+         top_k=500,
+         top_p=0.95,
+         pad_token_id=tokenizer.pad_token_id,
+         bos_token_id=tokenizer.bos_token_id,
+         eos_token_id=tokenizer.eos_token_id,
+     )
+
+ output = tokenizer.decode(output_ids.tolist()[0])
+ print(output)
+ # sample output: 夏目漱石は、明治時代を代表する文豪です。夏目漱石の代表作は「吾輩は猫である」や「坊っちゃん」、「草枕」「三四郎」、それに「虞美人草(ぐびじんそう)」などたくさんあります。
+ ~~~~
+
+ rust
+ ~~~~
+ use rust_bert::gpt2::GPT2Generator;
+ use rust_bert::pipelines::common::{ModelType, TokenizerOption};
+ use rust_bert::pipelines::generation_utils::{GenerateConfig, LanguageGenerator};
+ use rust_bert::resources::{RemoteResource, ResourceProvider};
+ use tch::Device;
+
+ fn main() -> anyhow::Result<()> {
+     let model_resource = Box::new(RemoteResource {
+         url: "https://huggingface.co/jweb/japanese-soseki-gpt2-1b/resolve/main/rust_model.ot".into(),
+         cache_subdir: "japanese-soseki-gpt2-1b/model".into(),
+     });
+     let config_resource = Box::new(RemoteResource {
+         url: "https://huggingface.co/jweb/japanese-soseki-gpt2-1b/resolve/main/config.json".into(),
+         cache_subdir: "japanese-soseki-gpt2-1b/config".into(),
+     });
+     let vocab_resource = Box::new(RemoteResource {
+         url: "https://huggingface.co/jweb/japanese-soseki-gpt2-1b/resolve/main/spiece.model".into(),
+         cache_subdir: "japanese-soseki-gpt2-1b/vocab".into(),
+     });
+     let vocab_resource_token = vocab_resource.clone();
+     let merges_resource = vocab_resource.clone();
+     let generate_config = GenerateConfig {
+         model_resource,
+         config_resource,
+         vocab_resource,
+         merges_resource, // not used
+         device: Device::Cpu,
+         repetition_penalty: 1.6,
+         min_length: 40,
+         max_length: 128,
+         do_sample: true,
+         early_stopping: true,
+         num_beams: 5,
+         temperature: 1.0,
+         top_k: 500,
+         top_p: 0.95,
+         ..Default::default()
+     };
+     let tokenizer = TokenizerOption::from_file(
+         ModelType::T5,
+         vocab_resource_token.get_local_path().unwrap().to_str().unwrap(),
+         None,
+         true,
+         None,
+         None,
+     )?;
+     let mut gpt2_model = GPT2Generator::new_with_tokenizer(generate_config, tokenizer.into())?;
+     gpt2_model.set_device(Device::cuda_if_available());
+     let input_text = "夏目漱石は、";
+     let t1 = std::time::Instant::now();
+     let output = gpt2_model.generate(Some(&[input_text]), None);
+     println!("{}", output[0].text);
+     println!("Elapsed Time(ms): {}", t1.elapsed().as_millis());
+     Ok(())
+ }
+ // sample output: 夏目漱石は、明治から大正にかけて活躍した日本の小説家です。彼は「吾輩は猫である」や「坊っちゃん」、「草枕」「三四郎」、あるいは「虞美人草」などの小説で知られていますが、「明暗」のような小説も書いていました。
+ ~~~~
+
+ # Model architecture
+ A 24-layer, 2048-hidden-size transformer-based language model.
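+
+ As a quick check of these dimensions, the model's published configuration can be inspected with `transformers`; this is a minimal sketch, assuming `config.json` exposes the usual GPT-2 fields (`n_layer`, `n_embd`).
+
+ python
+ ~~~~
+ from transformers import AutoConfig
+
+ # Download and parse the config.json published with the model.
+ config = AutoConfig.from_pretrained("jweb/japanese-soseki-gpt2-1b")
+
+ # GPT-2 style configs expose depth and hidden size directly.
+ print("layers:", config.n_layer)       # expected: 24
+ print("hidden size:", config.n_embd)   # expected: 2048
+ print("vocab size:", config.vocab_size)
+ ~~~~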
+
+ # Training
+ The model was trained on [Japanese C4](https://huggingface.co/datasets/allenai/c4), [Japanese CC-100](http://data.statmt.org/cc-100/ja.txt.xz) and [Japanese Wikipedia](https://dumps.wikimedia.org/other/cirrussearch) to optimize a traditional language modelling objective. It reaches a perplexity of around 14 on a held-out validation set drawn from the same data.
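+
+ As an illustration only (not the exact validation setup behind that figure), perplexity on a held-out text can be estimated by exponentiating the model's average next-token cross-entropy; the sentence below is just a placeholder.
+
+ python
+ ~~~~
+ import math
+ import torch
+ from transformers import T5Tokenizer, AutoModelForCausalLM
+
+ tokenizer = T5Tokenizer.from_pretrained("jweb/japanese-soseki-gpt2-1b")
+ model = AutoModelForCausalLM.from_pretrained("jweb/japanese-soseki-gpt2-1b")
+ model.eval()
+
+ # Any held-out Japanese text can be substituted here.
+ text = "吾輩は猫である。名前はまだ無い。"
+ input_ids = tokenizer.encode(text, return_tensors="pt")
+
+ with torch.no_grad():
+     # With labels supplied, the model returns the mean cross-entropy
+     # of its next-token predictions over this text.
+     loss = model(input_ids, labels=input_ids).loss
+
+ print("perplexity:", math.exp(loss.item()))
+ ~~~~
+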
+ # Finetuning
+ The model was finetuned on [Aozorabunko](https://github.com/aozorabunko/aozorabunko), in particular on the works of Natsume Soseki.
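+
+ A minimal sketch of how such a causal-LM finetuning run could be set up with the `transformers` `Trainer` is shown below; the file name `soseki.txt` and all hyperparameters are illustrative placeholders, not the settings that were actually used.
+
+ python
+ ~~~~
+ from datasets import load_dataset
+ from transformers import (T5Tokenizer, AutoModelForCausalLM,
+                           DataCollatorForLanguageModeling,
+                           Trainer, TrainingArguments)
+
+ tokenizer = T5Tokenizer.from_pretrained("jweb/japanese-soseki-gpt2-1b")
+ model = AutoModelForCausalLM.from_pretrained("jweb/japanese-soseki-gpt2-1b")
+
+ # "soseki.txt" is a hypothetical plain-text dump of Aozora Bunko texts.
+ raw = load_dataset("text", data_files={"train": "soseki.txt"})
+
+ def tokenize(batch):
+     return tokenizer(batch["text"], truncation=True, max_length=512)
+
+ train_ds = raw["train"].map(tokenize, batched=True, remove_columns=["text"])
+
+ # mlm=False yields plain causal (next-token) language modelling labels.
+ collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
+
+ args = TrainingArguments(
+     output_dir="japanese-soseki-gpt2-1b-finetuned",  # illustrative
+     num_train_epochs=3,
+     per_device_train_batch_size=1,
+     gradient_accumulation_steps=8,
+     learning_rate=5e-5,
+ )
+
+ Trainer(model=model, args=args, train_dataset=train_ds,
+         data_collator=collator).train()
+ ~~~~
+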
+ # Tokenization
+ The model uses a [sentencepiece](https://github.com/google/sentencepiece)-based tokenizer. The vocabulary was first trained on a selected subset of the training data using the official sentencepiece training script, and then augmented with emojis and symbols.
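+
+ To see the sentencepiece model (spiece.model) in action, the widget prompt can be split into subword pieces and vocabulary ids; a minimal sketch:
+
+ python
+ ~~~~
+ from transformers import T5Tokenizer
+
+ tokenizer = T5Tokenizer.from_pretrained("jweb/japanese-soseki-gpt2-1b")
+
+ text = "夏目漱石は、"
+ # Subword pieces produced by the underlying sentencepiece model.
+ print(tokenizer.tokenize(text))
+ # The corresponding vocabulary ids, without special tokens.
+ print(tokenizer.encode(text, add_special_tokens=False))
+ ~~~~
+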
+ # License
+ [The MIT license](https://opensource.org/licenses/MIT)
tokenizer.json/added_tokens.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "[CLS]": 44878,
+   "[MASK]": 44879,
+   "[PAD]": 44877,
+   "[SEP]": 44876
+ }
tokenizer.json/special_tokens_map.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "bos_token": "<s>",
+   "cls_token": "[CLS]",
+   "eos_token": "</s>",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "<unk>"
+ }
tokenizer.json/spiece.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d955d3e358f66e9e8320bd834524b1264c21cc66a68fb18f3a4f091ed25a5c40
+ size 1044749
tokenizer.json/tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "additional_special_tokens": [],
+   "bos_token": "<s>",
+   "cls_token": "[CLS]",
+   "do_lower_case": false,
+   "eos_token": "</s>",
+   "extra_ids": 0,
+   "mask_token": "[MASK]",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "sp_model_kwargs": {},
+   "special_tokens_map_file": "/home/mycat/model/rinna/rinna-gpt/special_tokens_map.json",
+   "tokenizer_class": "T5Tokenizer",
+   "unk_token": "<unk>"
+ }