Refine model config and init.
parent 8330cbb036
commit 05f17b1221
@@ -3,6 +3,6 @@ __pycache__
 *.txt
 *.npy
 temp
-# lightning_logs
+lightning_logs
 
 checkpoints
@@ -4,12 +4,12 @@
 # LICENSE file in the root directory of this source tree.
 
 
-class QWenConfig:
+class ModelConfig:
     def __init__(self):
         self.vocab_size = 4096
-        self.hidden_size = 128  # 128 1024 2048  32
-        self.num_hidden_layers = 6  # 6 12 24  3
-        self.num_attention_heads = 8  # 8 8 16
+        self.hidden_size = 1024
+        self.num_hidden_layers = 24
+        self.num_attention_heads = 16
         self.emb_dropout_prob = 0.0
         self.attn_dropout_prob = 0.0
         self.layer_norm_epsilon = 1e-6
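Note: the rename to ModelConfig also makes the full-size shape (hidden_size 1024, 24 layers, 16 heads) the default, where the old defaults were the debug shape (128 / 6 / 8). A minimal sketch, mirroring what wit/train.py does later in this commit, of getting the debug shape back by overriding fields after construction (field names taken from the class above):

    config = ModelConfig()
    config.vocab_size = 4096
    config.hidden_size = 128        # debug-sized model (the previous defaults)
    config.num_hidden_layers = 6
    config.num_attention_heads = 8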
@@ -4,7 +4,7 @@ from modelscope import snapshot_download
 
 from modeling_wit import QWenLMHeadModel
 from modeling_wit import QwenRunner
-from configuration_qwen import QWenConfig
+from wit.configuration import ModelConfig
 from tokenization_qwen import QWenTokenizer
 
 
@@ -20,7 +20,7 @@ torch.cuda.manual_seed_all(seed)
 model_dir = snapshot_download("qwen/Qwen-1_8B-Chat")
 # model_dir = "/home/colin/.cache/modelscope/hub/qwen/Qwen-1_8B-Chat"
 
-config = QWenConfig()
+config = ModelConfig()
 model = QWenLMHeadModel(config)
 
 print(model)
										
Binary file not shown.
@@ -1,3 +1,37 @@
-model_dir: /home/colin/.cache/modelscope/hub/qwen/Qwen-1_8B-Chat
+config: !!python/object:wit.configuration.ModelConfig
+  attn_dropout_prob: 0.0
+  bf16: false
+  chat_format: chatml
+  do_sample: true
+  emb_dropout_prob: 0.0
+  fp16: false
+  fp32: false
+  hidden_size: 128
+  initializer_range: 0.02
+  intermediate_size: 5504
+  layer_norm_epsilon: 1.0e-06
+  max_new_tokens: 512
+  max_position_embeddings: 8192
+  max_window_size: 6144
+  model_max_length: 8192
+  no_bias: true
+  num_attention_heads: 8
+  num_hidden_layers: 6
+  repetition_penalty: 1.1
+  rotary_emb_base: 10000
+  rotary_pct: 1.0
+  scale_attn_weights: true
+  softmax_in_fp32: false
+  tie_word_embeddings: false
+  top_k: 0
+  top_p: 0.8
+  use_cache: true
+  use_cache_kernel: false
+  use_cache_quantization: false
+  use_dynamic_ntk: true
+  use_flash_attn: auto
+  use_logn_attn: true
+  vocab_size: 4096
 learning_rate: 0.0001
+pretrained_model_dir: null
 use_tril_attention_mask: null
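This hparams.yaml now serializes the whole ModelConfig object with PyYAML's !!python/object tag, and it records the debug shape (hidden_size 128, 6 layers, 8 heads) set by wit/train.py below. A minimal sketch of reading it back (assumes PyYAML, an importable wit.configuration module, and a trusted file; the path is illustrative) — the default safe loader rejects object tags, so an unsafe loader is required:

    import yaml

    with open("hparams.yaml") as f:  # illustrative path to the checkpoint's hparams file
        hparams = yaml.load(f, Loader=yaml.UnsafeLoader)

    cfg = hparams["config"]                           # a wit.configuration.ModelConfig instance
    print(cfg.hidden_size, hparams["learning_rate"])  # 128 0.0001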
										
Binary file not shown.
@@ -1,3 +0,0 @@
-model_dir: /home/colin/.cache/modelscope/hub/qwen/Qwen-1_8B-Chat
-learning_rate: 0.0001
-use_tril_attention_mask: null
										
Binary file not shown.
@@ -1,3 +0,0 @@
-model_dir: /home/colin/.cache/modelscope/hub/qwen/Qwen-1_8B-Chat
-learning_rate: 0.0001
-use_tril_attention_mask: null
@@ -6,23 +6,27 @@ import torch
 import torchmetrics
 
 from modeling_wit import QWenLMHeadModel
-from configuration_qwen import QWenConfig
+from wit.configuration import ModelConfig
 
 from transformers import AutoConfig
+from modelscope import snapshot_download
 
 
 class LitModule(pl.LightningModule):
     def __init__(
         self,
-        model_dir: str,
+        pretrained_model_dir: str = None,
         learning_rate: float = 0.0001,
+        config: ModelConfig = None,
         use_tril_attention_mask: str = False,
     ):
         super().__init__()
         self.save_hyperparameters()
-        config = QWenConfig()
+        if config == None:
+            config = ModelConfig()
         model = QWenLMHeadModel(config)
-        model = model.from_pretrained(model_dir)
+        if pretrained_model_dir != None:
+            model = model.from_pretrained(snapshot_download(pretrained_model_dir))
         self.llm = self.register_core_module(model)
         self.learning_rate = learning_rate
         self.use_tril_attention_mask = use_tril_attention_mask
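With this change LitModule no longer needs a local model directory: the config is injected (or defaulted to ModelConfig()), and pretrained weights are optional and fetched through modelscope. A rough usage sketch under those assumptions (values illustrative):

    # Train from scratch with an explicit, small config.
    config = ModelConfig()
    config.hidden_size = 128
    module = LitModule(pretrained_model_dir=None, learning_rate=1e-4, config=config)

    # Fine-tune: the model name is resolved and downloaded via snapshot_download.
    module = LitModule(pretrained_model_dir="qwen/Qwen-1_8B-Chat", learning_rate=1e-4)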
@@ -115,7 +115,7 @@ class MeaningDataset(Dataset):
 
     def __getitem__(self, idx):
         output = {}
-        data = torch.tensor(self.data[idx])
+        data = torch.tensor(self.data[idx]).long()
         output["input_ids"] = data
         output["labels"] = data.clone()
         output["token_type_ids"] = torch.zeros(data.shape)
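The added .long() pins input_ids and labels to int64: embedding lookups and loss targets need integer indices, and torch.tensor() preserves whatever dtype the stored row has. Illustrative check (assumes a row may arrive as a NumPy float array):

    import numpy as np
    import torch

    row = np.array([3.0, 1.0, 4.0])
    print(torch.tensor(row).dtype)         # torch.float64 -> rejected as embedding indices
    print(torch.tensor(row).long().dtype)  # torch.int64   -> valid token ids / labels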
@@ -20,8 +20,8 @@ class SpecialDataset(Dataset):
         z = torch.zeros([size]).long()
         # self.data = torch.stack([a, b, a + b, a + b, a + b * 2]).permute(1, 0)
         # self.data = torch.stack([a, b, a, a + b / 4]).permute(1, 0).long()
-        # self.data = torch.stack([a, a + 1, a + 2]).permute(1, 0).long()
-        self.data = torch.stack([a, b, a]).permute(1, 0).long()
+        self.data = torch.stack([a, a + 1, a + 2]).permute(1, 0).long()
+        # self.data = torch.stack([a, b, a]).permute(1, 0).long()
         # self.data = torch.stack([a, b, a, a + a / 8, a + a / 4, a + a / 2, a + a]).permute(1, 0).long()
 
         # input  a b c
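The active pattern switches from [a, b, a] to [a, a + 1, a + 2], so every sample is three consecutive token ids. A quick sketch of the resulting rows (how a is generated is not shown in this hunk, so the bound below is an assumption that keeps a + 2 inside the vocabulary):

    import torch

    size, vocab_size = 4, 4096
    a = torch.randint(0, vocab_size - 2, [size])
    data = torch.stack([a, a + 1, a + 2]).permute(1, 0).long()
    print(data.tolist())  # e.g. [[17, 18, 19], [803, 804, 805], ...]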
							
								
								
									
wit/train.py (29 changed lines)
@@ -14,21 +14,20 @@ from transformers import (
     PreTrainedTokenizer,
     set_seed,
 )
-from modelscope import snapshot_download
 from lit_module import LitModule
 from tokenization_qwen import QWenTokenizer
 from logger import TBLogger
 
 from special_dataset import SpecialDataset
 from meaning_dataset import MeaningDataset
+from wit.configuration import ModelConfig
 
-model_name = "qwen/Qwen-1_8B-Chat"
+pretrain_model_name = None  # "qwen/Qwen-1_8B-Chat"
 learning_rate = 0.0001
 use_tril_attention_mask = None
 precision = "32-true"  # "precision:bf16-mixed,16-mixed,32-true"
-tokenizer_name_or_path = None
-train_batch_size = 16
-val_batch_size = 16
+train_batch_size = 256
+val_batch_size = 1
 num_proc = 8
 max_epochs = 1000
 strategy = "auto"
@@ -38,21 +37,27 @@ vocab_size = 4096
 
 
 if __name__ == "__main__":
-    if tokenizer_name_or_path is None:
-        tokenizer_name_or_path = model_name
 
     set_seed(seed)
 
-    model_dir = snapshot_download(model_name)
-    lit_module = LitModule(model_dir, learning_rate, use_tril_attention_mask)
+    config = ModelConfig()
+    config.vocab_size = vocab_size
+    config.hidden_size = 128  # 128 1024 2048  32
+    config.num_hidden_layers = 6  # 6 12 24  3
+    config.num_attention_heads = 8  # 8 8 16
+
+    lit_module = LitModule(pretrain_model_name, learning_rate, config, use_tril_attention_mask)
 
     tokenizer = QWenTokenizer("./wit_b64.tiktoken", "./wit_char.tiktoken")
 
-    # raw_dataset = SpecialDataset()
-    raw_dataset = MeaningDataset(start=131072, end=1048576, size=32768)
+    raw_dataset = SpecialDataset()
+    # raw_dataset = MeaningDataset(start=65536, end=262133, size=32768, max_subitem=4, vocab_size=vocab_size)
     train_dataset, val_dataset = random_split(raw_dataset, [0.95, 0.05])
 
-    # daf = next(iter(train_dataset))["input_ids"].numpy().tolist()
+    it = iter(train_dataset)
+    print("data samples:")
+    for i in range(10):
+        print(next(it)["input_ids"].numpy().tolist())
 
     train_dataloader = DataLoader(
         train_dataset,
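The hunk is cut off at the start of the DataLoader construction. For orientation only, a sketch (an assumption, not the repository's actual wiring) of how the module-level settings from the first hunk typically feed a Lightning trainer:

    import pytorch_lightning as pl
    from torch.utils.data import DataLoader

    # Hypothetical continuation; train_batch_size, val_batch_size, num_proc,
    # precision, max_epochs and strategy are the values defined above.
    train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, num_workers=num_proc)
    val_dataloader = DataLoader(val_dataset, batch_size=val_batch_size, num_workers=num_proc)

    trainer = pl.Trainer(precision=precision, max_epochs=max_epochs, strategy=strategy)
    trainer.fit(lit_module, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)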