Compare commits


2 Commits

Author SHA1 Message Date
Colin f96bcc799c Refine model of qwen for long sequence in eval. 2024-01-19 14:54:48 +08:00
Colin 45c2f532ff Add mem_tracker in tools. 2024-01-19 14:52:28 +08:00
6 changed files with 197 additions and 34 deletions

.gitignore (+1 line)

@@ -1,2 +1,3 @@
 __pycache__
 .vscode
+*.txt


@@ -52,11 +52,10 @@ print(model)
 tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
-model = model.from_pretrained(
-    model_dir, config=config, device_map="auto", trust_remote_code=True
-).train()
-# model.train()
-# model.zero_grad()
+model = model.from_pretrained(model_dir, config=config, device_map="auto", trust_remote_code=True)
+# model = model.eval()
+model = model.train() # control by @torch.no_grad()
 
 # 可指定不同的生成长度、top_p等相关超参
 # model.generation_config = GenerationConfig.from_pretrained(
@@ -74,16 +73,14 @@ print(decode_tokens)
 # 日本的首都东京。<|im_end|><|endoftext|>
 
 # # 第一轮对话
 # response, history, decode_tokens = model.chat(tokenizer, "你好", "", history=None)
 # print(decode_tokens)
 # # 你好!很高兴为你提供帮助。
 
 # 第二轮对话
-# response, history = model.chat(tokenizer, "给我讲一个年轻人奋斗创业最终取得成功的故事。", history=None)
-# print(response)
+response, history, decode_tokens = model.chat(tokenizer, "给我讲一个年轻人奋斗创业最终取得成功的故事。", "", history=None)
+print(response)
 
 # <|im_start|>system
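For context on what these two hunks do: the script now loads the model with a single from_pretrained call, leaves it in train() mode, and relies on the @torch.no_grad() decorator added to chat() in the model file below to disable gradient tracking during generation. A minimal sketch of that pattern, with a toy module standing in for the Qwen model (the names here are illustrative, not from the commit):

import torch
import torch.nn as nn

model = nn.Linear(16, 16).train()  # keep the module in train() mode, as the script now does

@torch.no_grad()
def chat(m, x):
    # even with m.training == True, no autograd graph is recorded inside this call
    return m(x)

out = chat(model, torch.randn(2, 16))
print(model.training, out.requires_grad)  # True False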


@@ -41,8 +41,10 @@ import sys
 sys.path.append("..")
 from tools import show
+from tools import mem_tracker
 
-logger = logging.get_logger(__name__)
+# tracker = mem_tracker.MemTracker()
+# tracker.track()
 
 class QWenAttention(nn.Module):
@@ -110,8 +112,6 @@ class QWenAttention(nn.Module):
         query = apply_rotary_pos_emb(query, q_pos_emb)
         key = apply_rotary_pos_emb(key, k_pos_emb)
 
-        present = (key, value)
-
         key_size = key.size(1)
         if key_size > self.seq_length and not self.training:
             seq_start = key.size(1) - query.size(1)
@@ -148,8 +148,8 @@ class QWenAttention(nn.Module):
         attn_output = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask).transpose(1, 2)
         context_layer = self._merge_heads(attn_output, self.num_heads, self.head_dim)
         attn_output = self.c_proj(context_layer)
-        outputs = (attn_output, present)
-        return outputs
+        return attn_output
 
 
 class QWenMLP(nn.Module):
@@ -199,7 +199,6 @@ class QWenBlock(nn.Module):
             attention_mask=attention_mask,
         )
         attn_output = attn_outputs[0]
-        outputs = attn_outputs[1:]
 
         residual = hidden_states
         layernorm_input = attn_output + residual
@@ -207,8 +206,7 @@ class QWenBlock(nn.Module):
         residual = layernorm_input
         mlp_output = self.mlp(layernorm_output)
         hidden_states = residual + mlp_output
-        outputs = (hidden_states,) + outputs
-        return outputs
+        return hidden_states
 
 
 class QWenPreTrainedModel(PreTrainedModel):
@@ -312,16 +310,13 @@ class QWenModel(QWenPreTrainedModel):
         hidden_states = self.drop(hidden_states)
         output_shape = input_shape + (hidden_states.size(-1),)
 
-        presents = ()
         all_hidden_states = None
-        for i, block in enumerate(self.h):
-            outputs = block(
+        for block in self.h:
+            hidden_states = block(
                 hidden_states,
                 rotary_pos_emb_list=rotary_pos_emb_list,
                 attention_mask=attention_mask,
             )
-            hidden_states = outputs[0]
-            presents = presents + (outputs[1],)
 
         hidden_states = self.ln_f(hidden_states)
         hidden_states = hidden_states.view(output_shape)
@@ -392,6 +387,7 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             attentions=transformer_outputs.attentions,
         )
 
+    @torch.no_grad()
     def chat(
         self,
         tokenizer: PreTrainedTokenizer,
@@ -454,15 +450,9 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         # 2. Set generation parameters if not already defined
         if generation_config.pad_token_id is None and generation_config.eos_token_id is not None:
-            if model_kwargs.get("attention_mask", None) is None:
-                logger.warning(
-                    "The attention mask and the pad token id were not set. As a consequence, you may observe "
-                    "unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results."
-                )
             eos_token_id = generation_config.eos_token_id
             if isinstance(eos_token_id, list):
                 eos_token_id = eos_token_id[0]
-            logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{eos_token_id} for open-end generation.")
             generation_config.pad_token_id = eos_token_id
 
         # 3. Define model inputs
@@ -571,7 +561,6 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             if this_peer_finished:
                 break
 
-        return input_ids
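Taken together, these hunks strip the KV-cache plumbing from the forward path: attention no longer builds a present tuple, each block returns only its hidden states, and QWenModel simply threads hidden_states through self.h. The resulting control flow is roughly the sketch below (made-up block internals, not the real QWenBlock):

import torch
import torch.nn as nn

class Block(nn.Module):
    # stand-in for QWenBlock: returns only the new hidden states, no (hidden_states, present) tuple
    def __init__(self, dim):
        super().__init__()
        self.mlp = nn.Linear(dim, dim)

    def forward(self, hidden_states):
        return hidden_states + self.mlp(hidden_states)

class Model(nn.Module):
    # stand-in for QWenModel after the change
    def __init__(self, dim, n_layers):
        super().__init__()
        self.h = nn.ModuleList([Block(dim) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(dim)

    def forward(self, hidden_states):
        for block in self.h:  # mirrors the new `for block in self.h:` loop
            hidden_states = block(hidden_states)
        return self.ln_f(hidden_states)

print(Model(8, 2)(torch.randn(1, 4, 8)).shape)  # torch.Size([1, 4, 8])

Dropping the per-layer present tuples trades incremental decoding state for a simpler graph, which is consistent with the commit message about refining the model for eval.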


@@ -1 +1,2 @@
 from tools import show
+from tools import mem_tracker

tools/mem_tracker.py (new file, +171 lines)

@@ -0,0 +1,171 @@
import gc
import datetime
import inspect
import torch
import numpy as np
import torch.nn as nn

dtype_memory_size_dict = {
    torch.float64: 64 / 8,
    torch.double: 64 / 8,
    torch.float32: 32 / 8,
    torch.float: 32 / 8,
    torch.float16: 16 / 8,
    torch.half: 16 / 8,
    torch.int64: 64 / 8,
    torch.long: 64 / 8,
    torch.int32: 32 / 8,
    torch.int: 32 / 8,
    torch.int16: 16 / 8,
    torch.short: 16 / 8,
    torch.uint8: 8 / 8,
    torch.int8: 8 / 8,
}

# compatibility of torch1.0
if getattr(torch, "bfloat16", None) is not None:
    dtype_memory_size_dict[torch.bfloat16] = 16 / 8
if getattr(torch, "bool", None) is not None:
    dtype_memory_size_dict[torch.bool] = 8 / 8  # pytorch use 1 byte for a bool, see https://github.com/pytorch/pytorch/issues/41571


def get_mem_space(x):
    try:
        ret = dtype_memory_size_dict[x]
    except KeyError:
        print(f"dtype {x} is not supported!")
        ret = 0  # treat unsupported dtypes as 0 bytes rather than failing
    return ret


class MemTracker(object):
    """
    Class used to track pytorch GPU memory usage
    Arguments:
        detail(bool, default True): whether to log per-tensor detail in the GPU memory report
        path(str): where to save the log file
        verbose(bool, default False): whether to print trivial exceptions
        device(int): GPU number, default is 0
    """

    def __init__(self, detail=True, path="", verbose=False, device=0):
        self.print_detail = detail
        self.last_tensor_sizes = set()
        self.gpu_profile_fn = path + f"{datetime.datetime.now():%d-%b-%y-%H:%M:%S}-gpu_mem_track.txt"
        self.verbose = verbose
        self.begin = True
        self.device = device

    def get_tensors(self):
        for obj in gc.get_objects():
            try:
                if torch.is_tensor(obj) or (hasattr(obj, "data") and torch.is_tensor(obj.data)):
                    tensor = obj
                else:
                    continue
                if tensor.is_cuda:
                    yield tensor
            except Exception as e:
                if self.verbose:
                    print("A trivial exception occurred: {}".format(e))

    def get_tensor_usage(self):
        sizes = [np.prod(np.array(tensor.size())) * get_mem_space(tensor.dtype) for tensor in self.get_tensors()]
        return np.sum(sizes) / 1024**2

    def get_allocate_usage(self):
        return torch.cuda.memory_allocated() / 1024**2

    def clear_cache(self):
        gc.collect()
        torch.cuda.empty_cache()

    def print_all_gpu_tensor(self, file=None):
        for x in self.get_tensors():
            print(x.size(), x.dtype, np.prod(np.array(x.size())) * get_mem_space(x.dtype) / 1024**2, file=file)

    def track(self):
        """
        Track the GPU memory usage
        """
        frameinfo = inspect.stack()[1]
        where_str = frameinfo.filename + " line " + str(frameinfo.lineno) + ": " + frameinfo.function

        with open(self.gpu_profile_fn, "a+") as f:
            if self.begin:
                f.write(
                    f"GPU Memory Track | {datetime.datetime.now():%d-%b-%y-%H:%M:%S} |"
                    f" Total Tensor Used Memory:{self.get_tensor_usage():<7.1f}Mb"
                    f" Total Allocated Memory:{self.get_allocate_usage():<7.1f}Mb\n\n"
                )
                self.begin = False

            if self.print_detail is True:
                ts_list = [(tensor.size(), tensor.dtype) for tensor in self.get_tensors()]
                new_tensor_sizes = {
                    (
                        type(x),
                        tuple(x.size()),
                        ts_list.count((x.size(), x.dtype)),
                        np.prod(np.array(x.size())) * get_mem_space(x.dtype) / 1024**2,
                        x.dtype,
                    )
                    for x in self.get_tensors()
                }
                for t, s, n, m, data_type in new_tensor_sizes - self.last_tensor_sizes:
                    f.write(
                        f"+ | {str(n)} * Size:{str(s):<20} | Memory: {str(m*n)[:6]} M | {str(t):<20} | {data_type}\n"
                    )
                for t, s, n, m, data_type in self.last_tensor_sizes - new_tensor_sizes:
                    f.write(
                        f"- | {str(n)} * Size:{str(s):<20} | Memory: {str(m*n)[:6]} M | {str(t):<20} | {data_type}\n"
                    )
                self.last_tensor_sizes = new_tensor_sizes

            f.write(
                f"\nAt {where_str:<50}"
                f" Total Tensor Used Memory:{self.get_tensor_usage():<7.1f}Mb"
                f" Total Allocated Memory:{self.get_allocate_usage():<7.1f}Mb\n\n"
            )


def ModelSize(model, input, type_size=4):
    para = sum([np.prod(list(p.size())) for p in model.parameters()])
    # print('Model {} : Number of params: {}'.format(model._get_name(), para))
    print("Model {} : params: {:4f}M".format(model._get_name(), para * type_size / 1000 / 1000))

    input_ = input.clone()
    input_.requires_grad_(requires_grad=False)

    mods = list(model.modules())
    out_sizes = []
    for i in range(1, len(mods)):
        m = mods[i]
        if isinstance(m, nn.ReLU):
            if m.inplace:
                continue
        out = m(input_)
        out_sizes.append(np.array(out.size()))
        input_ = out

    total_nums = 0
    for i in range(len(out_sizes)):
        s = out_sizes[i]
        nums = np.prod(np.array(s))
        total_nums += nums

    # print('Model {} : Number of intermediate variables without backward: {}'.format(model._get_name(), total_nums))
    # print('Model {} : Number of intermediate variables with backward: {}'.format(model._get_name(), total_nums*2))
    print(
        "Model {} : intermediate variables: {:3f} M (without backward)".format(
            model._get_name(), total_nums * type_size / 1000 / 1000
        )
    )
    print(
        "Model {} : intermediate variables: {:3f} M (with backward)".format(
            model._get_name(), total_nums * type_size * 2 / 1000 / 1000
        )
    )
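As a sanity check on the units: get_tensor_usage() sums, over every live CUDA tensor found via gc, the element count multiplied by the per-element byte size from dtype_memory_size_dict, then divides by 1024**2 to report megabytes. A hand-worked example of that arithmetic (the shape is borrowed from the test script below and is only illustrative):

import numpy as np

elements = np.prod([3, 127, 127])          # 48387 elements
size_mb = elements * (32 / 8) / 1024**2    # float32 is 4 bytes per element
print(round(size_mb, 3))                   # ~0.185, roughly the figure a "+" line in the track log would show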


@@ -1,5 +1,6 @@
 import show
 import torch
+import mem_tracker
 
 # radata = torch.randn(8192, 128)
@@ -14,7 +15,6 @@ radata = torch.randn(3,127,127)
 show.DumpTensorToImage(radata, "test.png")
 
 radata = torch.randn(127, 127)
 show.DumpTensorToLog(radata, "test.log")
@@ -22,3 +22,7 @@ show.DumpTensorToLog(radata, "test.log")
 radata = torch.randn(127, 127) - 0.5
 show.ProbGE0(radata)
 show.DumpProb()
+
+radata = torch.randn(127, 127).cuda()
+tracker = mem_tracker.MemTracker()
+tracker.track()
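Since track() diffs the set of live CUDA tensors against the previous snapshot, calling it again after another allocation logs the newly visible tensors as "+" entries (and vanished ones as "-") in the generated *-gpu_mem_track.txt file. A possible continuation of the script above, not part of this commit and requiring a CUDA device:

radata = torch.randn(3, 127, 127).cuda()  # allocate something new after the first snapshot
tracker.track()                            # the second snapshot records the change as "+" lines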