Compare commits
No commits in common. "f96bcc799c36219c76a3475e71f186845a53f61a" and "3233616aac23dc6b7b90d94594f434d896e5edae" have entirely different histories.
f96bcc799c
...
3233616aac
|
@ -1,3 +1,2 @@
|
||||||
__pycache__
|
__pycache__
|
||||||
.vscode
|
.vscode
|
||||||
*.txt
|
|
15
qwen/demo.py
15
qwen/demo.py
|
@ -52,10 +52,11 @@ print(model)
|
||||||
|
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
|
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
|
||||||
model = model.from_pretrained(model_dir, config=config, device_map="auto", trust_remote_code=True)
|
model = model.from_pretrained(
|
||||||
|
model_dir, config=config, device_map="auto", trust_remote_code=True
|
||||||
# model = model.eval()
|
).train()
|
||||||
model = model.train() # control by @torch.no_grad()
|
# model.train()
|
||||||
|
# model.zero_grad()
|
||||||
|
|
||||||
# 可指定不同的生成长度、top_p等相关超参
|
# 可指定不同的生成长度、top_p等相关超参
|
||||||
# model.generation_config = GenerationConfig.from_pretrained(
|
# model.generation_config = GenerationConfig.from_pretrained(
|
||||||
|
@ -73,14 +74,16 @@ print(decode_tokens)
|
||||||
# 日本的首都东京。<|im_end|><|endoftext|>
|
# 日本的首都东京。<|im_end|><|endoftext|>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# # 第一轮对话
|
# # 第一轮对话
|
||||||
# response, history, decode_tokens = model.chat(tokenizer, "你好", "", history=None)
|
# response, history, decode_tokens = model.chat(tokenizer, "你好", "", history=None)
|
||||||
# print(decode_tokens)
|
# print(decode_tokens)
|
||||||
# # 你好!很高兴为你提供帮助。
|
# # 你好!很高兴为你提供帮助。
|
||||||
|
|
||||||
# 第二轮对话
|
# 第二轮对话
|
||||||
response, history, decode_tokens = model.chat(tokenizer, "给我讲一个年轻人奋斗创业最终取得成功的故事。", "", history=None)
|
# response, history = model.chat(tokenizer, "给我讲一个年轻人奋斗创业最终取得成功的故事。", history=None)
|
||||||
print(response)
|
# print(response)
|
||||||
|
|
||||||
|
|
||||||
# <|im_start|>system
|
# <|im_start|>system
|
||||||
|
|
|
@ -41,10 +41,8 @@ import sys
|
||||||
|
|
||||||
sys.path.append("..")
|
sys.path.append("..")
|
||||||
from tools import show
|
from tools import show
|
||||||
from tools import mem_tracker
|
|
||||||
|
|
||||||
# tracker = mem_tracker.MemTracker()
|
logger = logging.get_logger(__name__)
|
||||||
# tracker.track()
|
|
||||||
|
|
||||||
|
|
||||||
class QWenAttention(nn.Module):
|
class QWenAttention(nn.Module):
|
||||||
|
@ -112,6 +110,8 @@ class QWenAttention(nn.Module):
|
||||||
query = apply_rotary_pos_emb(query, q_pos_emb)
|
query = apply_rotary_pos_emb(query, q_pos_emb)
|
||||||
key = apply_rotary_pos_emb(key, k_pos_emb)
|
key = apply_rotary_pos_emb(key, k_pos_emb)
|
||||||
|
|
||||||
|
present = (key, value)
|
||||||
|
|
||||||
key_size = key.size(1)
|
key_size = key.size(1)
|
||||||
if key_size > self.seq_length and not self.training:
|
if key_size > self.seq_length and not self.training:
|
||||||
seq_start = key.size(1) - query.size(1)
|
seq_start = key.size(1) - query.size(1)
|
||||||
|
@ -148,8 +148,8 @@ class QWenAttention(nn.Module):
|
||||||
attn_output = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask).transpose(1, 2)
|
attn_output = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask).transpose(1, 2)
|
||||||
context_layer = self._merge_heads(attn_output, self.num_heads, self.head_dim)
|
context_layer = self._merge_heads(attn_output, self.num_heads, self.head_dim)
|
||||||
attn_output = self.c_proj(context_layer)
|
attn_output = self.c_proj(context_layer)
|
||||||
|
outputs = (attn_output, present)
|
||||||
return attn_output
|
return outputs
|
||||||
|
|
||||||
|
|
||||||
class QWenMLP(nn.Module):
|
class QWenMLP(nn.Module):
|
||||||
|
@ -199,6 +199,7 @@ class QWenBlock(nn.Module):
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
)
|
)
|
||||||
attn_output = attn_outputs[0]
|
attn_output = attn_outputs[0]
|
||||||
|
outputs = attn_outputs[1:]
|
||||||
residual = hidden_states
|
residual = hidden_states
|
||||||
layernorm_input = attn_output + residual
|
layernorm_input = attn_output + residual
|
||||||
|
|
||||||
|
@ -206,7 +207,8 @@ class QWenBlock(nn.Module):
|
||||||
residual = layernorm_input
|
residual = layernorm_input
|
||||||
mlp_output = self.mlp(layernorm_output)
|
mlp_output = self.mlp(layernorm_output)
|
||||||
hidden_states = residual + mlp_output
|
hidden_states = residual + mlp_output
|
||||||
return hidden_states
|
outputs = (hidden_states,) + outputs
|
||||||
|
return outputs
|
||||||
|
|
||||||
|
|
||||||
class QWenPreTrainedModel(PreTrainedModel):
|
class QWenPreTrainedModel(PreTrainedModel):
|
||||||
|
@ -310,13 +312,16 @@ class QWenModel(QWenPreTrainedModel):
|
||||||
hidden_states = self.drop(hidden_states)
|
hidden_states = self.drop(hidden_states)
|
||||||
output_shape = input_shape + (hidden_states.size(-1),)
|
output_shape = input_shape + (hidden_states.size(-1),)
|
||||||
|
|
||||||
|
presents = ()
|
||||||
all_hidden_states = None
|
all_hidden_states = None
|
||||||
for block in self.h:
|
for i, block in enumerate(self.h):
|
||||||
hidden_states = block(
|
outputs = block(
|
||||||
hidden_states,
|
hidden_states,
|
||||||
rotary_pos_emb_list=rotary_pos_emb_list,
|
rotary_pos_emb_list=rotary_pos_emb_list,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
)
|
)
|
||||||
|
hidden_states = outputs[0]
|
||||||
|
presents = presents + (outputs[1],)
|
||||||
|
|
||||||
hidden_states = self.ln_f(hidden_states)
|
hidden_states = self.ln_f(hidden_states)
|
||||||
hidden_states = hidden_states.view(output_shape)
|
hidden_states = hidden_states.view(output_shape)
|
||||||
|
@ -387,7 +392,6 @@ class QWenLMHeadModel(QWenPreTrainedModel):
|
||||||
attentions=transformer_outputs.attentions,
|
attentions=transformer_outputs.attentions,
|
||||||
)
|
)
|
||||||
|
|
||||||
@torch.no_grad()
|
|
||||||
def chat(
|
def chat(
|
||||||
self,
|
self,
|
||||||
tokenizer: PreTrainedTokenizer,
|
tokenizer: PreTrainedTokenizer,
|
||||||
|
@ -450,9 +454,15 @@ class QWenLMHeadModel(QWenPreTrainedModel):
|
||||||
# 2. Set generation parameters if not already defined
|
# 2. Set generation parameters if not already defined
|
||||||
|
|
||||||
if generation_config.pad_token_id is None and generation_config.eos_token_id is not None:
|
if generation_config.pad_token_id is None and generation_config.eos_token_id is not None:
|
||||||
|
if model_kwargs.get("attention_mask", None) is None:
|
||||||
|
logger.warning(
|
||||||
|
"The attention mask and the pad token id were not set. As a consequence, you may observe "
|
||||||
|
"unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results."
|
||||||
|
)
|
||||||
eos_token_id = generation_config.eos_token_id
|
eos_token_id = generation_config.eos_token_id
|
||||||
if isinstance(eos_token_id, list):
|
if isinstance(eos_token_id, list):
|
||||||
eos_token_id = eos_token_id[0]
|
eos_token_id = eos_token_id[0]
|
||||||
|
logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{eos_token_id} for open-end generation.")
|
||||||
generation_config.pad_token_id = eos_token_id
|
generation_config.pad_token_id = eos_token_id
|
||||||
|
|
||||||
# 3. Define model inputs
|
# 3. Define model inputs
|
||||||
|
@ -561,6 +571,7 @@ class QWenLMHeadModel(QWenPreTrainedModel):
|
||||||
|
|
||||||
if this_peer_finished:
|
if this_peer_finished:
|
||||||
break
|
break
|
||||||
|
|
||||||
return input_ids
|
return input_ids
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,2 +1 @@
|
||||||
from tools import show
|
from tools import show
|
||||||
from tools import mem_tracker
|
|
|
@ -1,171 +0,0 @@
|
||||||
import gc
|
|
||||||
import datetime
|
|
||||||
import inspect
|
|
||||||
|
|
||||||
import torch
|
|
||||||
import numpy as np
|
|
||||||
import torch.nn as nn
|
|
||||||
|
|
||||||
# Bytes occupied by a single element of each supported torch dtype.
#
# Several keys below are aliases of one and the same dtype object
# (e.g. torch.double is torch.float64, torch.short is torch.int16). In a dict
# literal the later duplicate key silently overwrites the earlier one, so every
# alias must carry exactly the same value as its canonical name.
dtype_memory_size_dict = {
    torch.float64: 64 / 8,
    torch.double: 64 / 8,
    torch.float32: 32 / 8,
    torch.float: 32 / 8,
    torch.float16: 16 / 8,
    torch.half: 16 / 8,
    torch.int64: 64 / 8,
    torch.long: 64 / 8,
    torch.int32: 32 / 8,
    torch.int: 32 / 8,
    torch.int16: 16 / 8,
    # 16 bits / 8 bits-per-byte; a wrong divisor here would also corrupt the
    # torch.int16 entry because both names refer to the same dtype.
    torch.short: 16 / 8,
    torch.uint8: 8 / 8,
    torch.int8: 8 / 8,
}
# compatibility of torch1.0
if getattr(torch, "bfloat16", None) is not None:
    dtype_memory_size_dict[torch.bfloat16] = 16 / 8
if getattr(torch, "bool", None) is not None:
    dtype_memory_size_dict[torch.bool] = (
        8 / 8
    )  # pytorch use 1 byte for a bool, see https://github.com/pytorch/pytorch/issues/41571
|
|
||||||
|
|
||||||
|
|
||||||
def get_mem_space(x):
    """Return the number of bytes used by one element of torch dtype ``x``.

    The value is looked up in ``dtype_memory_size_dict``. When the dtype is
    not in that table, the size is obtained from torch itself by asking an
    empty scalar tensor of that dtype for its element size. If that is not
    possible either, the "not supported" message is printed and 0 is returned
    so that callers summing up memory usage keep working instead of crashing
    on an unbound local variable.

    Args:
        x: a ``torch.dtype``.

    Returns:
        Bytes per element (float from the table, int from the torch fallback),
        or 0 when the size cannot be determined.
    """
    try:
        return dtype_memory_size_dict[x]
    except KeyError:
        pass

    try:
        # Fallback for dtypes missing from the table (e.g. complex types).
        return torch.empty((), dtype=x).element_size()
    except (TypeError, RuntimeError):
        print(f"dtype {x} is not supported!")
        return 0
|
|
||||||
|
|
||||||
|
|
||||||
class MemTracker(object):
    """
    Class used to track pytorch memory usage
    Arguments:
        detail(bool, default True): whether the function shows the detail gpu memory usage
        path(str): where to save log file
        verbose(bool, default False): whether show the trivial exception
        device(int): GPU number, default is 0
    """

    def __init__(self, detail=True, path="", verbose=False, device=0):
        self.print_detail = detail
        # Tensor summary tuples seen on the previous track() call; used to diff.
        self.last_tensor_sizes = set()
        # `path` is used as a plain string prefix, so it must end with a
        # separator if it is meant to be a directory.
        self.gpu_profile_fn = path + f"{datetime.datetime.now():%d-%b-%y-%H:%M:%S}-gpu_mem_track.txt"
        self.verbose = verbose
        # True until the first track() call has written the log header.
        self.begin = True
        # NOTE(review): stored but not read by any method of this class.
        self.device = device

    def get_tensors(self):
        """Yield every object known to the garbage collector that is a CUDA tensor.

        Objects that are tensors, or that expose a tensor through ``.data``,
        are considered. Any exception raised while inspecting an object is
        swallowed (and printed when ``verbose`` is set) so that one odd object
        cannot abort the scan.
        """
        for obj in gc.get_objects():
            try:
                if torch.is_tensor(obj) or (hasattr(obj, "data") and torch.is_tensor(obj.data)):
                    tensor = obj
                else:
                    continue
                if tensor.is_cuda:
                    yield tensor
            except Exception as e:
                if self.verbose:
                    print("A trivial exception occured: {}".format(e))

    def get_tensor_usage(self):
        """Return the summed size of all tracked CUDA tensors, in MiB."""
        sizes = [np.prod(np.array(tensor.size())) * get_mem_space(tensor.dtype) for tensor in self.get_tensors()]
        return np.sum(sizes) / 1024**2

    def get_allocate_usage(self):
        """Return ``torch.cuda.memory_allocated()`` converted to MiB."""
        # NOTE(review): called without a device argument, so self.device is
        # not taken into account here -- confirm whether that is intended.
        return torch.cuda.memory_allocated() / 1024**2

    def clear_cache(self):
        """Run the garbage collector, then release cached CUDA memory."""
        gc.collect()
        torch.cuda.empty_cache()

    def print_all_gpu_tensor(self, file=None):
        """Print size, dtype and memory (MiB) of every tracked CUDA tensor.

        Args:
            file: forwarded to ``print``; ``None`` means standard output.
        """
        for x in self.get_tensors():
            print(x.size(), x.dtype, np.prod(np.array(x.size())) * get_mem_space(x.dtype) / 1024**2, file=file)

    def track(self):
        """
        Track the GPU memory usage

        Appends to the log file: a header on the first call, then (when
        ``detail`` is True) one "+" line per tensor group that appeared and one
        "-" line per tensor group that disappeared since the previous call,
        and finally a summary line tagged with the caller's file, line number
        and function name.
        """
        from collections import Counter

        # Frame 1 is the code that called track().
        frameinfo = inspect.stack()[1]
        where_str = frameinfo.filename + " line " + str(frameinfo.lineno) + ": " + frameinfo.function

        with open(self.gpu_profile_fn, "a+") as f:
            if self.begin:
                f.write(
                    f"GPU Memory Track | {datetime.datetime.now():%d-%b-%y-%H:%M:%S} |"
                    f" Total Tensor Used Memory:{self.get_tensor_usage():<7.1f}Mb"
                    f" Total Allocated Memory:{self.get_allocate_usage():<7.1f}Mb\n\n"
                )
                self.begin = False

            if self.print_detail is True:
                # Count tensors per (size, dtype) once up front instead of
                # re-counting a list for every tensor, which was quadratic.
                shape_counts = Counter((tensor.size(), tensor.dtype) for tensor in self.get_tensors())
                new_tensor_sizes = {
                    (
                        type(x),
                        tuple(x.size()),
                        shape_counts[(x.size(), x.dtype)],
                        np.prod(np.array(x.size())) * get_mem_space(x.dtype) / 1024**2,
                        x.dtype,
                    )
                    for x in self.get_tensors()
                }
                for t, s, n, m, data_type in new_tensor_sizes - self.last_tensor_sizes:
                    f.write(
                        f"+ | {str(n)} * Size:{str(s):<20} | Memory: {str(m*n)[:6]} M | {str(t):<20} | {data_type}\n"
                    )
                for t, s, n, m, data_type in self.last_tensor_sizes - new_tensor_sizes:
                    f.write(
                        f"- | {str(n)} * Size:{str(s):<20} | Memory: {str(m*n)[:6]} M | {str(t):<20} | {data_type}\n"
                    )

                self.last_tensor_sizes = new_tensor_sizes

            f.write(
                f"\nAt {where_str:<50}"
                f" Total Tensor Used Memory:{self.get_tensor_usage():<7.1f}Mb"
                f" Total Allocated Memory:{self.get_allocate_usage():<7.1f}Mb\n\n"
            )
|
|
||||||
|
|
||||||
|
|
||||||
def ModelSize(model, input, type_size=4):
    """Print an estimate of the memory taken by a model's parameters and activations.

    The parameter estimate is the total element count of ``model.parameters()``
    times ``type_size``. The activation estimate is obtained by pushing a copy
    of ``input`` through every entry of ``model.modules()`` except the first
    one, one after another, and summing the element counts of the outputs;
    in-place ``nn.ReLU`` modules are skipped.

    NOTE(review): chaining modules in ``model.modules()`` order presumably only
    makes sense for purely sequential models -- confirm against callers.

    Args:
        model: module to measure.
        input: tensor fed to the first submodule (it is cloned, not modified).
        type_size: bytes per element used for both estimates.

    Returns:
        None. The three result lines are printed to standard output.
    """
    param_count = sum(np.prod(list(param.size())) for param in model.parameters())
    # print('Model {} : Number of params: {}'.format(model._get_name(), para))
    print("Model {} : params: {:4f}M".format(model._get_name(), param_count * type_size / 1000 / 1000))

    activation = input.clone()
    activation.requires_grad_(requires_grad=False)

    activation_shapes = []
    # Index 0 of modules() is skipped, as in the loop starting at 1.
    for layer in list(model.modules())[1:]:
        if isinstance(layer, nn.ReLU) and layer.inplace:
            # An in-place ReLU overwrites its input, so no new activation is counted.
            continue
        activation = layer(activation)
        activation_shapes.append(np.array(activation.size()))

    activation_count = sum(np.prod(np.array(shape)) for shape in activation_shapes)

    # print('Model {} : Number of intermedite variables without backward: {}'.format(model._get_name(), total_nums))
    # print('Model {} : Number of intermedite variables with backward: {}'.format(model._get_name(), total_nums*2))
    forward_message = "Model {} : intermedite variables: {:3f} M (without backward)".format(
        model._get_name(), activation_count * type_size / 1000 / 1000
    )
    print(forward_message)
    backward_message = "Model {} : intermedite variables: {:3f} M (with backward)".format(
        model._get_name(), activation_count * type_size * 2 / 1000 / 1000
    )
    print(backward_message)
|
|
|
@ -1,6 +1,5 @@
|
||||||
import show
|
import show
|
||||||
import torch
|
import torch
|
||||||
import mem_tracker
|
|
||||||
|
|
||||||
|
|
||||||
# radata = torch.randn(8192, 128)
|
# radata = torch.randn(8192, 128)
|
||||||
|
@ -15,6 +14,7 @@ radata = torch.randn(3, 127, 127)
|
||||||
show.DumpTensorToImage(radata, "test.png")
|
show.DumpTensorToImage(radata, "test.png")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
radata = torch.randn(127, 127)
|
radata = torch.randn(127, 127)
|
||||||
show.DumpTensorToLog(radata, "test.log")
|
show.DumpTensorToLog(radata, "test.log")
|
||||||
|
|
||||||
|
@ -22,7 +22,3 @@ show.DumpTensorToLog(radata, "test.log")
|
||||||
radata = torch.randn(127, 127) - 0.5
|
radata = torch.randn(127, 127) - 0.5
|
||||||
show.ProbGE0(radata)
|
show.ProbGE0(radata)
|
||||||
show.DumpProb()
|
show.DumpProb()
|
||||||
|
|
||||||
radata = torch.randn(127, 127).cuda()
|
|
||||||
tracker = mem_tracker.MemTracker()
|
|
||||||
tracker.track()
|
|
||||||
|
|
Loading…
Reference in New Issue