Delete unused files.
commit aa2d3b96c4 (parent 82ac3e4863)
@@ -77,8 +77,6 @@ class QWenAttention(nn.Module):
             config.hidden_size, self.projection_size, bias=not config.no_bias
         )
 
-        self.is_fp32 = True
-
         self.use_dynamic_ntk = config.use_dynamic_ntk
         self.use_logn_attn = config.use_logn_attn
 
@@ -301,29 +299,6 @@ class QWenPreTrainedModel(PreTrainedModel):
     def __init__(self, *inputs, **kwargs):
         super().__init__(*inputs, **kwargs)
 
-    def _init_weights(self, module):
-        """Initialize the weights."""
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-        elif isinstance(module, RMSNorm):
-            module.weight.data.fill_(1.0)
-
-        for name, p in module.named_parameters():
-            if name == "c_proj.weight":
-                p.data.normal_(
-                    mean=0.0,
-                    std=(
-                        self.config.initializer_range
-                        / math.sqrt(2 * self.config.num_hidden_layers)
-                    ),
-                )
-
 class QWenModel(QWenPreTrainedModel):
 
     def __init__(self, config):
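
With `_init_weights` deleted, `QWenPreTrainedModel` falls back to the default module initializers instead of the scaled scheme removed above. A minimal sketch of that removed scheme applied to plain torch.nn modules; the `initializer_range` and layer count are illustrative values, not taken from this commit:

import math
from torch import nn

initializer_range = 0.02   # assumed; the real value comes from the model config
num_hidden_layers = 32     # assumed layer count

linear = nn.Linear(64, 64)
embedding = nn.Embedding(100, 64, padding_idx=0)

# The deleted _init_weights drew Linear/Embedding weights from N(0, initializer_range)
linear.weight.data.normal_(mean=0.0, std=initializer_range)
if linear.bias is not None:
    linear.bias.data.zero_()
embedding.weight.data.normal_(mean=0.0, std=initializer_range)
if embedding.padding_idx is not None:
    embedding.weight.data[embedding.padding_idx].zero_()

# ...and rescaled every "c_proj.weight" by 1/sqrt(2 * num_hidden_layers)
c_proj_std = initializer_range / math.sqrt(2 * num_hidden_layers)
linear.weight.data.normal_(mean=0.0, std=c_proj_std)  # stand-in for a block's c_proj
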
@@ -347,8 +322,6 @@ class QWenModel(QWenPreTrainedModel):
         dim = self.rotary_ndims if self.rotary_ndims is not None else config.kv_channels
         self.rotary_emb = RotaryEmbedding(dim, base=config.rotary_emb_base)
 
-        self.is_fp32 = True
-
         self.h = nn.ModuleList(
             [QWenBlock(config) for i in range(config.num_hidden_layers)]
         )
@@ -375,19 +348,13 @@ class QWenModel(QWenPreTrainedModel):
         encoder_hidden_states: Optional[torch.Tensor] = None,
         encoder_attention_mask: Optional[torch.FloatTensor] = None,
         use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None
+        output_attentions: Optional[bool] = None
     ):
         output_attentions = (
             output_attentions
             if output_attentions is not None
             else self.config.output_attentions
         )
-        output_hidden_states = (
-            output_hidden_states
-            if output_hidden_states is not None
-            else self.config.output_hidden_states
-        )
         use_cache = use_cache if use_cache is not None else self.config.use_cache
 
         if input_ids is not None and inputs_embeds is not None:
@@ -408,8 +375,6 @@ class QWenModel(QWenPreTrainedModel):
             past_key_values = tuple([None] * len(self.h))
 
         if attention_mask is not None:
-            if batch_size <= 0:
-                raise ValueError("batch_size has to be defined and > 0")
             attention_mask = attention_mask.view(batch_size, -1)
             attention_mask = attention_mask[:, None, None, :]
             attention_mask = attention_mask.to(dtype=self.dtype)
@@ -458,10 +423,8 @@ class QWenModel(QWenPreTrainedModel):
 
         presents = () if use_cache else None
         all_self_attentions = () if output_attentions else None
-        all_hidden_states = () if output_hidden_states else None
+        all_hidden_states = None
         for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
-            if output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states,)
 
             outputs = block(
                 hidden_states,
@@ -486,10 +449,6 @@ class QWenModel(QWenPreTrainedModel):
 
         hidden_states = self.ln_f(hidden_states)
         hidden_states = hidden_states.view(output_shape)
-        # Add last hidden state
-        if output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
         return BaseModelOutputWithPast(
             last_hidden_state=hidden_states,
             past_key_values=presents,
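
Together with the two hunks above, this strips the per-layer hidden-state collection entirely: `QWenModel.forward` no longer accepts `output_hidden_states`, `all_hidden_states` stays `None`, and the `hidden_states` field of the returned `BaseModelOutputWithPast` is never populated. A toy sketch of the collection pattern that was dropped, using `nn.Identity` stand-ins rather than the real `QWenBlock`:

import torch
from torch import nn

blocks = nn.ModuleList([nn.Identity() for _ in range(3)])   # stand-ins for QWenBlock
hidden_states = torch.randn(1, 4, 8)                        # (batch, seq_len, hidden)

all_hidden_states = ()                  # old code: () if output_hidden_states else None
for block in blocks:
    all_hidden_states = all_hidden_states + (hidden_states,)
    hidden_states = block(hidden_states)
all_hidden_states = all_hidden_states + (hidden_states,)    # old code: "Add last hidden state"

print(len(all_hidden_states))   # num_layers + 1 entries; this tuple is no longer returned
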
@@ -543,8 +502,7 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         encoder_attention_mask: Optional[torch.FloatTensor] = None,
         labels: Optional[torch.LongTensor] = None,
         use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None
     ) -> Union[Tuple, CausalLMOutputWithPast]:
 
 
@@ -557,8 +515,7 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             encoder_hidden_states=encoder_hidden_states,
             encoder_attention_mask=encoder_attention_mask,
             use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states
+            output_attentions=output_attentions
         )
         hidden_states = transformer_outputs[0]
 
@@ -727,28 +684,6 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
         self._validate_model_class()
 
-        # priority: `generation_config` argument > `model.generation_config` (the default generation config)
-        if generation_config is None:
-            # legacy: users may modify the model configuration to control generation. To trigger this legacy behavior,
-            # two conditions must be met
-            # 1) the generation config must have been created from the model config (`_from_model_config` field);
-            # 2) the generation config must have seen no modification since its creation (the hash is the same).
-            if (
-                self.generation_config._from_model_config
-                and self.generation_config._original_object_hash
-                == hash(self.generation_config)
-            ):
-                new_generation_config = GenerationConfig.from_model_config(self.config)
-                if new_generation_config != self.generation_config:
-                    warnings.warn(
-                        "You have modified the pretrained model configuration to control generation. This is a"
-                        " deprecated strategy to control generation and will be removed soon, in a future version."
-                        " Please use and modify the model generation configuration (see"
-                        " https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )"
-                    )
-                    self.generation_config = new_generation_config
-            generation_config = self.generation_config
-
         generation_config = copy.deepcopy(generation_config)
         model_kwargs = generation_config.update(
             **kwargs
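
As far as this hunk shows, the legacy fallback that rebuilt a `GenerationConfig` from the model config is gone, so callers of this fork's `generate` would supply an explicit `generation_config` rather than relying on `self.generation_config` being patched in here. A minimal sketch of constructing one explicitly; the parameter values are illustrative, and `model`/`input_ids` are assumed to exist elsewhere:

from transformers import GenerationConfig

# Illustrative settings only; they are not taken from this commit.
gen_config = GenerationConfig(
    max_new_tokens=128,
    do_sample=True,
    top_p=0.8,
    temperature=0.7,
)

# model.generate(input_ids, generation_config=gen_config)  # `model` and `input_ids` assumed
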
@@ -791,11 +726,8 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs(
             inputs, generation_config.bos_token_id, model_kwargs
         )
-        batch_size = inputs_tensor.shape[0]
-
         # 4. Define other model kwargs
         model_kwargs["output_attentions"] = generation_config.output_attentions
-        model_kwargs["output_hidden_states"] = generation_config.output_hidden_states
         # decoder-only models with inputs_embeds forwarding must use caching (otherwise we can't detect whether we are
         # generating the first new token or not, and we only want to use the embeddings for the first new token)
         if not self.config.is_encoder_decoder and model_input_name == "inputs_embeds":
@@ -822,7 +754,6 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         )
 
         # 5. Prepare `input_ids` which will be used for auto-regressive generation
-
         input_ids = (
             inputs_tensor
             if model_input_name == "input_ids"
@@ -838,14 +769,6 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             kwargs.get("max_length") is None
             and generation_config.max_length is not None
         )
-        if generation_config.max_new_tokens is not None:
-            if not has_default_max_length and generation_config.max_length is not None:
-                logger.warning(
-                    f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
-                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
-                    "Please refer to the documentation for more information. "
-                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
-                )
         generation_config.max_length = (
             generation_config.max_new_tokens + input_ids_length
         )
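
After this hunk the conflict warning is gone and, as far as the remaining context shows, `generation_config.max_length` is simply recomputed from `max_new_tokens` plus the prompt length. The arithmetic itself is just:

# Assumed illustrative values; the real ones come from the prompt and the GenerationConfig.
input_ids_length = 12
max_new_tokens = 128

max_length = max_new_tokens + input_ids_length
print(max_length)   # 140 total tokens, the budget the length-based stopping criteria see
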
@@ -853,25 +776,6 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             generation_config, input_ids_length, has_default_max_length
         )
 
-        # 7. determine generation mode
-        generation_mode = self._get_generation_mode(generation_config, assistant_model)
-
-        if streamer is not None and (generation_config.num_beams > 1):
-            raise ValueError(
-                "`streamer` cannot be used with beam search (yet!). Make sure that `num_beams` is set to 1."
-            )
-
-        if self.device.type != input_ids.device.type:
-            warnings.warn(
-                "You are calling .generate() with the `input_ids` being on a device type different"
-                f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model"
-                f" is on {self.device.type}. You may experience unexpected behaviors or slower generation."
-                " Please make sure that you have put `input_ids` to the"
-                f" correct device by calling for example input_ids = input_ids.to('{self.device.type}') before"
-                " running `.generate()`.",
-                UserWarning,
-            )
-
         # 8. prepare distribution pre_processing samplers
         logits_processor = self._get_logits_processor(
             generation_config=generation_config,
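
Along with the generation-mode and streamer checks, the device-mismatch warning is removed, so keeping `input_ids` on the model's device is now silently the caller's job. A stand-in sketch of the explicit move; the real objects would be the QWen model and the tokenized prompt:

import torch
from torch import nn

model = nn.Linear(8, 8)                       # stand-in module, not the real QWenLMHeadModel
input_ids = torch.randint(0, 100, (1, 16))    # stand-in prompt tensor

# With the warning gone, move inputs to the model's device explicitly before generating.
input_ids = input_ids.to(next(model.parameters()).device)
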
@@ -925,7 +829,6 @@ class QWenLMHeadModel(QWenPreTrainedModel):
         pad_token_id: Optional[int] = None,
         eos_token_id: Optional[Union[int, List[int]]] = None,
         output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
         output_scores: Optional[bool] = None,
         synced_gpus: bool = False,
         streamer: Optional["BaseStreamer"] = None,
@ -940,13 +843,7 @@ class QWenLMHeadModel(QWenPreTrainedModel):
|
||||||
if stopping_criteria is not None
|
if stopping_criteria is not None
|
||||||
else StoppingCriteriaList()
|
else StoppingCriteriaList()
|
||||||
)
|
)
|
||||||
# if max_length is not None:
|
|
||||||
# warnings.warn(
|
|
||||||
# "`max_length` is deprecated in this function, use"
|
|
||||||
# " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.",
|
|
||||||
# UserWarning,
|
|
||||||
# )
|
|
||||||
# stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
|
|
||||||
logits_warper = (
|
logits_warper = (
|
||||||
logits_warper if logits_warper is not None else LogitsProcessorList()
|
logits_warper if logits_warper is not None else LogitsProcessorList()
|
||||||
)
|
)
|
||||||
|
@@ -977,11 +874,6 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             if output_attentions is not None
             else self.generation_config.output_attentions
         )
-        output_hidden_states = (
-            output_hidden_states
-            if output_hidden_states is not None
-            else self.generation_config.output_hidden_states
-        )
 
         # init attention / hidden states / scores tuples
         scores = None
@@ -1000,8 +892,7 @@ class QWenLMHeadModel(QWenPreTrainedModel):
             # forward pass to get next token
             outputs = self(
                 **model_inputs,
-                output_attentions=output_attentions,
-                output_hidden_states=output_hidden_states,
+                output_attentions=output_attentions
            )
 
             next_token_logits = outputs.logits[:, -1, :]
@@ -1064,9 +955,6 @@ class RotaryEmbedding(torch.nn.Module):
         self.base = base
         inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        if importlib.util.find_spec("einops") is None:
-            raise RuntimeError("einops is required for Rotary Embedding")
-
         self._rotary_pos_emb_cache = None
         self._seq_len_cached = 0
         self._ntk_alpha_cached = 1.0
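
The hard requirement check for einops is dropped from `RotaryEmbedding.__init__`; if any remaining code path still imports einops, a missing install would now surface later as an ImportError rather than this explicit RuntimeError at construction time. The removed guard pattern, as a standalone sketch:

import importlib.util

# Same optional-dependency check that was deleted above.
if importlib.util.find_spec("einops") is None:
    raise RuntimeError("einops is required for Rotary Embedding")
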
@@ -1110,14 +998,7 @@ def _rotate_half(x):
 
 
 def apply_rotary_pos_emb(t, freqs):
-    """Apply rotary embedding to the first rotary_dim of the iput
-
-    Arguments:
-      t (tensor(batch_size, seq_len, n_head, head_dim)):
-        the input embedding/hidden states
-      freqs (list[tensor(1, seq_len, 1, rotary_dim), tensor(1, seq_len, 1, rotary_dim)]):
-        the cached cos/sin position embeddings
-    """
     rot_dim = freqs[0].shape[-1]
     cos, sin = freqs
     t_float = t.float()
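
The deleted docstring was the only place the tensor shapes for `apply_rotary_pos_emb` were written down: `t` is `(batch_size, seq_len, n_head, head_dim)` and `freqs` is a cos/sin pair, each shaped `(1, seq_len, 1, rotary_dim)`, with only the first `rotary_dim` channels rotated. A self-contained sketch of that shape contract, using a generic rotate-half formulation and a placeholder angle grid rather than this repository's exact einops-based implementation:

import torch

def rotate_half(x):
    # split the last dimension in two and rotate: (x1, x2) -> (-x2, x1)
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

batch_size, seq_len, n_head, head_dim, rot_dim = 2, 5, 4, 16, 8
t = torch.randn(batch_size, seq_len, n_head, head_dim)

# cos/sin caches shaped (1, seq_len, 1, rot_dim), matching the deleted docstring;
# the angles here are placeholders, not the real inv_freq-based cache.
angles = torch.arange(seq_len).float()[None, :, None, None] * torch.ones(1, 1, 1, rot_dim)
cos, sin = angles.cos(), angles.sin()

t_rot, t_pass = t[..., :rot_dim], t[..., rot_dim:]   # rotate only the first rot_dim channels
t_rot = t_rot * cos + rotate_half(t_rot) * sin
out = torch.cat((t_rot, t_pass), dim=-1)
print(out.shape)   # torch.Size([2, 5, 4, 16])
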