Update Readme.md
This commit is contained in:
parent dff2b9231f
commit d9b64e4025

 Readme.md | 10
@@ -3,6 +3,8 @@

## data flow

```
query

tokenizer -> input_ids

@@ -47,9 +49,11 @@ for:
input_ids = torch.cat([input_ids, next_tokens]) -> [1, 7]  (1: batch_num)

response = tokenizer.decode(outputs)
```
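
The flow above only tracks shapes. Below is a minimal greedy-decoding sketch of the same tokenize → forward → argmax → cat → decode loop; the `gpt2` checkpoint and the 32-token cap are stand-ins for illustration, not details from this repo.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# "gpt2" is a stand-in checkpoint so the sketch runs anywhere; substitute the
# model this README actually traces (hidden size 4096, etc.).
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

query = "hello"
input_ids = tokenizer(query, return_tensors="pt").input_ids       # [1, seq_len]

with torch.no_grad():
    for _ in range(32):                                           # assumed cap on new tokens
        logits = model(input_ids=input_ids).logits                # [1, seq_len, vocab]
        next_tokens = logits[:, -1, :].argmax(-1, keepdim=True)   # [1, 1], greedy pick
        input_ids = torch.cat([input_ids, next_tokens], dim=-1)   # e.g. [1, 7]; dim 0 is batch_num
        if next_tokens.item() == tokenizer.eos_token_id:
            break

response = tokenizer.decode(input_ids[0], skip_special_tokens=True)
print(response)
```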

## RMSNorm

```
hidden_states -> [6, 1, 4096]
        /      \
       |        pow(2) -> [6, 1, 4096]

@@ -68,9 +72,11 @@ variance = hidden_states.pow(2).mean(-1, keepdim=True) -> [6, 1, 1]
hidden_states = hidden_states * torch.rsqrt(variance + self.eps)   # rsqrt = reciprocal of the square root

self.weight -> [4096]

return (self.weight * hidden_states) -> [6, 1, 4096]
```
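
Read as code, this is root-mean-square normalization followed by a learned per-channel scale. A minimal sketch of the computation traced above (the eps default is an assumption; the real implementation may also cast to float32 before reducing):

```python
import torch
from torch import nn

class RMSNorm(nn.Module):
    # Minimal sketch of the normalization traced above; eps is an assumed default.
    def __init__(self, hidden_size: int = 4096, eps: float = 1e-5):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))        # [4096]
        self.eps = eps

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # [6, 1, 4096] -> mean of squares over the last dim -> [6, 1, 1]
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        # multiply by the reciprocal square root, i.e. divide by the RMS
        hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
        # learned per-channel scale, broadcast back to [6, 1, 4096]
        return self.weight * hidden_states

x = torch.randn(6, 1, 4096)
print(RMSNorm()(x).shape)   # torch.Size([6, 1, 4096])
```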

## MLP

```
hidden_states -> [6, 1, 4096]

Linear -> [6, 1, 27392]
        /      \

@@ -86,9 +92,10 @@ return (self.weight * hidden_states) -> [6, 1, 4096]
Linear(hidden_states) no bias -> [6, 1, 27392]

silu(x) = [6, 1, 13696] * sigmoid([6, 1, 13696])

Linear(intermediate_parallel) no bias -> [6, 1, 4096]
```
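
The 27392-wide projection is two 13696-wide halves packed into one matrix: one half is passed through SiLU and gates the other, then a second projection maps back to 4096. A sketch under that reading (layer names are illustrative, not taken from the repo):

```python
import torch
from torch import nn
import torch.nn.functional as F

class MLP(nn.Module):
    # Gated (SwiGLU-style) feed-forward matching the shapes traced above.
    def __init__(self, hidden_size: int = 4096, ffn_hidden_size: int = 13696):
        super().__init__()
        # one fused projection producing gate and value: 4096 -> 2 * 13696 = 27392
        self.dense_h_to_4h = nn.Linear(hidden_size, 2 * ffn_hidden_size, bias=False)
        # projection back down: 13696 -> 4096
        self.dense_4h_to_h = nn.Linear(ffn_hidden_size, hidden_size, bias=False)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        intermediate_parallel = self.dense_h_to_4h(hidden_states)    # [6, 1, 27392]
        gate, value = intermediate_parallel.chunk(2, dim=-1)         # 2 x [6, 1, 13696]
        intermediate_parallel = F.silu(gate) * value                 # silu(x) = x * sigmoid(x)
        return self.dense_4h_to_h(intermediate_parallel)             # [6, 1, 4096]

x = torch.randn(6, 1, 4096)
print(MLP()(x).shape)   # torch.Size([6, 1, 4096])
```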

## self_attention

```
x -> [6, 1, 4096]

Linear -> [6, 1, 4608]

@@ -134,6 +141,7 @@ context_layer = scaled_dot_product_attention(query_layer, key_layer, value_layer
context_layer = context_layer.permute(2, 0, 1, 3).reshape() -> [6, 1, 4096]

return Linear(context_layer) -> [6, 1, 4096]
```
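
Shape-wise, 4608 = 32 query heads x 128 + 2 key heads x 128 + 2 value heads x 128, i.e. grouped/multi-query attention over a seq-first [6, 1, 4096] input. A sketch of that split (the head counts are inferred from the 4608 width and are an assumption; rotary position embeddings and the KV cache are omitted):

```python
import torch
from torch import nn
import torch.nn.functional as F

class SelfAttention(nn.Module):
    # Grouped-query attention sketch matching the [6, 1, 4096] -> [6, 1, 4608] trace.
    # Head counts are inferred from the shapes (32*128 + 2*128 + 2*128 = 4608).
    def __init__(self, hidden_size: int = 4096, num_heads: int = 32, num_kv_heads: int = 2):
        super().__init__()
        self.num_heads, self.num_kv_heads = num_heads, num_kv_heads
        self.head_dim = hidden_size // num_heads                        # 128
        qkv_size = hidden_size + 2 * num_kv_heads * self.head_dim       # 4608
        self.query_key_value = nn.Linear(hidden_size, qkv_size)         # fused QKV projection
        self.dense = nn.Linear(hidden_size, hidden_size)                # output projection

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        s, b, _ = x.shape                                                # [6, 1, 4096], seq-first
        qkv = self.query_key_value(x)                                    # [6, 1, 4608]
        q, k, v = qkv.split(
            [self.num_heads * self.head_dim,
             self.num_kv_heads * self.head_dim,
             self.num_kv_heads * self.head_dim], dim=-1)
        # [s, b, h*d] -> [b, h, s, d]
        q = q.view(s, b, self.num_heads, self.head_dim).permute(1, 2, 0, 3)
        k = k.view(s, b, self.num_kv_heads, self.head_dim).permute(1, 2, 0, 3)
        v = v.view(s, b, self.num_kv_heads, self.head_dim).permute(1, 2, 0, 3)
        # expand the 2 KV heads so each of the 32 query heads has a partner
        k = k.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1)
        v = v.repeat_interleave(self.num_heads // self.num_kv_heads, dim=1)
        context_layer = F.scaled_dot_product_attention(q, k, v, is_causal=True)  # [1, 32, 6, 128]
        # [b, h, s, d] -> [s, b, h*d] = [6, 1, 4096]
        context_layer = context_layer.permute(2, 0, 1, 3).reshape(s, b, -1)
        return self.dense(context_layer)                                 # [6, 1, 4096]

x = torch.randn(6, 1, 4096)
print(SelfAttention()(x).shape)   # torch.Size([6, 1, 4096])
```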

## GLMBlock