mirror of
https://github.com/HackTricks-wiki/hacktricks.git
synced 2025-10-10 18:36:50 +00:00
672 lines
27 KiB
Markdown
672 lines
27 KiB
Markdown
# 5. LLM Architecture
|
|
|
|
{{#include ../../banners/hacktricks-training.md}}
|
|
|
|
## LLM Architecture
|
|
|
|
> [!TIP]
|
|
> Lengo la awamu hii ya tano ni rahisi sana: **Kuunda usanifu wa LLM kamili**. Panga kila kitu pamoja, tumia tabaka zote na uunde kazi zote za kuzalisha maandiko au kubadilisha maandiko kuwa IDs na kinyume chake.
|
|
>
|
|
> Usanifu huu utatumika kwa mafunzo na kutabiri maandiko baada ya kufundishwa.
|
|
|
|
Mfano wa usanifu wa LLM kutoka [https://github.com/rasbt/LLMs-from-scratch/blob/main/ch04/01_main-chapter-code/ch04.ipynb](https://github.com/rasbt/LLMs-from-scratch/blob/main/ch04/01_main-chapter-code/ch04.ipynb):
|
|
|
|
Mwakilishi wa kiwango cha juu unaweza kuonekana katika:
|
|
|
|
<figure><img src="../../images/image (3) (1) (1) (1).png" alt="" width="563"><figcaption><p><a href="https://camo.githubusercontent.com/6c8c392f72d5b9e86c94aeb9470beab435b888d24135926f1746eb88e0cc18fb/68747470733a2f2f73656261737469616e72617363686b612e636f6d2f696d616765732f4c4c4d732d66726f6d2d736372617463682d696d616765732f636830345f636f6d707265737365642f31332e776562703f31">https://camo.githubusercontent.com/6c8c392f72d5b9e86c94aeb9470beab435b888d24135926f1746eb88e0cc18fb/68747470733a2f2f73656261737469616e72617363686b612e636f6d2f696d616765732f4c4c4d732d66726f6d2d736372617463682d696d616765732f636830345f636f6d707265737365642f31332e776562703f31</a></p></figcaption></figure>
|
|
|
|
1. **Input (Tokenized Text)**: Mchakato huanza na maandiko yaliyotolewa token, ambayo yanabadilishwa kuwa uwakilishi wa nambari.
|
|
2. **Token Embedding and Positional Embedding Layer**: Maandiko yaliyotolewa token yanapita kupitia **token embedding** layer na **positional embedding layer**, ambayo inashika nafasi ya tokens katika mfuatano, muhimu kwa kuelewa mpangilio wa maneno.
|
|
3. **Transformer Blocks**: Mfano una **12 transformer blocks**, kila moja ikiwa na tabaka nyingi. Blocks hizi hurudia mfuatano ufuatao:
|
|
- **Masked Multi-Head Attention**: Inaruhusu mfano kuzingatia sehemu tofauti za maandiko ya ingizo kwa wakati mmoja.
|
|
- **Layer Normalization**: Hatua ya kawaida ili kuimarisha na kuboresha mafunzo.
|
|
- **Feed Forward Layer**: Inawajibika kwa kuchakata habari kutoka kwa tabaka la umakini na kufanya utabiri kuhusu token inayofuata.
|
|
- **Dropout Layers**: Tabaka hizi zinazuia overfitting kwa kuangusha vitengo kwa bahati nasibu wakati wa mafunzo.
|
|
4. **Final Output Layer**: Mfano unatoa **4x50,257-dimensional tensor**, ambapo **50,257** inawakilisha ukubwa wa msamiati. Kila safu katika tensor hii inahusiana na vector ambayo mfano hutumia kutabiri neno linalofuata katika mfuatano.
|
|
5. **Goal**: Lengo ni kuchukua embeddings hizi na kuzibadilisha tena kuwa maandiko. Kwa hakika, safu ya mwisho ya matokeo inatumika kuzalisha neno linalofuata, linalowakilishwa kama "forward" katika mchoro huu.
|
|
|
|
### Code representation
|
|
```python
|
|
import torch
|
|
import torch.nn as nn
|
|
import tiktoken
|
|
|
|
class GELU(nn.Module):
|
|
def __init__(self):
|
|
super().__init__()
|
|
|
|
def forward(self, x):
|
|
return 0.5 * x * (1 + torch.tanh(
|
|
torch.sqrt(torch.tensor(2.0 / torch.pi)) *
|
|
(x + 0.044715 * torch.pow(x, 3))
|
|
))
|
|
|
|
class FeedForward(nn.Module):
|
|
def __init__(self, cfg):
|
|
super().__init__()
|
|
self.layers = nn.Sequential(
|
|
nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
|
|
GELU(),
|
|
nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
|
|
)
|
|
|
|
def forward(self, x):
|
|
return self.layers(x)
|
|
|
|
class MultiHeadAttention(nn.Module):
|
|
def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
|
|
super().__init__()
|
|
assert d_out % num_heads == 0, "d_out must be divisible by num_heads"
|
|
|
|
self.d_out = d_out
|
|
self.num_heads = num_heads
|
|
self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim
|
|
|
|
self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
|
|
self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
|
|
self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
|
|
self.out_proj = nn.Linear(d_out, d_out) # Linear layer to combine head outputs
|
|
self.dropout = nn.Dropout(dropout)
|
|
self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))
|
|
|
|
def forward(self, x):
|
|
b, num_tokens, d_in = x.shape
|
|
|
|
keys = self.W_key(x) # Shape: (b, num_tokens, d_out)
|
|
queries = self.W_query(x)
|
|
values = self.W_value(x)
|
|
|
|
# We implicitly split the matrix by adding a `num_heads` dimension
|
|
# Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
|
|
keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
|
|
values = values.view(b, num_tokens, self.num_heads, self.head_dim)
|
|
queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)
|
|
|
|
# Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
|
|
keys = keys.transpose(1, 2)
|
|
queries = queries.transpose(1, 2)
|
|
values = values.transpose(1, 2)
|
|
|
|
# Compute scaled dot-product attention (aka self-attention) with a causal mask
|
|
attn_scores = queries @ keys.transpose(2, 3) # Dot product for each head
|
|
|
|
# Original mask truncated to the number of tokens and converted to boolean
|
|
mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
|
|
|
|
# Use the mask to fill attention scores
|
|
attn_scores.masked_fill_(mask_bool, -torch.inf)
|
|
|
|
attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
|
|
attn_weights = self.dropout(attn_weights)
|
|
|
|
# Shape: (b, num_tokens, num_heads, head_dim)
|
|
context_vec = (attn_weights @ values).transpose(1, 2)
|
|
|
|
# Combine heads, where self.d_out = self.num_heads * self.head_dim
|
|
context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
|
|
context_vec = self.out_proj(context_vec) # optional projection
|
|
|
|
return context_vec
|
|
|
|
class LayerNorm(nn.Module):
|
|
def __init__(self, emb_dim):
|
|
super().__init__()
|
|
self.eps = 1e-5
|
|
self.scale = nn.Parameter(torch.ones(emb_dim))
|
|
self.shift = nn.Parameter(torch.zeros(emb_dim))
|
|
|
|
def forward(self, x):
|
|
mean = x.mean(dim=-1, keepdim=True)
|
|
var = x.var(dim=-1, keepdim=True, unbiased=False)
|
|
norm_x = (x - mean) / torch.sqrt(var + self.eps)
|
|
return self.scale * norm_x + self.shift
|
|
|
|
class TransformerBlock(nn.Module):
|
|
def __init__(self, cfg):
|
|
super().__init__()
|
|
self.att = MultiHeadAttention(
|
|
d_in=cfg["emb_dim"],
|
|
d_out=cfg["emb_dim"],
|
|
context_length=cfg["context_length"],
|
|
num_heads=cfg["n_heads"],
|
|
dropout=cfg["drop_rate"],
|
|
qkv_bias=cfg["qkv_bias"])
|
|
self.ff = FeedForward(cfg)
|
|
self.norm1 = LayerNorm(cfg["emb_dim"])
|
|
self.norm2 = LayerNorm(cfg["emb_dim"])
|
|
self.drop_shortcut = nn.Dropout(cfg["drop_rate"])
|
|
|
|
def forward(self, x):
|
|
# Shortcut connection for attention block
|
|
shortcut = x
|
|
x = self.norm1(x)
|
|
x = self.att(x) # Shape [batch_size, num_tokens, emb_size]
|
|
x = self.drop_shortcut(x)
|
|
x = x + shortcut # Add the original input back
|
|
|
|
# Shortcut connection for feed forward block
|
|
shortcut = x
|
|
x = self.norm2(x)
|
|
x = self.ff(x)
|
|
x = self.drop_shortcut(x)
|
|
x = x + shortcut # Add the original input back
|
|
|
|
return x
|
|
|
|
|
|
class GPTModel(nn.Module):
|
|
def __init__(self, cfg):
|
|
super().__init__()
|
|
self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
|
|
self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
|
|
self.drop_emb = nn.Dropout(cfg["drop_rate"])
|
|
|
|
self.trf_blocks = nn.Sequential(
|
|
*[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])
|
|
|
|
self.final_norm = LayerNorm(cfg["emb_dim"])
|
|
self.out_head = nn.Linear(
|
|
cfg["emb_dim"], cfg["vocab_size"], bias=False
|
|
)
|
|
|
|
def forward(self, in_idx):
|
|
batch_size, seq_len = in_idx.shape
|
|
tok_embeds = self.tok_emb(in_idx)
|
|
pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
|
|
x = tok_embeds + pos_embeds # Shape [batch_size, num_tokens, emb_size]
|
|
x = self.drop_emb(x)
|
|
x = self.trf_blocks(x)
|
|
x = self.final_norm(x)
|
|
logits = self.out_head(x)
|
|
return logits
|
|
|
|
GPT_CONFIG_124M = {
|
|
"vocab_size": 50257, # Vocabulary size
|
|
"context_length": 1024, # Context length
|
|
"emb_dim": 768, # Embedding dimension
|
|
"n_heads": 12, # Number of attention heads
|
|
"n_layers": 12, # Number of layers
|
|
"drop_rate": 0.1, # Dropout rate
|
|
"qkv_bias": False # Query-Key-Value bias
|
|
}
|
|
|
|
torch.manual_seed(123)
|
|
model = GPTModel(GPT_CONFIG_124M)
|
|
out = model(batch)
|
|
print("Input batch:\n", batch)
|
|
print("\nOutput shape:", out.shape)
|
|
print(out)
|
|
```
|
|
### **Kazi ya Kuamsha GELU**
|
|
```python
|
|
# From https://github.com/rasbt/LLMs-from-scratch/tree/main/ch04
|
|
class GELU(nn.Module):
|
|
def __init__(self):
|
|
super().__init__()
|
|
|
|
def forward(self, x):
|
|
return 0.5 * x * (1 + torch.tanh(
|
|
torch.sqrt(torch.tensor(2.0 / torch.pi)) *
|
|
(x + 0.044715 * torch.pow(x, 3))
|
|
))
|
|
```
|
|
#### **Madhumuni na Ufanisi**
|
|
|
|
- **GELU (Gaussian Error Linear Unit):** Kazi ya kuamsha ambayo inaingiza kutokuwa na mstari ndani ya mfano.
|
|
- **Kuamsha kwa Ufanisi:** Tofauti na ReLU, ambayo inafuta maingizo hasi, GELU inachora kwa laini maingizo kuwa matokeo, ikiruhusu thamani ndogo, zisizo za sifuri kwa maingizo hasi.
|
|
- **Mwelekeo wa Kihesabu:**
|
|
|
|
<figure><img src="../../images/image (2) (1) (1) (1).png" alt=""><figcaption></figcaption></figure>
|
|
|
|
> [!TIP]
|
|
> Lengo la matumizi ya kazi hii baada ya tabaka za mstari ndani ya tabaka la FeedForward ni kubadilisha data za mstari kuwa zisizo za mstari ili kuruhusu mfano kujifunza uhusiano tata, zisizo za mstari.
|
|
|
|
### **Mtandao wa Neva wa FeedForward**
|
|
|
|
_Mifano imeongezwa kama maoni ili kuelewa vyema mifano ya matrices:_
|
|
```python
|
|
# From https://github.com/rasbt/LLMs-from-scratch/tree/main/ch04
|
|
class FeedForward(nn.Module):
|
|
def __init__(self, cfg):
|
|
super().__init__()
|
|
self.layers = nn.Sequential(
|
|
nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
|
|
GELU(),
|
|
nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
|
|
)
|
|
|
|
def forward(self, x):
|
|
# x shape: (batch_size, seq_len, emb_dim)
|
|
|
|
x = self.layers[0](x)# x shape: (batch_size, seq_len, 4 * emb_dim)
|
|
x = self.layers[1](x) # x shape remains: (batch_size, seq_len, 4 * emb_dim)
|
|
x = self.layers[2](x) # x shape: (batch_size, seq_len, emb_dim)
|
|
return x # Output shape: (batch_size, seq_len, emb_dim)
|
|
```
|
|
#### **Madhumuni na Ufanisi**
|
|
|
|
- **Mtandao wa FeedForward Kulingana na Nafasi:** Inatumia mtandao wa viwango viwili vilivyounganishwa kwa kila nafasi tofauti na kwa njia sawa.
|
|
- **Maelezo ya Viwango:**
|
|
- **Kiwango cha Kwanza cha Mstari:** Kinapanua ukubwa kutoka `emb_dim` hadi `4 * emb_dim`.
|
|
- **Kazi ya GELU:** Inatumia kutokuwa na mstari.
|
|
- **Kiwango cha Pili cha Mstari:** Kinapunguza ukubwa kurudi kwenye `emb_dim`.
|
|
|
|
> [!TIP]
|
|
> Kama unavyoona, mtandao wa Feed Forward unatumia viwango 3. Kiwango cha kwanza ni kiwango cha mstari ambacho kitazidisha ukubwa kwa 4 kwa kutumia uzito wa mstari (vigezo vya kufundisha ndani ya mfano). Kisha, kazi ya GELU inatumika katika ukubwa wote ili kuleta mabadiliko yasiyo ya mstari ili kupata uwakilishi mzuri zaidi na hatimaye kiwango kingine cha mstari kinatumika kurudi kwenye ukubwa wa awali wa ukubwa.
|
|
|
|
### **Mekanismu ya Umakini wa Vichwa Vingi**
|
|
|
|
Hii tayari ilielezwa katika sehemu ya awali.
|
|
|
|
#### **Madhumuni na Ufanisi**
|
|
|
|
- **Umakini wa Kujitenga wa Vichwa Vingi:** Inaruhusu mfano kuzingatia nafasi tofauti ndani ya mlolongo wa ingizo wakati wa kuandika token.
|
|
- **Vipengele Muhimu:**
|
|
- **Maswali, Funguo, Thamani:** Mipango ya mstari ya ingizo, inayotumika kuhesabu alama za umakini.
|
|
- **Vichwa:** Mekanismu nyingi za umakini zinazoendesha kwa sambamba (`num_heads`), kila moja ikiwa na ukubwa mdogo (`head_dim`).
|
|
- **Alama za Umakini:** Zinahesabiwa kama bidhaa ya dot ya maswali na funguo, zimepimwa na kufichwa.
|
|
- **Kuficha:** Mask ya sababu inatumika kuzuia mfano kuzingatia token za baadaye (muhimu kwa mifano ya autoregressive kama GPT).
|
|
- **Uzito wa Umakini:** Softmax ya alama za umakini zilizofichwa na kupimwa.
|
|
- **Vector ya Muktadha:** Jumla yenye uzito ya thamani, kulingana na uzito wa umakini.
|
|
- **Mipango ya Matokeo:** Kiwango cha mstari cha kuunganisha matokeo ya vichwa vyote.
|
|
|
|
> [!TIP]
|
|
> Lengo la mtandao huu ni kupata uhusiano kati ya token katika muktadha sawa. Aidha, token zimegawanywa katika vichwa tofauti ili kuzuia overfitting ingawa uhusiano wa mwisho uliopatikana kwa kila kichwa unachanganywa mwishoni mwa mtandao huu.
|
|
>
|
|
> Aidha, wakati wa mafunzo **mask ya sababu** inatumika ili token za baadaye zisihesabiwe wakati wa kutafuta uhusiano maalum kwa token na **dropout** pia inatumika ili **kuzuia overfitting**.
|
|
|
|
### **Kiwango** Kurekebisha
|
|
```python
|
|
# From https://github.com/rasbt/LLMs-from-scratch/tree/main/ch04
|
|
class LayerNorm(nn.Module):
|
|
def __init__(self, emb_dim):
|
|
super().__init__()
|
|
self.eps = 1e-5 # Prevent division by zero during normalization.
|
|
self.scale = nn.Parameter(torch.ones(emb_dim))
|
|
self.shift = nn.Parameter(torch.zeros(emb_dim))
|
|
|
|
def forward(self, x):
|
|
mean = x.mean(dim=-1, keepdim=True)
|
|
var = x.var(dim=-1, keepdim=True, unbiased=False)
|
|
norm_x = (x - mean) / torch.sqrt(var + self.eps)
|
|
return self.scale * norm_x + self.shift
|
|
```
|
|
#### **Madhumuni na Ufanisi**
|
|
|
|
- **Layer Normalization:** Mbinu inayotumika kurekebisha ingizo kati ya vipengele (embedding dimensions) kwa kila mfano binafsi katika kundi.
|
|
- **Vipengele:**
|
|
- **`eps`:** Kiwango kidogo (`1e-5`) kinachoongezwa kwa variance ili kuzuia kugawanya kwa sifuri wakati wa normalization.
|
|
- **`scale` na `shift`:** Vigezo vinavyoweza kujifunza (`nn.Parameter`) vinavyomruhusu mfano kupima na kuhamasisha matokeo yaliyorekebishwa. Vimeanzishwa kuwa moja na sifuri, mtawalia.
|
|
- **Mchakato wa Kurekebisha:**
|
|
- **Hesabu Mean (`mean`):** Hesabu mean ya ingizo `x` kati ya dimension ya embedding (`dim=-1`), ikihifadhi dimension kwa ajili ya broadcasting (`keepdim=True`).
|
|
- **Hesabu Variance (`var`):** Hesabu variance ya `x` kati ya dimension ya embedding, pia ikihifadhi dimension. Paramenta `unbiased=False` inahakikisha kuwa variance inahesabiwa kwa kutumia mhesabu wa biased (kugawanya kwa `N` badala ya `N-1`), ambayo ni sahihi wakati wa kurekebisha juu ya vipengele badala ya sampuli.
|
|
- **Normalize (`norm_x`):** Inapunguza mean kutoka `x` na kugawanya kwa mzizi wa variance pamoja na `eps`.
|
|
- **Scale na Shift:** Inatumia vigezo vinavyoweza kujifunza `scale` na `shift` kwa matokeo yaliyorekebishwa.
|
|
|
|
> [!TIP]
|
|
> Lengo ni kuhakikisha mean ya 0 na variance ya 1 kati ya dimensions zote za token sawa. Lengo la hili ni **kuimarisha mafunzo ya mitandao ya neva ya kina** kwa kupunguza mabadiliko ya ndani ya covariate, ambayo inahusisha mabadiliko katika usambazaji wa uhamasishaji wa mtandao kutokana na kubadilisha vigezo wakati wa mafunzo.
|
|
|
|
### **Transformer Block**
|
|
|
|
_Shapes zimeongezwa kama maoni ili kuelewa vyema shapes za matrices:_
|
|
```python
|
|
# From https://github.com/rasbt/LLMs-from-scratch/tree/main/ch04
|
|
|
|
class TransformerBlock(nn.Module):
|
|
def __init__(self, cfg):
|
|
super().__init__()
|
|
self.att = MultiHeadAttention(
|
|
d_in=cfg["emb_dim"],
|
|
d_out=cfg["emb_dim"],
|
|
context_length=cfg["context_length"],
|
|
num_heads=cfg["n_heads"],
|
|
dropout=cfg["drop_rate"],
|
|
qkv_bias=cfg["qkv_bias"]
|
|
)
|
|
self.ff = FeedForward(cfg)
|
|
self.norm1 = LayerNorm(cfg["emb_dim"])
|
|
self.norm2 = LayerNorm(cfg["emb_dim"])
|
|
self.drop_shortcut = nn.Dropout(cfg["drop_rate"])
|
|
|
|
def forward(self, x):
|
|
# x shape: (batch_size, seq_len, emb_dim)
|
|
|
|
# Shortcut connection for attention block
|
|
shortcut = x # shape: (batch_size, seq_len, emb_dim)
|
|
x = self.norm1(x) # shape remains (batch_size, seq_len, emb_dim)
|
|
x = self.att(x) # shape: (batch_size, seq_len, emb_dim)
|
|
x = self.drop_shortcut(x) # shape remains (batch_size, seq_len, emb_dim)
|
|
x = x + shortcut # shape: (batch_size, seq_len, emb_dim)
|
|
|
|
# Shortcut connection for feedforward block
|
|
shortcut = x # shape: (batch_size, seq_len, emb_dim)
|
|
x = self.norm2(x) # shape remains (batch_size, seq_len, emb_dim)
|
|
x = self.ff(x) # shape: (batch_size, seq_len, emb_dim)
|
|
x = self.drop_shortcut(x) # shape remains (batch_size, seq_len, emb_dim)
|
|
x = x + shortcut # shape: (batch_size, seq_len, emb_dim)
|
|
|
|
return x # Output shape: (batch_size, seq_len, emb_dim)
|
|
|
|
```
|
|
#### **Madhumuni na Ufanisi**
|
|
|
|
- **Muundo wa Tabaka:** Inachanganya umakini wa vichwa vingi, mtandao wa feedforward, urekebishaji wa tabaka, na muunganisho wa ziada.
|
|
- **Urekebishaji wa Tabaka:** Unatumika kabla ya tabaka za umakini na feedforward kwa mafunzo thabiti.
|
|
- **Muunganisho wa Ziada (Njia Fupi):** Ongeza ingizo la tabaka kwa matokeo yake ili kuboresha mtiririko wa gradient na kuwezesha mafunzo ya mitandao mirefu.
|
|
- **Dropout:** Unatumika baada ya tabaka za umakini na feedforward kwa ajili ya urekebishaji.
|
|
|
|
#### **Ufanisi wa Hatua kwa Hatua**
|
|
|
|
1. **Njia ya Kwanza ya Ziada (Umakini wa Kibinafsi):**
|
|
- **Ingizo (`shortcut`):** Hifadhi ingizo la awali kwa muunganisho wa ziada.
|
|
- **Urekebishaji wa Tabaka (`norm1`):** Rekebisha ingizo.
|
|
- **Umakini wa Vichwa Vingi (`att`):** Tumia umakini wa kibinafsi.
|
|
- **Dropout (`drop_shortcut`):** Tumia dropout kwa urekebishaji.
|
|
- **Ongeza Ziada (`x + shortcut`):** Changanya na ingizo la awali.
|
|
2. **Njia ya Pili ya Ziada (FeedForward):**
|
|
- **Ingizo (`shortcut`):** Hifadhi ingizo lililosasishwa kwa muunganisho wa ziada unaofuata.
|
|
- **Urekebishaji wa Tabaka (`norm2`):** Rekebisha ingizo.
|
|
- **Mtandao wa FeedForward (`ff`):** Tumia mabadiliko ya feedforward.
|
|
- **Dropout (`drop_shortcut`):** Tumia dropout.
|
|
- **Ongeza Ziada (`x + shortcut`):** Changanya na ingizo kutoka kwa njia ya kwanza ya ziada.
|
|
|
|
> [!TIP]
|
|
> Block ya transformer inakusanya mitandao yote pamoja na kutumia **urekebishaji** na **dropouts** kuboresha utulivu wa mafunzo na matokeo.\
|
|
> Kumbuka jinsi dropouts zinavyofanywa baada ya matumizi ya kila mtandao wakati urekebishaji unatumika kabla.
|
|
>
|
|
> Zaidi ya hayo, inatumia njia fupi ambazo zinajumuisha **kuongeza matokeo ya mtandao na ingizo lake**. Hii husaidia kuzuia tatizo la gradient inayopotea kwa kuhakikisha kwamba tabaka za mwanzo zinachangia "kiasi" sawa na zile za mwisho.
|
|
|
|
### **GPTModel**
|
|
|
|
_Mifano imeongezwa kama maoni ili kuelewa vyema mifano ya matrices:_
|
|
```python
|
|
# From https://github.com/rasbt/LLMs-from-scratch/tree/main/ch04
|
|
class GPTModel(nn.Module):
|
|
def __init__(self, cfg):
|
|
super().__init__()
|
|
self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
|
|
# shape: (vocab_size, emb_dim)
|
|
|
|
self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
|
|
# shape: (context_length, emb_dim)
|
|
|
|
self.drop_emb = nn.Dropout(cfg["drop_rate"])
|
|
|
|
self.trf_blocks = nn.Sequential(
|
|
*[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
|
|
)
|
|
# Stack of TransformerBlocks
|
|
|
|
self.final_norm = LayerNorm(cfg["emb_dim"])
|
|
self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)
|
|
# shape: (emb_dim, vocab_size)
|
|
|
|
def forward(self, in_idx):
|
|
# in_idx shape: (batch_size, seq_len)
|
|
batch_size, seq_len = in_idx.shape
|
|
|
|
# Token embeddings
|
|
tok_embeds = self.tok_emb(in_idx)
|
|
# shape: (batch_size, seq_len, emb_dim)
|
|
|
|
# Positional embeddings
|
|
pos_indices = torch.arange(seq_len, device=in_idx.device)
|
|
# shape: (seq_len,)
|
|
pos_embeds = self.pos_emb(pos_indices)
|
|
# shape: (seq_len, emb_dim)
|
|
|
|
# Add token and positional embeddings
|
|
x = tok_embeds + pos_embeds # Broadcasting over batch dimension
|
|
# x shape: (batch_size, seq_len, emb_dim)
|
|
|
|
x = self.drop_emb(x) # Dropout applied
|
|
# x shape remains: (batch_size, seq_len, emb_dim)
|
|
|
|
x = self.trf_blocks(x) # Pass through Transformer blocks
|
|
# x shape remains: (batch_size, seq_len, emb_dim)
|
|
|
|
x = self.final_norm(x) # Final LayerNorm
|
|
# x shape remains: (batch_size, seq_len, emb_dim)
|
|
|
|
logits = self.out_head(x) # Project to vocabulary size
|
|
# logits shape: (batch_size, seq_len, vocab_size)
|
|
|
|
return logits # Output shape: (batch_size, seq_len, vocab_size)
|
|
```
|
|
#### **Madhumuni na Ufanisi**
|
|
|
|
- **Embedding Layers:**
|
|
- **Token Embeddings (`tok_emb`):** Hubadilisha viashiria vya token kuwa embeddings. Kama ukumbusho, hizi ni uzito zinazotolewa kwa kila kipimo cha kila token katika msamiati.
|
|
- **Positional Embeddings (`pos_emb`):** Ongeza taarifa za nafasi kwa embeddings ili kukamata mpangilio wa token. Kama ukumbusho, hizi ni uzito zinazotolewa kwa token kulingana na nafasi yake katika maandiko.
|
|
- **Dropout (`drop_emb`):** Inatumika kwa embeddings kwa ajili ya regularisation.
|
|
- **Transformer Blocks (`trf_blocks`):** Kifungu cha `n_layers` transformer blocks ili kushughulikia embeddings.
|
|
- **Final Normalization (`final_norm`):** Kiwango cha normalization kabla ya safu ya matokeo.
|
|
- **Output Layer (`out_head`):** Inatabiri hali za mwisho zilizofichwa kwa ukubwa wa msamiati ili kutoa logits kwa ajili ya utabiri.
|
|
|
|
> [!TIP]
|
|
> Lengo la darasa hili ni kutumia mitandao mingine yote iliyotajwa ili **kutabiri token inayofuata katika mfuatano**, ambayo ni muhimu kwa kazi kama vile uzalishaji wa maandiko.
|
|
>
|
|
> Kumbuka jinsi itakavy **tumia blocks za transformer nyingi kadri zilivyoonyeshwa** na kwamba kila block ya transformer inatumia neti moja ya multi-head attestation, neti moja ya feed forward na normalizations kadhaa. Hivyo kama blocks 12 za transformer zinatumika, ongeza hii kwa 12.
|
|
>
|
|
> Zaidi ya hayo, safu ya **normalization** inaongezwa **kabla** ya **matokeo** na safu ya mwisho ya linear inatumika mwishoni kupata matokeo yenye vipimo sahihi. Kumbuka jinsi kila vector ya mwisho ina ukubwa wa msamiati ulio tumika. Hii ni kwa sababu inajaribu kupata uwezekano kwa kila token inayowezekana ndani ya msamiati.
|
|
|
|
## Idadi ya Vigezo vya kufundisha
|
|
|
|
Baada ya muundo wa GPT kufafanuliwa, inawezekana kubaini idadi ya vigezo vya kufundisha:
|
|
```python
|
|
GPT_CONFIG_124M = {
|
|
"vocab_size": 50257, # Vocabulary size
|
|
"context_length": 1024, # Context length
|
|
"emb_dim": 768, # Embedding dimension
|
|
"n_heads": 12, # Number of attention heads
|
|
"n_layers": 12, # Number of layers
|
|
"drop_rate": 0.1, # Dropout rate
|
|
"qkv_bias": False # Query-Key-Value bias
|
|
}
|
|
|
|
model = GPTModel(GPT_CONFIG_124M)
|
|
total_params = sum(p.numel() for p in model.parameters())
|
|
print(f"Total number of parameters: {total_params:,}")
|
|
# Total number of parameters: 163,009,536
|
|
```
|
|
### **Hatua kwa Hatua Hesabu**
|
|
|
|
#### **1. Tabaka za Kuunganisha: Kuunganisha Tokeni & Kuunganisha Nafasi**
|
|
|
|
- **Tabaka:** `nn.Embedding(vocab_size, emb_dim)`
|
|
- **Vigezo:** `vocab_size * emb_dim`
|
|
```python
|
|
token_embedding_params = 50257 * 768 = 38,597,376
|
|
```
|
|
- **Tabaka:** `nn.Embedding(context_length, emb_dim)`
|
|
- **Vigezo:** `context_length * emb_dim`
|
|
```python
|
|
position_embedding_params = 1024 * 768 = 786,432
|
|
```
|
|
**Jumla ya Vigezo vya Kuunganisha**
|
|
```python
|
|
embedding_params = token_embedding_params + position_embedding_params
|
|
embedding_params = 38,597,376 + 786,432 = 39,383,808
|
|
```
|
|
#### **2. Transformer Blocks**
|
|
|
|
Kuna blocks 12 za transformer, hivyo tutahesabu vigezo kwa block moja kisha kuzidisha kwa 12.
|
|
|
|
**Parameters per Transformer Block**
|
|
|
|
**a. Multi-Head Attention**
|
|
|
|
- **Components:**
|
|
- **Query Linear Layer (`W_query`):** `nn.Linear(emb_dim, emb_dim, bias=False)`
|
|
- **Key Linear Layer (`W_key`):** `nn.Linear(emb_dim, emb_dim, bias=False)`
|
|
- **Value Linear Layer (`W_value`):** `nn.Linear(emb_dim, emb_dim, bias=False)`
|
|
- **Output Projection (`out_proj`):** `nn.Linear(emb_dim, emb_dim)`
|
|
- **Calculations:**
|
|
|
|
- **Kila moja ya `W_query`, `W_key`, `W_value`:**
|
|
|
|
```python
|
|
qkv_params = emb_dim * emb_dim = 768 * 768 = 589,824
|
|
```
|
|
|
|
Kwa kuwa kuna tabaka tatu kama hizo:
|
|
|
|
```python
|
|
total_qkv_params = 3 * qkv_params = 3 * 589,824 = 1,769,472
|
|
```
|
|
|
|
- **Output Projection (`out_proj`):**
|
|
|
|
```python
|
|
out_proj_params = (emb_dim * emb_dim) + emb_dim = (768 * 768) + 768 = 589,824 + 768 = 590,592
|
|
```
|
|
|
|
- **Jumla ya Vigezo vya Multi-Head Attention:**
|
|
|
|
```python
|
|
mha_params = total_qkv_params + out_proj_params
|
|
mha_params = 1,769,472 + 590,592 = 2,360,064
|
|
```
|
|
|
|
**b. FeedForward Network**
|
|
|
|
- **Components:**
|
|
- **First Linear Layer:** `nn.Linear(emb_dim, 4 * emb_dim)`
|
|
- **Second Linear Layer:** `nn.Linear(4 * emb_dim, emb_dim)`
|
|
- **Calculations:**
|
|
|
|
- **First Linear Layer:**
|
|
|
|
```python
|
|
ff_first_layer_params = (emb_dim * 4 * emb_dim) + (4 * emb_dim)
|
|
ff_first_layer_params = (768 * 3072) + 3072 = 2,359,296 + 3,072 = 2,362,368
|
|
```
|
|
|
|
- **Second Linear Layer:**
|
|
|
|
```python
|
|
ff_second_layer_params = (4 * emb_dim * emb_dim) + emb_dim
|
|
ff_second_layer_params = (3072 * 768) + 768 = 2,359,296 + 768 = 2,360,064
|
|
```
|
|
|
|
- **Jumla ya Vigezo vya FeedForward:**
|
|
|
|
```python
|
|
ff_params = ff_first_layer_params + ff_second_layer_params
|
|
ff_params = 2,362,368 + 2,360,064 = 4,722,432
|
|
```
|
|
|
|
**c. Layer Normalizations**
|
|
|
|
- **Components:**
|
|
- Instances mbili za `LayerNorm` kwa block.
|
|
- Kila `LayerNorm` ina vigezo `2 * emb_dim` (scale na shift).
|
|
- **Calculations:**
|
|
|
|
```python
|
|
pythonCopy codelayer_norm_params_per_block = 2 * (2 * emb_dim) = 2 * 768 * 2 = 3,072
|
|
```
|
|
|
|
**d. Jumla ya Vigezo kwa Transformer Block**
|
|
```python
|
|
pythonCopy codeparams_per_block = mha_params + ff_params + layer_norm_params_per_block
|
|
params_per_block = 2,360,064 + 4,722,432 + 3,072 = 7,085,568
|
|
```
|
|
**Jumla ya Vigezo kwa ajili ya Vizui vya Transformer Vyote**
|
|
```python
|
|
pythonCopy codetotal_transformer_blocks_params = params_per_block * n_layers
|
|
total_transformer_blocks_params = 7,085,568 * 12 = 85,026,816
|
|
```
|
|
#### **3. Tabaka la Mwisho**
|
|
|
|
**a. Kurekebisha Tabaka la Mwisho**
|
|
|
|
- **Parameta:** `2 * emb_dim` (kubwa na kuhamasisha)
|
|
```python
|
|
pythonCopy codefinal_layer_norm_params = 2 * 768 = 1,536
|
|
```
|
|
**b. Tabaka la Kutolewa (`out_head`)**
|
|
|
|
- **Tabaka:** `nn.Linear(emb_dim, vocab_size, bias=False)`
|
|
- **Vigezo:** `emb_dim * vocab_size`
|
|
```python
|
|
pythonCopy codeoutput_projection_params = 768 * 50257 = 38,597,376
|
|
```
|
|
#### **4. Kuangalia Mipangilio Yote**
|
|
```python
|
|
pythonCopy codetotal_params = (
|
|
embedding_params +
|
|
total_transformer_blocks_params +
|
|
final_layer_norm_params +
|
|
output_projection_params
|
|
)
|
|
total_params = (
|
|
39,383,808 +
|
|
85,026,816 +
|
|
1,536 +
|
|
38,597,376
|
|
)
|
|
total_params = 163,009,536
|
|
```
|
|
## Generate Text
|
|
|
|
Kuwa na mfano unaotabiri token inayofuata kama ile ya awali, inahitajika tu kuchukua thamani za token za mwisho kutoka kwa matokeo (kama zitakuwa zile za token iliyotabiriwa), ambazo zitakuwa **thamani kwa kila kipengee katika msamiati** na kisha kutumia kazi ya `softmax` kuboresha vipimo kuwa uwezekano vinavyos suma 1 na kisha kupata index ya kipengee kikubwa zaidi, ambacho kitakuwa index ya neno ndani ya msamiati.
|
|
|
|
Code from [https://github.com/rasbt/LLMs-from-scratch/blob/main/ch04/01_main-chapter-code/ch04.ipynb](https://github.com/rasbt/LLMs-from-scratch/blob/main/ch04/01_main-chapter-code/ch04.ipynb):
|
|
```python
|
|
def generate_text_simple(model, idx, max_new_tokens, context_size):
|
|
# idx is (batch, n_tokens) array of indices in the current context
|
|
for _ in range(max_new_tokens):
|
|
|
|
# Crop current context if it exceeds the supported context size
|
|
# E.g., if LLM supports only 5 tokens, and the context size is 10
|
|
# then only the last 5 tokens are used as context
|
|
idx_cond = idx[:, -context_size:]
|
|
|
|
# Get the predictions
|
|
with torch.no_grad():
|
|
logits = model(idx_cond)
|
|
|
|
# Focus only on the last time step
|
|
# (batch, n_tokens, vocab_size) becomes (batch, vocab_size)
|
|
logits = logits[:, -1, :]
|
|
|
|
# Apply softmax to get probabilities
|
|
probas = torch.softmax(logits, dim=-1) # (batch, vocab_size)
|
|
|
|
# Get the idx of the vocab entry with the highest probability value
|
|
idx_next = torch.argmax(probas, dim=-1, keepdim=True) # (batch, 1)
|
|
|
|
# Append sampled index to the running sequence
|
|
idx = torch.cat((idx, idx_next), dim=1) # (batch, n_tokens+1)
|
|
|
|
return idx
|
|
|
|
|
|
start_context = "Hello, I am"
|
|
|
|
encoded = tokenizer.encode(start_context)
|
|
print("encoded:", encoded)
|
|
|
|
encoded_tensor = torch.tensor(encoded).unsqueeze(0)
|
|
print("encoded_tensor.shape:", encoded_tensor.shape)
|
|
|
|
model.eval() # disable dropout
|
|
|
|
out = generate_text_simple(
|
|
model=model,
|
|
idx=encoded_tensor,
|
|
max_new_tokens=6,
|
|
context_size=GPT_CONFIG_124M["context_length"]
|
|
)
|
|
|
|
print("Output:", out)
|
|
print("Output length:", len(out[0]))
|
|
```
|
|
## Marejeo
|
|
|
|
- [https://www.manning.com/books/build-a-large-language-model-from-scratch](https://www.manning.com/books/build-a-large-language-model-from-scratch)
|
|
|
|
|
|
{{#include ../../banners/hacktricks-training.md}}
|