def forward(self, src, tgt):
    """Run an encoder-decoder pass and project the decoder output.

    The source is encoded first; the target is then decoded against that
    encoding, and the decoder result is passed through ``self.fc``.

    Args:
        src: Input handed to ``self.encoder``.
        tgt: Input handed to ``self.decoder`` together with the encoded source.

    Returns:
        Whatever ``self.fc`` returns for the decoder output.

    NOTE(review): the enclosing class is not visible here; ``self`` is
    expected to provide callable ``encoder``, ``decoder`` and ``fc``
    attributes -- confirm against the class definition.
    """
    encoded_src = self.encoder(src)
    # The decoder receives the target first, then the encoded source.
    decoded_tgt = self.decoder(tgt, encoded_src)
    return self.fc(decoded_tgt)
import torch
import torch.nn as nn
import torch.nn.functional as F


class RMSNorm(nn.Module):
    """Root-mean-square normalization over the last dimension.

    Unlike LayerNorm, no mean is subtracted and there is no bias: the input
    is divided by the RMS of its last dimension and multiplied by a learned
    per-feature ``weight``.

    Args:
        dim: Length of the learned ``weight`` vector (initialized to ones);
            it must broadcast against the last dimension of the input.
        eps: Constant added to the mean square before the reciprocal square
            root, guarding against division by zero.
    """

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        """Return ``x / sqrt(mean(x**2) + eps) * weight`` along the last dim."""
        # Mean of squares (not a true variance: the mean is not removed).
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps) * self.weight


class CausalSelfAttention(nn.Module):
    """Multi-head self-attention in which a position cannot see later ones.

    Args:
        config: Object exposing integer attributes ``n_head`` and ``n_embd``.

    Raises:
        ValueError: If ``config.n_embd`` is not divisible by ``config.n_head``
            (the embedding is split evenly across heads).
    """

    def __init__(self, config):
        super().__init__()
        # Fail at construction time rather than with an opaque shape error
        # inside ``view`` during the first forward pass.
        if config.n_embd % config.n_head != 0:
            raise ValueError(
                f"n_embd ({config.n_embd}) must be divisible by "
                f"n_head ({config.n_head})"
            )
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        # Key, query, value projections, fused into a single matmul
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False)
        # Output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: Tensor of shape ``(B, T, C)`` with ``C == n_embd``.

        Returns:
            Tensor of shape ``(B, T, C)``.
        """
        B, T, C = x.size()
        head_dim = C // self.n_head
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)

        # Reshape for multi-head attention: (B, T, C) -> (B, n_head, T, head_dim)
        k = k.view(B, T, self.n_head, head_dim).transpose(1, 2)
        q = q.view(B, T, self.n_head, head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, head_dim).transpose(1, 2)

        # Scaled dot-product scores, shape (B, n_head, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / (head_dim ** 0.5))

        # Lower-triangular boolean mask: query i may attend to keys 0..i.
        # Masked scores become -inf so softmax assigns them zero weight.
        allowed = torch.ones(T, T, dtype=torch.bool, device=x.device).tril()
        att = att.masked_fill(~allowed.view(1, 1, T, T), float('-inf'))
        att = F.softmax(att, dim=-1)

        y = att @ v
        # Merge the heads back: (B, n_head, T, head_dim) -> (B, T, C)
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(y)


class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and MLP, each with a residual.

    Args:
        config: Object exposing ``n_embd`` and ``n_head`` (the latter is
            consumed by ``CausalSelfAttention``).
    """

    def __init__(self, config):
        super().__init__()
        self.ln_1 = RMSNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = RMSNorm(config.n_embd)
        self.mlp = nn.Sequential(
            nn.Linear(config.n_embd, 4 * config.n_embd, bias=False),
            # SiLU is the activation used inside SwiGLU, but this MLP has no
            # gating branch: it is a plain two-layer feed-forward network.
            nn.SiLU(),
            nn.Linear(4 * config.n_embd, config.n_embd, bias=False),
        )

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual sub-layers."""
        # Each sub-layer normalizes its input and adds its output back to x.
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


# Use code with caution.
#
# 4. The Pre-training Phase
# build a large language model (from scratch) pdf
↓ Focus on [ ] Fine-Tuning open-source models (e.g., Llama, Falcon) def forward(self, src, tgt): encoded_src = self
Building a Large Language Model (LLM) from scratch is one of the most effective ways to understand the "black box" of modern generative AI. Rather than just calling an API, constructing your own model allows you to master the intricate mechanics of data processing, attention mechanisms, and architectural scaling.