配置一个124 million参数的GPT2模型
124 million
GPT_CONFIG_124M = { "vocab_size": 50257, # Vocabulary size "context_length": 1024, # Context length "emb_dim": 768, # Embedding dimension "n_heads": 12, # Number of attention heads "n_layers": 12, # Number of layers "drop_rate": 0.0, # Dropout rate "qkv_bias": False # Query-Key-Value bias }
import torch.nn as nn from supplementary import TransformerBlock, LayerNorm class GPTModel(nn.Module): def __init__(self, cfg): super().__init__() self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"]) self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"]) self.drop_emb = nn.Dropout(cfg["drop_rate"]) self.trf_blocks = nn.Sequential( *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]) self.final_norm = LayerNorm(cfg["emb_dim"]) self.out_head = nn.Linear( cfg["emb_dim"], cfg["vocab_size"], bias=False ) def forward(self, in_idx): batch_size, seq_len = in_idx.shape tok_embeds = self.tok_emb(in_idx) pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device)) x = tok_embeds + pos_embeds # Shape [batch_size, num_tokens, emb_size] x = self.drop_emb(x) x = self.trf_blocks(x) x = self.final_norm(x) logits = self.out_head(x) return logits
def generate_text_simple(model, idx, max_new_tokens, context_size): # idx is (batch, n_tokens) array of indices in the current context for _ in range(max_new_tokens): # Crop current context if it exceeds the supported context size # E.g., if LLM supports only 5 tokens, and the context size is 10 # then only the last 5 tokens are used as context idx_cond = idx[:, -context_size:] # Get the predictions with torch.no_grad(): logits = model(idx_cond) # Focus only on the last time step # (batch, n_tokens, vocab_size) becomes (batch, vocab_size) logits = logits[:, -1, :] # Apply softmax to get probabilities probas = torch.softmax(logits, dim=-1) # (batch, vocab_size) # Get the idx of the vocab entry with the highest probability value idx_next = torch.argmax(probas, dim=-1, keepdim=True) # (batch, 1) # Append sampled index to the running sequence idx = torch.cat((idx, idx_next), dim=1) # (batch, n_tokens+1) return idx
import torch import tiktoken tokenizer = tiktoken.get_encoding("gpt2") torch.manual_seed(123) model = GPTModel(GPT_CONFIG_124M) model.eval(); # disable dropout start_context = "Hello, I am" encoded = tokenizer.encode(start_context) print("encoded:", encoded) encoded_tensor = torch.tensor(encoded).unsqueeze(0) print("encoded_tensor.shape:", encoded_tensor.shape) out = generate_text_simple( model=model, idx=encoded_tensor, max_new_tokens=6, context_size=GPT_CONFIG_124M["context_length"] ) print("Output:", out) print("Output length:", len(out[0])) decoded_text = tokenizer.decode(out.squeeze(0).tolist()) print(decoded_text)