上一节,我们演示了输入一段文字后,模型使用初始的随机参数输出一段文字。
# Demonstrate generation with the (still untrained, randomly initialized) model.
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

model = GPTModel(GPT_CONFIG_124M)

# Encode the prompt and add a batch dimension: (seq_len,) -> (1, seq_len)
encoded = tokenizer.encode(start_context)
encoded_tensor = torch.tensor(encoded).unsqueeze(0)

out = generate_text_simple(
    model=model,
    idx=encoded_tensor,
    max_new_tokens=6,
    # Use the configured context length instead of a hard-coded 1024 so this
    # stays consistent with the rest of the file if the config ever changes.
    context_size=GPT_CONFIG_124M["context_length"],
)

# Drop the batch dimension and decode back to text; print it so the result
# is visible outside a notebook as well (it was previously computed but unused).
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)
本节我们将讨论如何训练模型:
# Read the raw corpus and split it into training / validation streams.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    text_data = f.read()

from supplementary import create_dataloader_v1

# Keep the first 90% of the text for training, the remainder for validation.
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

torch.manual_seed(123)


def _make_loader(text, *, shuffle, drop_last):
    # Windows are context-length sized with a stride of one full context,
    # i.e. non-overlapping chunks.
    return create_dataloader_v1(
        text,
        batch_size=2,
        max_length=GPT_CONFIG_124M["context_length"],
        stride=GPT_CONFIG_124M["context_length"],
        drop_last=drop_last,
        shuffle=shuffle,
        num_workers=0,
    )


train_loader = _make_loader(train_data, shuffle=True, drop_last=True)
val_loader = _make_loader(val_data, shuffle=False, drop_last=False)
在开始训练前,先计算一下初始损失:
# Baseline: evaluate the loss on both splits before any training happens.
from supplementary import calc_loss_loader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# nn.Module.to() moves parameters in place, so no reassignment is needed here.
model.to(device)

# Seed again for reproducibility — the training loader shuffles its batches.
torch.manual_seed(123)

# We are only measuring, not training, so gradient tracking is disabled.
with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device)
    val_loss = calc_loss_loader(val_loader, model, device)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)
from supplementary import (
    calc_loss_batch,
    evaluate_model,
    generate_and_print_sample,
)


def train_model_simple(model, train_loader, val_loader, optimizer, device,
                       num_epochs, eval_freq, eval_iter, start_context, tokenizer):
    """Minimal training loop over `num_epochs` epochs.

    Runs standard backprop on every batch, evaluates train/val loss every
    `eval_freq` optimizer steps (using `eval_iter` batches per evaluation),
    and prints a sample generation at the end of each epoch.

    Returns three parallel lists: recorded train losses, val losses, and the
    cumulative token counts at which each recording was taken.
    """
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen = 0
    step = -1  # incremented before first use, so first step is 0

    for epoch in range(num_epochs):
        model.train()  # enable dropout etc. for training

        for input_batch, target_batch in train_loader:
            # Standard step: clear old gradients, compute loss, backprop, update.
            optimizer.zero_grad()
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()
            optimizer.step()

            tokens_seen += input_batch.numel()
            step += 1

            # Periodic evaluation on both splits.
            if step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Ep {epoch+1} (Step {step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Qualitative check: show what the model generates after this epoch.
        generate_and_print_sample(model, tokenizer, device, start_context)

    return train_losses, val_losses, track_tokens_seen
# Train a fresh model from scratch with a fixed seed for reproducibility.
torch.manual_seed(123)

model = GPTModel(GPT_CONFIG_124M)
model.to(device)

# AdamW decouples weight decay from the gradient update.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

num_epochs = 10
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs,
    eval_freq=5,
    eval_iter=5,
    start_context="Every effort moves you",
    tokenizer=tokenizer,
)
保存模型
# Persist only the learned parameters (state dict), not the full module object.
torch.save(model.state_dict(), "model.pth")
可视化训练结果
from supplementary import plot_losses

# Spread the recorded evaluation points evenly across the epoch axis.
epoch_axis = torch.linspace(0, num_epochs, len(train_losses))
plot_losses(epoch_axis, tokens_seen, train_losses, val_losses)
观察上述结果,我们可以看到,模型一开始生成的是无法理解的单词组合,而到最后,它能够生成基本符合语法的句子。
但是,从训练集和验证集的损失来看,我们可以看到模型开始出现过拟合。
如果我们检查它在后期生成的一些文本段落,会发现它们与训练集中内容完全相同——也就是说,模型只是简单地记住了训练数据。
有一些解码策略(本次研讨会未涉及)可以在一定程度上缓解这种记忆化现象。
还要注意,这里的过拟合现象是由于我们的训练集非常小,并且进行了过多次迭代。
import torch

# Import from the local helper module.
from supplementary import GPTModel

# Rebuild the architecture, then load the trained weights saved earlier.
model = GPTModel(GPT_CONFIG_124M)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
state_dict = torch.load("model.pth", map_location=device)
model.load_state_dict(state_dict)
model.eval()  # inference mode: disables dropout

start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=30,
    context_size=GPT_CONFIG_124M["context_length"],
)
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))