import torch
import torch.nn as nn
from transformers import BertTokenizerFast, BertModel

# === 1. Label setup ===
labels = ["O", "B-PER", "I-PER"]
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}
num_labels = len(labels)

# === 2. Input data preparation ===
sentence = ["Hello", "John", "Doe"]
ner_tags = ["O", "B-PER", "I-PER"]

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
encoding = tokenizer(sentence, is_split_into_words=True, return_tensors="pt",
                     padding=True, truncation=True)
input_ids = encoding["input_ids"]           # [1, seq_len]
attention_mask = encoding["attention_mask"]

# Align word-level labels to subword tokens:
word_ids = encoding.word_ids(batch_index=0)
label_ids = []
for word_idx in word_ids:
    if word_idx is None:
        label_ids.append(-100)  # special tokens (CLS, SEP)
    else:
        label_ids.append(label2id[ner_tags[word_idx]])
labels_tensor = torch.tensor([label_ids])   # shape: [1, seq_len]

# === 3. Model definition (BERT + Linear) ===
class BERTForNER(nn.Module):
    def __init__(self, model_name, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.classifier(outputs.last_hidden_state)  # [batch, seq_len, num_labels]
        return logits

# === 4. Initialize model, loss function, and optimizer ===
model = BERTForNER("bert-base-uncased", num_labels)
loss_fn = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# === 5. Training step (forward -> loss -> backward -> optimize) ===
model.train()

# Forward pass
logits = model(input_ids=input_ids, attention_mask=attention_mask)  # [1, seq_len, num_labels]

# Reshape logits and labels to fit CrossEntropyLoss
# (renamed to labels_flat so the label-name list above is not shadowed)
logits_flat = logits.view(-1, num_labels)   # [batch * seq_len, num_labels]
labels_flat = labels_tensor.view(-1)        # [batch * seq_len]

loss = loss_fn(logits_flat, labels_flat)
print("Loss:", loss.item())

# Backward pass
loss.backward()

# Parameter update
optimizer.step()

# Clear gradients
optimizer.zero_grad()
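
# === 6. Inference sketch (not part of the original walkthrough; a minimal,
# illustrative example of decoding predictions back to label strings using
# id2label and word_ids; with an untrained classifier head the predicted
# tags will be essentially arbitrary) ===
model.eval()
with torch.no_grad():
    pred_logits = model(input_ids=input_ids, attention_mask=attention_mask)  # [1, seq_len, num_labels]
    pred_ids = pred_logits.argmax(dim=-1)[0].tolist()                        # [seq_len]

# Keep one prediction per original word: take the first subword of each word,
# skipping special tokens (word_id is None) and any following subword pieces.
predicted_tags = []
previous_word_idx = None
for token_pos, word_idx in enumerate(word_ids):
    if word_idx is None or word_idx == previous_word_idx:
        continue
    predicted_tags.append(id2label[pred_ids[token_pos]])
    previous_word_idx = word_idx

print(list(zip(sentence, predicted_tags)))  # one (word, tag) pair per input word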