Reference links:
https://blog.csdn.net/weixin_45549737/article/details/137598497
https://pro-chat.antdigital.dev/guide/sse
https://zhuanlan.zhihu.com/p/696271857
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the model and tokenizer
model_name = "gpt2"  # can be swapped for a stronger model, e.g. "EleutherAI/gpt-neo-2.7B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Input text
input_text = "Once upon a time"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Generate text one token at a time (streaming output)
max_new_tokens = 50
output_ids = input_ids
for _ in range(max_new_tokens):
    with torch.no_grad():
        outputs = model(output_ids)
    next_token = torch.argmax(outputs.logits[:, -1, :], dim=-1, keepdim=True)
    output_ids = torch.cat([output_ids, next_token], dim=1)
    # Decode and print only the newly generated token
    new_text = tokenizer.decode(next_token[0])
    print(new_text, end="", flush=True)
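The loop above re-runs the model over the entire sequence at every step. A minimal sketch of the same greedy streaming loop that instead reuses the KV cache via past_key_values/use_cache, so each step only feeds the newly generated token (same model, tokenizer, and input_ids as above):

import torch

past_key_values = None
next_input = input_ids          # input_ids from the snippet above
generated = input_ids

for _ in range(50):
    with torch.no_grad():
        outputs = model(next_input, past_key_values=past_key_values, use_cache=True)
    past_key_values = outputs.past_key_values              # cached keys/values are reused
    next_token = torch.argmax(outputs.logits[:, -1, :], dim=-1, keepdim=True)
    generated = torch.cat([generated, next_token], dim=1)   # keep the full sequence if needed later
    next_input = next_token                                  # only the new token is fed on the next step
    print(tokenizer.decode(next_token[0]), end="", flush=True)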
from transformers import TextStreamer

streamer = TextStreamer(tokenizer)  # create the streaming decoder

input_text = "Once upon a time"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Let generate() handle the streaming directly via the streamer
model.generate(input_ids, max_new_tokens=50, streamer=streamer)
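TextStreamer prints straight to stdout. When the chunks need to be consumed in code (for example, to push them to a front end over SSE), transformers also provides TextIteratorStreamer, which exposes generation as an iterator. A minimal sketch using a background thread, reusing the model, tokenizer, and input_ids from above:

from threading import Thread
from transformers import TextIteratorStreamer

streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)

# generate() blocks, so run it in a background thread and iterate the streamer here
generation_kwargs = dict(input_ids=input_ids, max_new_tokens=50, streamer=streamer)
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()

for chunk in streamer:
    print(chunk, end="", flush=True)  # or forward each chunk to the client
thread.join()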
import torch
import tiktoken

# Imports from a local file; supplementary is assumed to also provide the
# helper functions used below, not just GPTModel
from supplementary import GPTModel, generate_text_simple, text_to_token_ids, token_ids_to_text

# Standard GPT-2 124M configuration (adjust to match how model.pth was trained)
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False,
}

model = GPTModel(GPT_CONFIG_124M)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
state_dict = torch.load("model.pth", map_location=device)
model.load_state_dict(state_dict)
model.to(device)
model.eval()

start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer).to(device),
    max_new_tokens=30,
    context_size=GPT_CONFIG_124M["context_length"]
)
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))
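For reference, the two tiktoken helpers used above are typically defined along these lines; the actual definitions live in supplementary, so treat this as a sketch rather than the exact code:

import torch

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    return torch.tensor(encoded).unsqueeze(0)  # add a batch dimension

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0)  # drop the batch dimension
    return tokenizer.decode(flat.tolist())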
idx = text_to_token_ids(start_context, tokenizer).to(device)

for _ in range(10):
    with torch.no_grad():
        # Crop the current context if it exceeds the supported context size,
        # e.g. if the LLM supports only 5 tokens and the context size is 10,
        # only the last 5 tokens are used as context
        idx_cond = idx[:, -GPT_CONFIG_124M["context_length"]:]

        # Get the predictions
        logits = model(idx_cond)

    # Focus only on the last time step:
    # (batch, n_tokens, vocab_size) becomes (batch, vocab_size)
    logits = logits[:, -1, :]

    # Apply softmax to get probabilities
    probas = torch.softmax(logits, dim=-1)  # (batch, vocab_size)

    # Get the idx of the vocab entry with the highest probability value
    idx_next = torch.argmax(probas, dim=-1, keepdim=True)  # (batch, 1)

    # Append the sampled index to the running sequence
    idx = torch.cat((idx, idx_next), dim=1)  # (batch, n_tokens+1)

    # Decode and print only the newly generated token
    new_text = token_ids_to_text(idx_next, tokenizer)
    print(new_text, end="", flush=True)
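To reuse this loop somewhere other than a print statement, it can be wrapped in a generator that yields each decoded token; stream_generate_simple below is a hypothetical helper (not part of supplementary), continuing the script above:

def stream_generate_simple(model, idx, max_new_tokens, context_size, tokenizer):
    # Hypothetical helper: same greedy loop as above, but yields each new
    # token's text instead of printing, so callers can consume it as a stream.
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        # argmax over logits picks the same token as argmax over softmax probabilities
        idx_next = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
        idx = torch.cat((idx, idx_next), dim=1)
        yield token_ids_to_text(idx_next, tokenizer)

for chunk in stream_generate_simple(
    model,
    text_to_token_ids(start_context, tokenizer).to(device),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"],
    tokenizer=tokenizer,
):
    print(chunk, end="", flush=True)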
from litgpt import LLM

llm = LLM.load("microsoft/phi-2")

# Non-streaming call: returns the full completion at once
llm.generate("What do Llamas eat?")

# Streaming call: stream=True returns a generator that yields text chunks
result = llm.generate("What do Llamas eat?", stream=True, max_new_tokens=200)
for e in result:
    print(e, end="", flush=True)
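To push any of these streams to a chat front end over server-sent events (the protocol covered by the pro-chat SSE guide linked above), the generator can back an SSE endpoint. A rough sketch assuming FastAPI; the framework choice is an assumption, not something these notes prescribe:

from fastapi import FastAPI
from fastapi.responses import StreamingResponse
from litgpt import LLM

app = FastAPI()
llm = LLM.load("microsoft/phi-2")

@app.get("/chat")
def chat(prompt: str):
    def event_stream():
        # Wrap each generated chunk in the SSE "data: ...\n\n" framing
        for chunk in llm.generate(prompt, stream=True, max_new_tokens=200):
            yield f"data: {chunk}\n\n"
    return StreamingResponse(event_stream(), media_type="text/event-stream")

Run it with uvicorn and point an SSE client (e.g. the browser EventSource API) at /chat to receive the tokens as they are generated.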