Streaming Output

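Streaming output means printing tokens as soon as the model produces them, rather than waiting for the whole completion to finish, which is what gives chat UIs their typewriter effect. The examples below implement the same idea at several levels of abstraction: a hand-rolled greedy decoding loop with Hugging Face transformers, the built-in TextStreamer and TextIteratorStreamer helpers, a from-scratch GPT model, and LitGPT's stream=True API.

The manual loop makes the mechanics explicit: each step runs a forward pass over the current context, appends the argmax token, and immediately decodes and prints just that token (print(..., end="", flush=True) keeps stdout unbuffered):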

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the model and tokenizer
model_name = "gpt2"  # can be swapped for a stronger model, e.g. "EleutherAI/gpt-neo-2.7B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

# Input text
input_text = "Once upon a time"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Generate token by token (streaming output)
max_new_tokens = 50
output_ids = input_ids

for _ in range(max_new_tokens):
    with torch.no_grad():
        outputs = model(output_ids)

    # Greedy decoding: take the most likely next token
    next_token = torch.argmax(outputs.logits[:, -1, :], dim=-1, keepdim=True)  # (batch, 1)
    output_ids = torch.cat([output_ids, next_token], dim=1)

    # Decode and print only the newly generated token
    new_text = tokenizer.decode(next_token[0])
    print(new_text, end="", flush=True)
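Note that this loop re-runs the forward pass over the entire growing sequence at every step; practical implementations reuse the attention KV cache so each step only processes the new token. For the common case, transformers already packages the streaming logic: pass a TextStreamer to model.generate() and decoded text is printed as the tokens arrive.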

from transformers import TextStreamer

streamer = TextStreamer(tokenizer)  # create the streaming decoder

input_text = "Once upon a time"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Stream directly during generation: the streamer prints each
# decoded chunk as soon as it is produced
model.generate(input_ids, max_new_tokens=50, streamer=streamer)
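TextStreamer prints to stdout. To get the chunks back in your own code, for example to forward them over SSE or a WebSocket, transformers also provides TextIteratorStreamer, which exposes generation as a Python iterator; because generate() blocks until it finishes, it runs in a background thread. A minimal sketch, reusing the model, tokenizer, and input_ids from above:

from threading import Thread
from transformers import TextIteratorStreamer

# TextIteratorStreamer buffers decoded text and exposes it as an iterator;
# skip_prompt=True keeps the input prompt out of the stream
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)

# generate() blocks until done, so run it in a background thread
thread = Thread(
    target=model.generate,
    kwargs=dict(input_ids=input_ids, max_new_tokens=50, streamer=streamer),
)
thread.start()

# Consume text fragments as they become available
for text_chunk in streamer:
    print(text_chunk, end="", flush=True)
thread.join()

The same token-by-token idea also works with a from-scratch GPT implementation. The next example loads a GPTModel and its generation helpers from a local supplementary file, restores a trained checkpoint, and first generates without streaming: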
import tiktoken
import torch

# Imports from a local file; the generation helpers are assumed to
# live in the same supplementary module as GPTModel
from supplementary import GPTModel, generate_text_simple, text_to_token_ids, token_ids_to_text

# GPT-2 124M hyperparameters (assumed here; use the values the
# checkpoint was actually trained with)
GPT_CONFIG_124M = {
    "vocab_size": 50257,     # BPE vocabulary size
    "context_length": 1024,  # maximum context window
    "emb_dim": 768,          # embedding dimension
    "n_heads": 12,           # attention heads per block
    "n_layers": 12,          # transformer blocks
    "drop_rate": 0.1,        # dropout rate
    "qkv_bias": False,       # no bias in QKV projections
}

model = GPTModel(GPT_CONFIG_124M)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
state_dict = torch.load("model.pth", map_location=device)
model.load_state_dict(state_dict)
model.to(device)
model.eval()

start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

# Baseline (non-streaming): the full output appears only after all
# 30 tokens have been generated
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer).to(device),
    max_new_tokens=30,
    context_size=GPT_CONFIG_124M["context_length"]
)
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))
idx = text_to_token_ids(start_context, tokenizer).to(device)

for _ in range(10):
    with torch.no_grad():

        # Crop the current context if it exceeds the supported context size
        # E.g., if the LLM supports only 5 tokens and the context size is 10,
        # then only the last 5 tokens are used as context
        idx_cond = idx[:, -GPT_CONFIG_124M["context_length"]:]

        # Get the predictions
        logits = model(idx_cond)

        # Focus only on the last time step:
        # (batch, n_tokens, vocab_size) becomes (batch, vocab_size)
        logits = logits[:, -1, :]

        # Apply softmax to get probabilities
        probas = torch.softmax(logits, dim=-1)  # (batch, vocab_size)

        # Get the index of the vocab entry with the highest probability value
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)  # (batch, 1)

        # Append the sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch, n_tokens+1)

        # Decode and print only the newly generated token
        new_text = token_ids_to_text(idx_next, tokenizer)
        print(new_text, end="", flush=True)
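Finally, LitGPT hides the whole pattern behind a single call: generate(..., stream=True) returns a generator that yields text fragments, so streaming reduces to a for loop: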
from litgpt import LLM

llm = LLM.load("microsoft/phi-2")

# Non-streaming: the complete answer comes back as one string
print(llm.generate("What do Llamas eat?"))

# Streaming: stream=True returns a generator of text fragments
result = llm.generate("What do Llamas eat?", stream=True, max_new_tokens=200)
for e in result:
    print(e, end="", flush=True)