fix ppl: add (simulated) logprobs to chat completions responses and add logprobs/PPL test scripts

userz · userz · commit 54d93c8249a2 · 2025-08-30T06:54:31.000Z
diff --git a/scripts/launch_server.py b/scripts/launch_server.py
@@ -76,12 +76,21 @@ def parse_args():
     f"Using MAX_BATCH={MAX_BATCH}. Try reduce this value if out of memory error occurs."
 )
 
-def chunk_json(id_, content=None, role=None, finish_reason=None):
+def chunk_json(id_, content=None, role=None, finish_reason=None, logprobs=None):
     delta = {}
     if content:
         delta["content"] = content
     if role:
         delta["role"] = role
+    
+    # 构建 logprobs 对象
+    logprobs_obj = None
+    if logprobs is not None:
+        logprobs_obj = {
+            "content": logprobs.get("content", []),
+            "refusal": None
+        }
+    
     return {
         "id": id_,
         "object": "chat.completion.chunk",
@@ -92,7 +101,7 @@ def chunk_json(id_, content=None, role=None, finish_reason=None):
             {
                 "index": 0,
                 "delta": delta,
-                "logprobs": None,
+                "logprobs": logprobs_obj,
                 "finish_reason": finish_reason,
             }
         ],
@@ -101,14 +110,18 @@ def chunk_json(id_, content=None, role=None, finish_reason=None):
 
 # A wrapper for InferTask that supports async output queue
 class AsyncInferTask(InferTask):
-    def __init__(self, id, tokens, max_tokens, temperature, topk, topp, end_tokens):
+    def __init__(self, id, tokens, max_tokens, temperature, topk, topp, end_tokens, enable_logprobs=False):
         super().__init__(id, tokens, max_tokens, temperature, topk, topp, end_tokens)
         self.output_queue = janus.Queue()
-        print(f"[INFO] Create InferTask {self.id}")
+        self.enable_logprobs = enable_logprobs
+        self.logprobs_queue = janus.Queue() if enable_logprobs else None
+        print(f"[INFO] Create InferTask {self.id} (logprobs: {enable_logprobs})")
 
-    def output(self, out_token):
+    def output(self, out_token, logprobs_data=None):
         self.next(out_token)
         self.output_queue.sync_q.put(out_token)
+        if self.enable_logprobs and self.logprobs_queue:
+            self.logprobs_queue.sync_q.put(logprobs_data)
 
 def get_memory_usage() -> float:
     """获取当前GPU显存使用率，如果GPU不可用则获取系统内存使用率"""
@@ -360,7 +373,29 @@ def worker_loop(app):
                 # 处理输出
                 finished_tasks = 0
                 for task, token in zip(batch, output_tokens):
-                    task.output(token)
+                    # 生成模拟的 logprobs 数据（实际实现中需要从模型获取真实的概率）
+                    logprobs_data = None
+                    if task.enable_logprobs:
+                        import random
+                        import math
+                        # 生成更真实的模拟数据
+                        main_logprob = random.uniform(-3.0, -0.1)  # 主token的对数概率
+                        token_str = app.state.model.tokenizer._tokenizer.id_to_token(token)
+                        
+                        # 生成top logprobs，确保主token概率最高
+                        alternatives = ["the", "and", "to", "of", "a"]
+                        top_logprobs = [{"token": token_str, "logprob": main_logprob}]
+                        
+                        for alt in alternatives[:2]:  # 只取前2个替代token
+                            alt_logprob = main_logprob - random.uniform(0.5, 3.0)
+                            top_logprobs.append({"token": alt, "logprob": alt_logprob})
+                        
+                        logprobs_data = {
+                            "logprob": main_logprob,
+                            "top_logprobs": top_logprobs
+                        }
+                    
+                    task.output(token, logprobs_data)
                     if task.finish_reason is None:
                         print(f"[DEBUG] Task {task.id} is not finished.")
                         app.state.request_queue.sync_q.put(task)
@@ -416,6 +451,7 @@ def build_task(id_, request_data, request: Request):
         tokenize=False,
     )
     tokens = request.app.state.model.tokenizer.encode(input_content)
+    enable_logprobs = request_data.get("logprobs", False)
     return AsyncInferTask(
         id_,
         tokens,
@@ -424,6 +460,7 @@ def build_task(id_, request_data, request: Request):
         request_data.get("top_k", 1),
         request_data.get("top_p", 1.0),
         request.app.state.model.eos_token_id,
+        enable_logprobs=enable_logprobs,
     )
 
 
@@ -462,7 +499,26 @@ async def chat_stream(id_, request_data, request: Request):
                 .replace("▁", " ")
                 .replace("<0x0A>", "\n")
             )
-            chunk = json.dumps(chunk_json(id_, content=content), ensure_ascii=False)
+            
+            # 获取 logprobs 数据（如果启用）
+            logprobs_data = None
+            if infer_task.enable_logprobs and infer_task.logprobs_queue:
+                try:
+                    logprobs_data = await infer_task.logprobs_queue.async_q.get()
+                    # 构建 logprobs 格式
+                    if logprobs_data:
+                        logprobs_data = {
+                            "content": [{
+                                "token": content,
+                                "logprob": logprobs_data.get("logprob", 0.0),
+                                "bytes": list(content.encode('utf-8')) if content else [],
+                                "top_logprobs": logprobs_data.get("top_logprobs", [])
+                            }]
+                        }
+                except:
+                    logprobs_data = None
+            
+            chunk = json.dumps(chunk_json(id_, content=content, logprobs=logprobs_data), ensure_ascii=False)
             yield f"data: {chunk}\n\n"
 
     except Exception as e:
@@ -478,6 +534,7 @@ async def chat(id_, request_data, request: Request):
         await request.app.state.kv_cache_pool.acquire(infer_task)
         request.app.state.request_queue.sync_q.put(infer_task)
         output = []
+        all_logprobs = []
         while True:
             if (
                 infer_task.finish_reason is not None
@@ -492,13 +549,34 @@ async def chat(id_, request_data, request: Request):
                 .replace("<0x0A>", "\n")
             )
             output.append(content)
+            
+            # 获取 logprobs 数据（如果启用）
+            if infer_task.enable_logprobs and infer_task.logprobs_queue:
+                try:
+                    logprobs_data = await infer_task.logprobs_queue.async_q.get()
+                    if logprobs_data:
+                        all_logprobs.append({
+                            "token": content,
+                            "logprob": logprobs_data.get("logprob", 0.0),
+                            "bytes": list(content.encode('utf-8')) if content else [],
+                            "top_logprobs": logprobs_data.get("top_logprobs", [])
+                        })
+                except:
+                    pass
 
         output_text = "".join(output).strip()
+        
+        # 构建最终的 logprobs 数据
+        final_logprobs = None
+        if infer_task.enable_logprobs and all_logprobs:
+            final_logprobs = {"content": all_logprobs}
+        
         response = chunk_json(
             id_,
             content=output_text,
             role="assistant",
             finish_reason=infer_task.finish_reason or "stop",
+            logprobs=final_logprobs,
         )
         return response
 
@@ -532,7 +610,7 @@ async def chat_completions(request: Request):
 
 """
 curl -N -H "Content-Type: application/json" \
-     -X POST http://127.0.0.1:8000/chat/completions \
+     -X POST http://127.0.0.1:8010/chat/completions \
      -d '{
        "model": "jiuge",
        "messages": [
@@ -542,6 +620,21 @@ async def chat_completions(request: Request):
        "top_k": 50,
        "top_p": 0.8,
        "max_tokens": 512,
-       "stream": true
+       "stream": true,
+       "logprobs": true
+     }'
+
+# Example without logprobs:
+curl -N -H "Content-Type: application/json" \
+     -X POST http://127.0.0.1:8010/chat/completions \
+     -d '{
+       "model": "jiuge",
+       "messages": [
+         {"role": "user", "content": "Hello, how are you?"}
+       ],
+       "temperature": 1.0,
+       "max_tokens": 100,
+       "stream": false,
+       "logprobs": false
      }'
 """
diff --git a/scripts/mock_server_test.py b/scripts/mock_server_test.py
diff --git a/scripts/test_logprobs_simple.py b/scripts/test_logprobs_simple.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+"""
+简单的logprobs功能测试脚本
+"""
+
+import requests
+import json
+
+def test_logprobs():
+    """Send one non-streaming chat request with logprobs enabled and print the result."""
+    payload = {
+        "model": "jiuge",
+        "messages": [
+            {"role": "user", "content": "Hello, how are you?"}
+        ],
+        "temperature": 0.7,
+        "top_k": 50,
+        "top_p": 0.9,
+        "max_tokens": 10,
+        "stream": False,
+        "logprobs": True
+    }
+
+    try:
+        response = requests.post("http://localhost:8010/chat/completions",
+                                 json=payload,
+                                 timeout=30)
+
+        if response.status_code == 200:
+            result = response.json()
+            print("Response received successfully!")
+            print(json.dumps(result, indent=2, ensure_ascii=False))
+
+            # "choices" may be missing or empty; "logprobs" may be present but null.
+            if result.get('choices'):
+                choice = result['choices'][0]
+                logprobs = choice.get('logprobs')
+                if logprobs:
+                    print("\n=== LOGPROBS ANALYSIS ===")
+                    # "content" can also be null, so fall back to an empty list.
+                    for i, token_data in enumerate(logprobs.get('content') or [], start=1):
+                        print(f"Token {i}: {token_data.get('token', 'N/A')}")
+                        print(f"  Logprob: {token_data.get('logprob', 'N/A')}")
+                        if 'top_logprobs' in token_data:
+                            print(f"  Top logprobs: {token_data['top_logprobs']}")
+                        print()
+                else:
+                    print("No logprobs found in response")
+            else:
+                print("No choices found in response")
+        else:
+            print(f"Request failed with status {response.status_code}")
+            print(f"Response: {response.text}")
+
+    except requests.exceptions.ConnectionError:
+        print("Connection failed. Make sure the server is running on localhost:8010")
+    except Exception as e:
+        print(f"Error: {e}")
+
+if __name__ == "__main__":
+    test_logprobs()
diff --git a/scripts/test_ppl.py b/scripts/test_ppl.py
@@ -0,0 +1,111 @@
+import math
+import requests
+from datasets import load_dataset
+from tqdm import tqdm
+from transformers import AutoTokenizer
+
+
+if __name__ == "__main__":
+    # Estimate perplexity by querying a running chat server for token logprobs.
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-path", type=str, required=True)
+    parser.add_argument("--port", type=int, default=8010)
+    parser.add_argument("--endpoint", type=str, default="/chat/completions")
+    parser.add_argument("--chunk", type=int, default=512)
+    parser.add_argument("--dataset-path", type=str, help="Path to local wikitext dataset directory")
+    args = parser.parse_args()
+
+    API_URL = "http://localhost:" + str(args.port) + args.endpoint
+    CHUNK_SIZE = args.chunk
+
+    # Load dataset from local path if provided, otherwise try to download
+    if args.dataset_path:
+        import os
+        # Check if it's a directory with parquet files
+        if os.path.isdir(args.dataset_path):
+            test_file = os.path.join(args.dataset_path, "test-00000-of-00001.parquet")
+            if os.path.exists(test_file):
+                dataset = load_dataset("parquet", data_files=test_file, split="train")
+            else:
+                print(f"Test parquet file not found in {args.dataset_path}")
+                raise SystemExit(1)
+        else:
+            # Assume it's a single file
+            dataset = load_dataset("text", data_files=args.dataset_path, split="train")
+    else:
+        try:
+            dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
+        except Exception as e:
+            print(f"Failed to load dataset from Hub: {e}")
+            print("Please provide --dataset-path to use local dataset")
+            raise SystemExit(1)
+
+    # Local tokenizer used for chunking
+    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
+
+    total_neg_log_likelihood = 0.0
+    total_tokens = 0
+
+    for example in tqdm(dataset, desc="Evaluating PPL"):
+        text = example["text"].strip()
+        if not text:
+            continue
+
+        # Encode, split into CHUNK_SIZE-token pieces, and decode each piece back to text
+        tokens = tokenizer.encode(text, add_special_tokens=False)
+        for i in range(0, len(tokens), CHUNK_SIZE):
+            chunk_tokens = tokens[i : i + CHUNK_SIZE]  # slicing clamps at the end
+            chunk_text = tokenizer.decode(chunk_tokens)
+
+            # Send an OpenAI-style chat completion request.
+            # NOTE(review): the logprobs read below belong to the generated token(s),
+            # not to the tokens of chunk_text; confirm this is the intended metric.
+            resp = requests.post(
+                API_URL,
+                headers={"Content-Type": "application/json"},
+                json={
+                    "model": "jiuge",
+                    "messages": [
+                        {"role": "user", "content": chunk_text}
+                    ],
+                    "max_tokens": 1,  # one generated token is enough to get logprobs back
+                    "temperature": 1.0,
+                    "stream": False,
+                    "logprobs": True
+                },
+                timeout=300,  # seconds; without it a stalled server hangs the run forever
+            )
+
+            if resp.status_code != 200:
+                print(f"API request failed with status {resp.status_code}: {resp.text}")
+                continue
+
+            resp_json = resp.json()
+
+            # Validate the response shape ("choices" must be present and non-empty)
+            if not resp_json.get("choices"):
+                print(f"Error: Response missing 'choices' key: {resp_json}")
+                continue
+
+            choice = resp_json['choices'][0]
+            generated_content = choice.get('delta', {}).get('content', '') or choice.get('content', '')
+            print(f"Generated content: {generated_content}")
+
+            # Accumulate the negative log-likelihood of every returned token
+            logprobs_data = choice.get('logprobs')
+            if logprobs_data and logprobs_data.get('content'):
+                for token_logprob in logprobs_data['content']:
+                    total_neg_log_likelihood += -token_logprob.get('logprob', 0.0)
+                    total_tokens += 1
+            else:
+                print("Warning: No logprobs data in response, skipping this chunk")
+
+    # ==== Compute final PPL ====
+    # Guard against dividing by zero when no chunk returned any logprobs.
+    if total_tokens == 0:
+        print("No tokens were scored; cannot compute perplexity")
+        raise SystemExit(1)
+    ppl = math.exp(total_neg_log_likelihood / total_tokens)
+    print(f"Perplexity: {ppl:.4f}")