Keyboard shortcuts

Press or to navigate between chapters

Press S or / to search in the book

Press ? to show this help

Press Esc to hide this help

17.7 A/B 测试与回归测试自动化

本节目标:掌握 Agent 的 A/B 测试与回归测试方法,能够构建自动化的 Prompt 变体测试框架,并将评估集成到 CI/CD 流水线中。


为什么 Agent 需要 A/B 测试?

当你修改了一个 Prompt、调整了工具描述、或者换了模型版本——你怎么知道改得更好了?

传统软件有单元测试,改了代码跑一下就知道了。但 Agent 的输出不确定,"跑一下"无法给出统计意义上可靠的结论。我们需要严格的 A/B 测试来验证每次变更的效果。

常见场景

变更类型风险A/B 测试的价值
修改系统 Prompt可能改善某些场景但破坏其他场景量化对比不同 Prompt 的效果
更换模型版本新模型可能退步在真实任务上对比新旧模型
调整工具描述可能导致工具误用检测工具选择准确率变化
增加新工具可能干扰原有工具调用评估对已有能力的影响
修改 Temperature影响输出多样性和质量权衡创造性和稳定性

A/B 测试框架设计

核心概念

A/B 测试的核心思想:在相同的测试集上,让两个版本的 Agent 分别执行,然后统计对比它们的性能差异。

"""
Agent A/B 测试框架
支持:Prompt 变体对比、统计显著性检验、自动化报告
"""
import json
import time
import hashlib
from dataclasses import dataclass, field
from typing import Optional, Callable
from enum import Enum

from langchain_openai import ChatOpenAI
from scipy import stats
import numpy as np


class TestStatus(Enum):
    """测试状态"""
    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"


@dataclass
class TestCase:
    """测试用例"""
    id: str
    query: str                           # 用户输入
    expected_output: Optional[str] = None # 期望输出(可选)
    expected_tools: Optional[list[str]] = None  # 期望调用的工具
    category: str = "default"            # 分类标签
    difficulty: str = "medium"           # easy / medium / hard
    metadata: dict = field(default_factory=dict)


@dataclass
class TestRun:
    """单次测试运行结果"""
    test_case_id: str
    variant: str              # "A" 或 "B"
    output: str               # Agent 输出
    tools_called: list[str] = field(default_factory=list)  # 调用的工具
    steps: int = 0            # 执行步骤数
    tokens: int = 0           # Token 使用量
    latency: float = 0.0      # 响应时间(秒)
    error: Optional[str] = None  # 错误信息
    judge_score: Optional[float] = None  # Judge 评分


@dataclass
class ABTestConfig:
    """A/B 测试配置"""
    name: str
    description: str = ""
    # 变体 A(对照组)
    variant_a_config: dict = field(default_factory=dict)
    # 变体 B(实验组)
    variant_b_config: dict = field(default_factory=dict)
    # 测试参数
    confidence_level: float = 0.95      # 置信水平
    min_sample_size: int = 30           # 最小样本量
    max_sample_size: int = 200          # 最大样本量
    metrics: list[str] = field(default_factory=lambda: [
        "quality_score", "tool_accuracy", "latency", "token_usage"
    ])

A/B 测试框架实现

class ABTestFramework:
    """Agent A/B 测试框架"""

    def __init__(
        self,
        agent_factory_a: Callable,
        agent_factory_b: Callable,
        judge_model: str = "gpt-4.1"
    ):
        """
        Args:
            agent_factory_a: 创建变体 A Agent 的工厂函数
            agent_factory_b: 创建变体 B Agent 的工厂函数
            judge_model: 用于评估的 Judge 模型
        """
        self.agent_factory_a = agent_factory_a
        self.agent_factory_b = agent_factory_b
        self.judge_llm = ChatOpenAI(model=judge_model, temperature=0)

    def run_test(
        self,
        test_cases: list[TestCase],
        config: ABTestConfig,
        progress_callback: Callable = None
    ) -> dict:
        """运行完整的 A/B 测试"""

        results_a: list[TestRun] = []
        results_b: list[TestRun] = []

        total = len(test_cases)

        for i, case in enumerate(test_cases):
            if progress_callback:
                progress_callback(i, total, case.id)

            # 运行变体 A
            run_a = self._run_single(case, self.agent_factory_a, "A")
            results_a.append(run_a)

            # 运行变体 B
            run_b = self._run_single(case, self.agent_factory_b, "B")
            results_b.append(run_b)

            # 用 Judge 评估两个输出
            if case.expected_output or True:  # 始终用 Judge 评估
                self._judge_outputs(case, run_a, run_b)

        # 统计分析
        analysis = self._analyze_results(results_a, results_b, config)

        return {
            "config": {
                "name": config.name,
                "variant_a": config.variant_a_config,
                "variant_b": config.variant_b_config,
                "sample_size": len(test_cases)
            },
            "results_a": results_a,
            "results_b": results_b,
            "analysis": analysis
        }

    def _run_single(
        self,
        case: TestCase,
        agent_factory: Callable,
        variant: str
    ) -> TestRun:
        """运行单次测试"""
        start_time = time.time()
        try:
            agent = agent_factory()
            result = agent.invoke(case.query)

            # 提取结果信息
            output = ""
            tools_called = []
            steps = 0
            tokens = 0

            if isinstance(result, dict):
                output = result.get("output", str(result))
                tools_called = result.get("tools_called", [])
                steps = result.get("steps", 1)
                tokens = result.get("tokens", 0)
            else:
                output = str(result)

            latency = time.time() - start_time

            return TestRun(
                test_case_id=case.id,
                variant=variant,
                output=output,
                tools_called=tools_called,
                steps=steps,
                tokens=tokens,
                latency=latency
            )

        except Exception as e:
            return TestRun(
                test_case_id=case.id,
                variant=variant,
                output="",
                latency=time.time() - start_time,
                error=str(e)
            )

    def _judge_outputs(
        self,
        case: TestCase,
        run_a: TestRun,
        run_b: TestRun
    ):
        """用 LLM Judge 评估两个输出"""

        # 检查工具调用准确性
        if case.expected_tools:
            run_a_tools = set(run_a.tools_called)
            run_b_tools = set(run_b.tools_called)
            expected = set(case.expected_tools)

            # 计算工具准确率(Jaccard 相似度)
            if expected:
                run_a.judge_score = len(run_a_tools & expected) / len(expected)
                run_b.judge_score = len(run_b_tools & expected) / len(expected)
                return

        # 用 LLM Judge 评估输出质量
        prompt = f"""你是一个专业的 AI 输出质量评审员。请评估以下两个 Agent 回答的质量。

用户问题:{case.query}
{'期望输出:' + case.expected_output if case.expected_output else ''}

回答 A:
{run_a.output}

回答 B:
{run_b.output}

请分别对两个回答打分(0-10),以 JSON 格式回复:
{{
    "score_a": <0-10>,
    "score_b": <0-10>,
    "reasoning": "简要说明评分理由"
}}"""

        response = self.judge_llm.invoke(prompt)
        try:
            result = json.loads(response.content)
            run_a.judge_score = result.get("score_a", 0) / 10.0
            run_b.judge_score = result.get("score_b", 0) / 10.0
        except json.JSONDecodeError:
            run_a.judge_score = 0.5
            run_b.judge_score = 0.5

    def _analyze_results(
        self,
        results_a: list[TestRun],
        results_b: list[TestRun],
        config: ABTestConfig
    ) -> dict:
        """统计分析测试结果"""

        # 提取各指标数据
        scores_a = [r.judge_score for r in results_a if r.judge_score is not None]
        scores_b = [r.judge_score for r in results_b if r.judge_score is not None]

        latencies_a = [r.latency for r in results_a if r.error is None]
        latencies_b = [r.latency for r in results_b if r.error is None]

        tokens_a = [r.tokens for r in results_a if r.error is None]
        tokens_b = [r.tokens for r in results_b if r.error is None]

        errors_a = sum(1 for r in results_a if r.error is not None)
        errors_b = sum(1 for r in results_b if r.error is not None)

        analysis = {
            "quality": self._compare_groups(scores_a, scores_b, config),
            "latency": self._compare_groups(latencies_a, latencies_b, config),
            "token_usage": self._compare_groups(tokens_a, tokens_b, config),
            "error_rates": {
                "variant_a": errors_a / len(results_a) if results_a else 0,
                "variant_b": errors_b / len(results_b) if results_b else 0,
            },
            "summary": {
                "variant_a_avg_score": np.mean(scores_a) if scores_a else 0,
                "variant_b_avg_score": np.mean(scores_b) if scores_b else 0,
                "winner": self._determine_winner(scores_a, scores_b)
            }
        }

        return analysis

    def _compare_groups(
        self,
        group_a: list[float],
        group_b: list[float],
        config: ABTestConfig
    ) -> dict:
        """对比两组数据的统计显著性"""
        if len(group_a) < 2 or len(group_b) < 2:
            return {
                "significant": False,
                "p_value": None,
                "note": "样本量不足"
            }

        # 使用 Welch t-test(不假设等方差)
        t_stat, p_value = stats.ttest_ind(group_a, group_b, equal_var=False)

        # Cohen's d 效应量
        pooled_std = np.sqrt(
            (np.std(group_a, ddof=1)**2 + np.std(group_b, ddof=1)**2) / 2
        )
        cohens_d = (
            (np.mean(group_a) - np.mean(group_b)) / pooled_std
            if pooled_std > 0 else 0
        )

        alpha = 1 - config.confidence_level

        return {
            "mean_a": float(np.mean(group_a)),
            "mean_b": float(np.mean(group_b)),
            "std_a": float(np.std(group_a, ddof=1)),
            "std_b": float(np.std(group_b, ddof=1)),
            "t_statistic": float(t_stat),
            "p_value": float(p_value),
            "significant": p_value < alpha,
            "cohens_d": float(cohens_d),
            "effect_size": (
                "大" if abs(cohens_d) >= 0.8
                else "中" if abs(cohens_d) >= 0.5
                else "小" if abs(cohens_d) >= 0.2
                else "可忽略"
            )
        }

    @staticmethod
    def _determine_winner(
        scores_a: list[float],
        scores_b: list[float]
    ) -> str:
        """判定获胜变体"""
        if not scores_a or not scores_b:
            return "无法判定"
        avg_a = np.mean(scores_a)
        avg_b = np.mean(scores_b)
        diff = abs(avg_a - avg_b)
        if diff < 0.05:
            return "平局(差异不显著)"
        return "A" if avg_a > avg_b else "B"

使用示例:对比两个 Prompt 变体

from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

# 定义两个 Prompt 变体
PROMPT_A = """你是一个客服助手。请回答用户的问题。要简洁明了。"""

PROMPT_B = """你是一个专业的客服助手。回答用户问题时请遵循以下原则:
1. 先确认理解了用户的问题
2. 给出准确、完整的回答
3. 如果涉及操作步骤,请按编号列出
4. 最后询问是否还有其他问题"""


def create_agent_a():
    """创建变体 A 的 Agent"""
    llm = ChatOpenAI(model="gpt-4.1-mini", temperature=0)
    prompt = ChatPromptTemplate.from_messages([
        ("system", PROMPT_A),
        ("human", "{input}")
    ])
    chain = prompt | llm

    def agent_func(query: str) -> dict:
        response = chain.invoke({"input": query})
        return {
            "output": response.content,
            "steps": 1,
            "tokens": response.response_metadata.get("token_usage", {}).get("total_tokens", 0)
        }

    return agent_func


def create_agent_b():
    """创建变体 B 的 Agent"""
    llm = ChatOpenAI(model="gpt-4.1-mini", temperature=0)
    prompt = ChatPromptTemplate.from_messages([
        ("system", PROMPT_B),
        ("human", "{input}")
    ])
    chain = prompt | llm

    def agent_func(query: str) -> dict:
        response = chain.invoke({"input": query})
        return {
            "output": response.content,
            "steps": 1,
            "tokens": response.response_metadata.get("token_usage", {}).get("total_tokens", 0)
        }

    return agent_func


# 准备测试用例
test_cases = [
    TestCase(
        id="cs_001",
        query="我的订单还没到,已经超过预计时间3天了",
        category="物流查询",
        difficulty="easy"
    ),
    TestCase(
        id="cs_002",
        query="我想退货,但商品已经拆封了,还能退吗?",
        category="退换货",
        difficulty="medium"
    ),
    TestCase(
        id="cs_003",
        query="你们的会员制度和积分规则是什么?VIP有什么额外权益?",
        category="会员服务",
        difficulty="hard"
    ),
]

# 配置 A/B 测试
config = ABTestConfig(
    name="客服 Prompt 优化测试",
    description="对比简洁版和详细版 Prompt 的客服质量",
    variant_a_config={"prompt": "简洁版", "model": "gpt-4.1-mini"},
    variant_b_config={"prompt": "详细版", "model": "gpt-4.1-mini"},
    confidence_level=0.95
)

# 运行测试
framework = ABTestFramework(
    agent_factory_a=create_agent_a,
    agent_factory_b=create_agent_b,
    judge_model="gpt-4.1"
)

result = framework.run_test(test_cases, config)

# 输出结果
print(f"变体 A 平均分:{result['analysis']['summary']['variant_a_avg_score']:.2f}")
print(f"变体 B 平均分:{result['analysis']['summary']['variant_b_avg_score']:.2f}")
print(f"获胜者:{result['analysis']['summary']['winner']}")
print(f"质量差异显著:{result['analysis']['quality']['significant']}")
print(f"效应量:{result['analysis']['quality']['effect_size']}")

A/B 测试注意事项

问题说明应对策略
样本量不足统计检验需要足够的样本每个变体至少 30 个测试用例
随机性干扰LLM 输出的随机性可能影响结果每个用例运行 3-5 次取平均
Judge 偏差评估模型可能偏好某种风格使用多个 Judge 取平均
过拟合测试集反复优化导致对测试集过拟合保留 hold-out 测试集
多重比较同时测试多个指标增加假阳性使用 Bonferroni 校正

💡 最佳实践:在运行 A/B 测试前,先用功效分析(Power Analysis)计算所需样本量。小样本的"显著"结果往往不可靠。


回归测试:防止 Prompt 修改破坏已有能力

什么是 Agent 回归测试?

回归测试的核心目标:确保新的修改不会破坏已有的正确行为。在 Agent 开发中,这意味着每次修改 Prompt、工具描述或模型参数后,都要验证关键场景仍然正常工作。

回归测试策略

策略说明适用场景
快照测试记录期望输出,检查新输出是否偏离输出相对固定的场景
行为测试检查关键行为是否正确(如工具调用)工具使用场景
语义测试用 LLM Judge 检查语义等价性输出变化但语义不变的场景
边界测试测试极端输入和边界情况稳健性验证

回归测试框架实现

class RegressionTestSuite:
    """Agent 回归测试套件"""

    def __init__(
        self,
        agent_factory: Callable,
        judge_model: str = "gpt-4.1"
    ):
        self.agent_factory = agent_factory
        self.judge_llm = ChatOpenAI(model=judge_model, temperature=0)
        self.baselines: dict[str, dict] = {}  # 基线结果
        self.test_cases: list[dict] = []

    def register_test(
        self,
        name: str,
        query: str,
        test_type: str = "semantic",  # snapshot / behavior / semantic / boundary
        expected_tools: list[str] = None,
        expected_keywords: list[str] = None,
        expected_snapshot: str = None,
        category: str = "default",
        tolerance: float = 0.1  # 语义相似度容差
    ):
        """注册回归测试用例"""
        self.test_cases.append({
            "name": name,
            "query": query,
            "type": test_type,
            "expected_tools": expected_tools or [],
            "expected_keywords": expected_keywords or [],
            "expected_snapshot": expected_snapshot,
            "category": category,
            "tolerance": tolerance
        })

    def save_baseline(self, output_path: str = "baseline.json"):
        """保存当前结果作为基线"""
        baseline_results = {}
        agent = self.agent_factory()

        for case in self.test_cases:
            result = agent(case["query"])
            baseline_results[case["name"]] = {
                "output": result if isinstance(result, str) else str(result),
                "query": case["query"],
                "timestamp": time.time()
            }

        with open(output_path, "w") as f:
            json.dump(baseline_results, f, ensure_ascii=False, indent=2)

        self.baselines = baseline_results
        print(f"基线已保存到 {output_path},共 {len(baseline_results)} 个用例")

    def load_baseline(self, input_path: str = "baseline.json"):
        """加载已有基线"""
        with open(input_path, "r") as f:
            self.baselines = json.load(f)
        print(f"已加载基线,共 {len(self.baselines)} 个用例")

    def run_regression(self) -> dict:
        """运行回归测试"""
        agent = self.agent_factory()
        results = []

        for case in self.test_cases:
            # 执行测试
            actual_output = agent(case["query"])
            actual_str = actual_output if isinstance(actual_output, str) else str(actual_output)

            # 根据类型检查
            if case["type"] == "snapshot":
                passed = self._check_snapshot(case, actual_str)
            elif case["type"] == "behavior":
                passed = self._check_behavior(case, actual_output)
            elif case["type"] == "semantic":
                passed = self._check_semantic(case, actual_str)
            elif case["type"] == "boundary":
                passed = self._check_boundary(case, actual_str)
            else:
                passed = True  # 未知类型默认通过

            results.append({
                "name": case["name"],
                "category": case["category"],
                "type": case["type"],
                "passed": passed,
                "query": case["query"]
            })

        # 汇总
        total = len(results)
        passed = sum(1 for r in results if r["passed"])

        return {
            "total": total,
            "passed": passed,
            "failed": total - passed,
            "pass_rate": passed / total if total > 0 else 0,
            "details": results
        }

    def _check_snapshot(self, case: dict, actual: str) -> bool:
        """快照测试:检查输出是否与基线完全匹配"""
        baseline = self.baselines.get(case["name"])
        if not baseline:
            return True  # 无基线则跳过

        expected = baseline["output"]
        # 允许空白差异
        return actual.strip() == expected.strip()

    def _check_behavior(self, case: dict, actual_output) -> bool:
        """行为测试:检查关键行为是否正确"""
        checks_passed = 0
        total_checks = 0

        # 检查工具调用
        if case["expected_tools"]:
            total_checks += 1
            actual_tools = set()
            if isinstance(actual_output, dict):
                actual_tools = set(actual_output.get("tools_called", []))

            expected_tools = set(case["expected_tools"])
            if expected_tools.issubset(actual_tools):
                checks_passed += 1

        # 检查关键词
        if case["expected_keywords"]:
            total_checks += 1
            actual_str = str(actual_output)
            if all(kw in actual_str for kw in case["expected_keywords"]):
                checks_passed += 1

        if total_checks == 0:
            return True

        return checks_passed == total_checks

    def _check_semantic(self, case: dict, actual: str) -> bool:
        """语义测试:用 LLM Judge 检查语义等价性"""
        baseline = self.baselines.get(case["name"])
        if not baseline:
            return True

        expected = baseline["output"]

        prompt = f"""请判断以下两个回答在语义上是否等价。

问题:{case['query']}

回答 A(基线):
{expected}

回答 B(当前):
{actual}

只回复 JSON:{{"equivalent": true/false, "confidence": 0.0-1.0}}"""

        response = self.judge_llm.invoke(prompt)
        try:
            result = json.loads(response.content)
            return result.get("equivalent", False) and result.get("confidence", 0) >= (1 - case["tolerance"])
        except json.JSONDecodeError:
            return False

    def _check_boundary(self, case: dict, actual: str) -> bool:
        """边界测试:检查输出是否合理(没有崩溃、空输出等)"""
        # 基本检查
        if not actual or len(actual.strip()) < 5:
            return False

        # 检查是否有异常标记
        error_markers = ["error", "exception", "traceback", "I cannot", "无法完成"]
        actual_lower = actual.lower()
        for marker in error_markers:
            if marker in actual_lower:
                return False

        return True

实战:为客服 Agent 建立回归测试

# 创建回归测试套件
agent_factory = create_agent_a  # 使用你的 Agent 工厂函数

suite = RegressionTestSuite(
    agent_factory=agent_factory,
    judge_model="gpt-4.1"
)

# 注册测试用例
suite.register_test(
    name="订单查询",
    query="查询我的订单状态",
    test_type="behavior",
    expected_tools=["query_order"],
    expected_keywords=["订单"],
    category="核心功能"
)

suite.register_test(
    name="退货流程",
    query="我想退货,请问怎么操作?",
    test_type="semantic",
    category="核心功能"
)

suite.register_test(
    name="空输入处理",
    query="",
    test_type="boundary",
    category="边界处理"
)

suite.register_test(
    name="超长输入处理",
    query="请帮我处理" + "非常紧急" * 100,
    test_type="boundary",
    category="边界处理"
)

# 首次运行:保存基线
suite.save_baseline("regression_baseline.json")

# 后续运行:加载基线并回归测试
suite.load_baseline("regression_baseline.json")
results = suite.run_regression()

print(f"回归测试结果:{results['passed']}/{results['total']} 通过")
print(f"通过率:{results['pass_rate']:.1%}")

# 输出失败用例
for detail in results["details"]:
    if not detail["passed"]:
        print(f"  ❌ {detail['name']} ({detail['type']}): {detail['query']}")

CI/CD 集成:自动化评估流水线

为什么需要 CI/CD 集成?

手动运行测试容易遗漏。将 Agent 评估集成到 CI/CD 流水线中,可以在每次代码变更时自动运行评估,及时发现问题。

GitHub Actions 自动评估配置

# .github/workflows/agent_eval.yml
name: Agent Evaluation

on:
  pull_request:
    paths:
      - 'agent/**'
      - 'prompts/**'
      - 'tests/**'
  schedule:
    # 每天凌晨 2 点运行完整评估
    - cron: '0 2 * * *'

jobs:
  regression-test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          pip install -r requirements.txt
          pip install pytest scipy numpy

      - name: Run Regression Tests
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          python -m pytest tests/regression/ -v --tb=short

      - name: Run Agent A/B Test
        if: github.event_name == 'pull_request'
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          python scripts/run_ab_test.py \
            --baseline main \
            --variant ${{ github.head_ref }} \
            --output results/ab_test.json

      - name: Check Quality Gate
        run: |
          python scripts/check_quality_gate.py \
            --results results/ab_test.json \
            --min-pass-rate 0.85 \
            --max-regression 0.05

      - name: Upload Results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: evaluation-results
          path: results/

质量门禁脚本

"""
质量门禁检查脚本
用于 CI/CD 流水线,根据评估结果决定是否允许合并
"""
import json
import sys
from dataclasses import dataclass


@dataclass
class QualityGateConfig:
    """质量门禁配置"""
    min_pass_rate: float = 0.85       # 最低回归测试通过率
    max_regression: float = 0.05      # 最大允许退步幅度
    min_ab_score: float = 0.7         # A/B 测试最低得分
    max_latency_increase: float = 0.2 # 最大延迟增加比例
    max_token_increase: float = 0.2   # 最大 Token 增加比例


class QualityGate:
    """质量门禁检查"""

    def __init__(self, config: QualityGateConfig = None):
        self.config = config or QualityGateConfig()

    def check(
        self,
        regression_results: dict = None,
        ab_test_results: dict = None
    ) -> dict:
        """执行质量门禁检查"""
        checks = []

        # 检查 1:回归测试通过率
        if regression_results:
            pass_rate = regression_results.get("pass_rate", 0)
            checks.append({
                "name": "回归测试通过率",
                "value": pass_rate,
                "threshold": self.config.min_pass_rate,
                "passed": pass_rate >= self.config.min_pass_rate,
                "message": (
                    f"通过率 {pass_rate:.1%} >= {self.config.min_pass_rate:.1%}"
                    if pass_rate >= self.config.min_pass_rate
                    else f"通过率 {pass_rate:.1%} < {self.config.min_pass_rate:.1%}"
                )
            })

        # 检查 2:A/B 测试退步幅度
        if ab_test_results:
            analysis = ab_test_results.get("analysis", {})
            quality = analysis.get("quality", {})
            score_a = quality.get("mean_a", 0)
            score_b = quality.get("mean_b", 0)

            # B 是新版本,A 是基线
            regression = max(0, score_a - score_b)

            checks.append({
                "name": "质量退步幅度",
                "value": regression,
                "threshold": self.config.max_regression,
                "passed": regression <= self.config.max_regression,
                "message": (
                    f"退步 {regression:.3f} <= {self.config.max_regression}"
                    if regression <= self.config.max_regression
                    else f"退步 {regression:.3f} > {self.config.max_regression}"
                )
            })

            # 检查 3:延迟增加
            latency = analysis.get("latency", {})
            lat_a = latency.get("mean_a", 0)
            lat_b = latency.get("mean_b", 0)
            if lat_a > 0:
                latency_increase = (lat_b - lat_a) / lat_a
                checks.append({
                    "name": "延迟增加比例",
                    "value": latency_increase,
                    "threshold": self.config.max_latency_increase,
                    "passed": latency_increase <= self.config.max_latency_increase,
                    "message": (
                        f"延迟增加 {latency_increase:.1%} <= {self.config.max_latency_increase:.1%}"
                        if latency_increase <= self.config.max_latency_increase
                        else f"延迟增加 {latency_increase:.1%} > {self.config.max_latency_increase:.1%}"
                    )
                })

            # 检查 4:Token 增加比例
            token_usage = analysis.get("token_usage", {})
            tok_a = token_usage.get("mean_a", 0)
            tok_b = token_usage.get("mean_b", 0)
            if tok_a > 0:
                token_increase = (tok_b - tok_a) / tok_a
                checks.append({
                    "name": "Token 增加比例",
                    "value": token_increase,
                    "threshold": self.config.max_token_increase,
                    "passed": token_increase <= self.config.max_token_increase,
                    "message": (
                        f"Token 增加 {token_increase:.1%} <= {self.config.max_token_increase:.1%}"
                        if token_increase <= self.config.max_token_increase
                        else f"Token 增加 {token_increase:.1%} > {self.config.max_token_increase:.1%}"
                    )
                })

        all_passed = all(c["passed"] for c in checks)

        return {
            "passed": all_passed,
            "checks": checks,
            "summary": (
                "所有质量门禁检查通过 ✅"
                if all_passed
                else f"{sum(1 for c in checks if not c['passed'])} 项检查未通过 ❌"
            )
        }


def main():
    """CI/CD 入口"""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--regression", help="回归测试结果文件")
    parser.add_argument("--ab-test", help="A/B 测试结果文件")
    parser.add_argument("--min-pass-rate", type=float, default=0.85)
    parser.add_argument("--max-regression", type=float, default=0.05)
    args = parser.parse_args()

    # 加载结果
    regression_results = None
    ab_test_results = None

    if args.regression:
        with open(args.regression) as f:
            regression_results = json.load(f)

    if args.ab_test:
        with open(args.ab_test) as f:
            ab_test_results = json.load(f)

    # 运行质量门禁
    config = QualityGateConfig(
        min_pass_rate=args.min_pass_rate,
        max_regression=args.max_regression
    )
    gate = QualityGate(config)
    result = gate.check(regression_results, ab_test_results)

    print(result["summary"])
    for check in result["checks"]:
        status = "✅" if check["passed"] else "❌"
        print(f"  {status} {check['name']}: {check['message']}")

    # 非零退出码表示检查未通过
    sys.exit(0 if result["passed"] else 1)


if __name__ == "__main__":
    main()

测试流水线架构

代码变更 → CI/CD 触发
    │
    ├── 1. 回归测试(快速,< 5 分钟)
    │   ├── 快照测试
    │   ├── 行为测试
    │   └── 边界测试
    │
    ├── 2. A/B 测试(中等,10-30 分钟)
    │   ├── 运行基线版本
    │   ├── 运行新版本
    │   └── LLM Judge 评估
    │
    └── 3. 质量门禁检查
        ├── 回归通过率 ≥ 85%
        ├── 质量退步 ≤ 5%
        ├── 延迟增加 ≤ 20%
        └── Token 增加 ≤ 20%

CI/CD 集成注意事项

问题说明应对策略
API 成本每次提交都跑评估,API 费用高PR 只跑回归测试,定时跑完整评估
执行时间LLM 评估耗时较长并行化 + 结果缓存
非确定性同一代码两次评估可能结果不同多次运行取平均 + 容差阈值
误报偶尔的质量波动被标记为退步设置合理的容差,人工复查
秘钥安全API Key 不能硬编码使用 GitHub Secrets

💡 最佳实践:为不同类型的变更设置不同的评估策略。Prompt 变更跑完整 A/B 测试,代码变更只跑回归测试,文档变更跳过评估。


小结

概念说明
A/B 测试在相同测试集上对比两个 Agent 变体,统计检验差异显著性
统计显著性用 Welch t-test 判断差异是否由随机波动导致
效应量Cohen's d 衡量差异的实际意义(小/中/大)
回归测试确保修改不会破坏已有能力,4 种策略:快照/行为/语义/边界
质量门禁CI/CD 中的自动化检查,不达标则阻止合并
CI/CD 集成GitHub Actions 自动运行评估,结果作为 PR 合并条件

下一节预告:我们将学习模型路由评估,了解如何根据任务复杂度智能选择模型,在成本和质量之间找到最佳平衡。


参考文献

[1] KOHAVI R, TANG D, XU Y. Trustworthy Online Controlled Experiments: A Practical Guide to A/B Testing[M]. Cambridge University Press, 2020.

[2] ZHENG L, CHIANG W L, SHENG Y, et al. Judging LLM-as-a-judge with MT-bench and chatbot arena[C]//NeurIPS. 2023.

[3] TAMARIT J, SNOEK J, METZE F. A/B Testing for LLMs: Practical Considerations and Pitfalls[J]. arXiv preprint, 2024.


17.8 模型路由评估