-
Notifications
You must be signed in to change notification settings - Fork 151
Expand file tree
/
Copy pathagent_evaluation_batch.py
More file actions
143 lines (117 loc) · 5.17 KB
/
agent_evaluation_batch.py
File metadata and controls
143 lines (117 loc) · 5.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
"""Batch evaluation of agent responses using Azure AI Evaluation's evaluate() function.
Reads evaluation data from a JSONL file (produced by agent_evaluation_generate.py) and runs
all evaluators in a single batch call. Optionally logs results to Microsoft Foundry
if AZURE_AI_PROJECT is set.
Usage:
python agent_evaluation_batch.py # uses eval_data.jsonl
AZURE_AI_PROJECT=<url> python agent_evaluation_batch.py # logs to Microsoft Foundry
"""
import logging
import os
from pathlib import Path
import rich
from azure.ai.evaluation import (
AzureOpenAIModelConfiguration,
IntentResolutionEvaluator,
OpenAIModelConfiguration,
ResponseCompletenessEvaluator,
TaskAdherenceEvaluator,
ToolCallAccuracyEvaluator,
evaluate,
)
from dotenv import load_dotenv
from rich.logging import RichHandler
from rich.table import Table
# Console logging: RichHandler renders tracebacks/messages nicely. The root
# logger stays at WARNING (quiets azure SDK chatter); this module's own logger
# is opted in at INFO below.
handler = RichHandler(show_path=False, rich_tracebacks=True, show_level=False)
logging.basicConfig(level=logging.WARNING, handlers=[handler], force=True, format="%(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Load .env first so the os.getenv / os.environ reads below see its values;
# override=True lets the .env file win over already-exported variables.
load_dotenv(override=True)
# Selects which model backend to configure: "azure" (default) or anything else
# for plain OpenAI.
API_HOST = os.getenv("API_HOST", "azure")
if API_HOST == "azure":
    # Azure OpenAI: endpoint and deployment are required (os.environ raises
    # KeyError if unset); auth is handled by the SDK's default credential flow.
    model_config = AzureOpenAIModelConfiguration(
        type="azure_openai",
        azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
        azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT"],
    )
else:
    # OpenAI: API key is required; model name falls back to a default when
    # OPENAI_MODEL is not set.
    model_config = OpenAIModelConfiguration(
        type="openai",
        api_key=os.environ["OPENAI_API_KEY"],
        model=os.environ.get("OPENAI_MODEL", "gpt-5.4"),
    )
# Optional: Set AZURE_AI_PROJECT in .env to log results to Microsoft Foundry.
# Example: https://your-account.services.ai.azure.com/api/projects/your-project
AZURE_AI_PROJECT = os.getenv("AZURE_AI_PROJECT")
def display_evaluation_results(eval_result: dict) -> None:
    """Render each evaluated row as a rich table of evaluator scores.

    For every row in ``eval_result["rows"]`` a table is printed with one line
    per evaluator showing its numeric score, pass/fail verdict (colorized),
    and the evaluator's reasoning text.
    """
    # Display label -> key prefix used in the flattened batch-result columns
    # ("outputs.<key>.<key>", "...<key>_result", "...<key>_reason").
    evaluator_keys = (
        ("IntentResolution", "intent_resolution"),
        ("ResponseCompleteness", "response_completeness"),
        ("TaskAdherence", "task_adherence"),
        ("ToolCallAccuracy", "tool_call_accuracy"),
    )
    # Colorized markup for the two known verdicts; anything else is shown as-is.
    verdict_markup = {"pass": "[green]pass[/green]", "fail": "[red]fail[/red]"}
    for row_number, row in enumerate(eval_result.get("rows", []), start=1):
        table = Table(title=f"Evaluation Results - Row {row_number}", show_lines=True)
        table.add_column("Evaluator", style="cyan", width=28)
        table.add_column("Score", style="bold", justify="center", width=8)
        table.add_column("Result", justify="center", width=8)
        table.add_column("Reason", style="dim", width=70)
        for label, key in evaluator_keys:
            verdict = row.get(f"outputs.{key}.{key}_result", "N/A")
            table.add_row(
                label,
                str(row.get(f"outputs.{key}.{key}", "N/A")),
                verdict_markup.get(verdict, str(verdict)),
                row.get(f"outputs.{key}.{key}_reason", "N/A"),
            )
        rich.print()  # blank line between tables
        rich.print(table)
def main() -> None:
    """Run batch evaluation on a JSONL data file.

    Expects ``eval_data.jsonl`` next to this script (produced by
    agent_evaluation_generate.py). Results go to Microsoft Foundry when
    AZURE_AI_PROJECT is set, otherwise to a local ``eval_results.json``.
    """
    data_path = Path(__file__).parent / "eval_data.jsonl"
    # Guard clause: nothing to evaluate without the generated data file.
    if not data_path.exists():
        logger.error(f"Data file not found: {data_path}")
        logger.error("Run agent_evaluation_generate.py first to generate evaluation data.")
        return
    logger.info(f"Running batch evaluation on {data_path}...")

    # Either log to the cloud project or write results locally — never both.
    if AZURE_AI_PROJECT:
        logger.info(f"Logging results to Azure AI project: {AZURE_AI_PROJECT}")
        extra_kwargs: dict = {"azure_ai_project": AZURE_AI_PROJECT}
    else:
        extra_kwargs = {"output_path": str(Path(__file__).parent / "eval_results.json")}

    # All four evaluators share the same model config and reasoning-model flag.
    evaluator_classes = {
        "intent_resolution": IntentResolutionEvaluator,
        "response_completeness": ResponseCompletenessEvaluator,
        "task_adherence": TaskAdherenceEvaluator,
        "tool_call_accuracy": ToolCallAccuracyEvaluator,
    }
    eval_result = evaluate(
        data=data_path,
        evaluators={
            name: cls(model_config, is_reasoning_model=True)
            for name, cls in evaluator_classes.items()
        },
        # ResponseCompletenessEvaluator expects a plain text response, not a message list,
        # so we override its column mapping to use response_text and ground_truth.
        # Other evaluators auto-map correctly since data keys match param names.
        evaluator_config={
            "response_completeness": {
                "column_mapping": {
                    "response": "${data.response_text}",
                    "ground_truth": "${data.ground_truth}",
                }
            },
        },
        **extra_kwargs,
    )

    display_evaluation_results(eval_result)
    if AZURE_AI_PROJECT:
        # evaluate() returns a studio_url when results were uploaded to a project.
        studio_url = eval_result.get("studio_url")
        if studio_url:
            print(f"\nView results in Microsoft Foundry:\n{studio_url}")
    else:
        logger.info("Results saved to eval_results.json")
# Script entry point: run the batch evaluation only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()