#!/usr/bin/env python3
"""
Interactive Demo Script for GenEval Framework
Allows control over number of test cases and shows detailed evaluation output
"""
import json
import sys
from pathlib import Path

import yaml
from dotenv import load_dotenv

# Add the project root to the Python path so the local geneval package can be
# imported even when it is not installed.
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))

from geneval import GenEvalFramework  # noqa: E402

load_dotenv()

# # Configure logging to show INFO level messages
# logging.basicConfig(
#     level=logging.INFO,
#     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
#     handlers=[
#         logging.StreamHandler(sys.stdout)
#     ]
# )


def load_test_data():
    """Load test data from YAML file"""
    test_data_path = project_root / "tests" / "test_data_clean.yaml"
    if not test_data_path.exists():
        print(f"Test data file not found: {test_data_path}")
        return None
    with open(test_data_path) as f:
        return yaml.safe_load(f)
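
# For reference, a minimal sketch of the structure this script expects in
# tests/test_data_clean.yaml, inferred from the fields accessed elsewhere in
# this file; the real file may carry more fields and many more cases:
#
#   framework_config:
#     metrics:
#       ragas: [faithfulness, answer_relevancy, context_recall, ...]
#       deepeval: [faithfulness, answer_relevancy, context_recall, ...]
#   test_cases:
#     - id: example_case_1
#       user_input: "..."
#       response: "..."
#       reference: "..."
#       retrieved_contexts: ...   # the retrieved context passage(s)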


def get_user_preferences():
    """Get user preferences for the demo"""
    print("GenEval Framework Interactive Demo")
    print("=" * 50)

    # Check if config file exists
    config_path = project_root / "config" / "llm_config.yaml"
    if not config_path.exists():
        print(f"\nLLM configuration file not found: {config_path}")
        print("Please create a config file with your LLM provider settings.")
        print("Example config:")
        print("  providers:")
        print("    openai:")
        print("      enabled: true")
        print("      default: true")
        print('      api_key_env: "OPENAI_API_KEY"')
        print('      model: "gpt-4o-mini"')
        print("\nExiting demo - please configure your LLM settings first.")
        return None, None

    print(f"\nUsing LLM configuration from: {config_path}")
    print("The framework will use the default provider from your config file.")

    # Number of test cases
    while True:
        try:
            num_cases = input("\nHow many test cases to run? (1-10, default: 3): ").strip()
            if not num_cases:
                num_cases = 3
                break
            num_cases = int(num_cases)
            if 1 <= num_cases <= 10:
                break
            else:
                print("Please enter a number between 1 and 10")
        except ValueError:
            print("Please enter a valid number")

    # Metric selection - show unique metrics
    unique_metrics = [
        "context_precision_without_reference",
        "context_precision_with_reference",
        "context_recall",
        "context_entity_recall",
        "noise_sensitivity",
        "answer_relevancy",
        "faithfulness",
        "context_relevance",
        "context_precision",
    ]
    print("\nAvailable metrics (9 unique):")
    for i, metric in enumerate(unique_metrics, 1):
        print(f"{i:2d}. {metric}")
    print("\nNote: Some metrics (like 'faithfulness') are available in both RAGAS and DeepEval.")
    print("This will give you up to 12 total evaluations from 9 unique concepts.")
    print("\nMetric selection:")
    print("- Enter 'all' for all 9 metrics (will run 12 evaluations)")
    print("- Enter numbers (comma-separated, e.g., 1,3,6)")

    while True:
        selection = input("\nSelect metrics (default: all): ").strip().lower()
        if not selection or selection == "all":
            selected_metrics = unique_metrics
            break
        else:
            try:
                indices = [int(x.strip()) - 1 for x in selection.split(",")]
                if all(0 <= i < len(unique_metrics) for i in indices):
                    selected_metrics = [unique_metrics[i] for i in indices]
                    break
                else:
                    print("Invalid metric numbers. Please use numbers 1-9.")
            except ValueError:
                print("Please enter 'all' or numbers separated by commas (e.g., 1,3,6)")

    return int(num_cases), selected_metrics


def convert_to_framework_metrics(selected_metrics, test_data):
    """Convert unique metrics to framework-specific format"""
    ragas_available = test_data["framework_config"]["metrics"]["ragas"]
    deepeval_available = test_data["framework_config"]["metrics"]["deepeval"]
    framework_metrics = []
    for metric in selected_metrics:
        # Add RAGAS version if available
        if metric in ragas_available:
            framework_metrics.append(f"ragas.{metric}")
        # Add DeepEval version if available
        if metric in deepeval_available:
            framework_metrics.append(f"deepeval.{metric}")
    return framework_metrics
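
# Illustrative example (hypothetical availability lists): if the YAML lists
# faithfulness under both frameworks but context_precision only under RAGAS,
# then selecting ["faithfulness", "context_precision"] would yield
# ["ragas.faithfulness", "deepeval.faithfulness", "ragas.context_precision"].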


def display_test_case_info(test_case, case_num, total_cases):
    """Display information about the current test case"""
    print(f"\n{'='*60}")
    print(f"TEST CASE {case_num}/{total_cases}: {test_case['id']}")
    print(f"{'='*60}")
    print(f"Question: {test_case['user_input']}")
    print(f"Response: {test_case['response']}")
    print(f"Reference: {test_case['reference']}")
    print(f"Context Length: {len(test_case['retrieved_contexts'])} characters")


def display_evaluation_results(results):
    """Display evaluation results in JSON format"""
    json_results = {}
    for metric_key, (adapter_name, output) in results.items():
        json_results[metric_key] = {"adapter": adapter_name, "metrics": []}
        for metric_result in output.metrics:
            metric_data = {
                "name": metric_result.name,
                "score": metric_result.score,
                "details": (metric_result.details if hasattr(metric_result, "details") else None),
            }
            json_results[metric_key]["metrics"].append(metric_data)
        if output.metadata:
            json_results[metric_key]["metadata"] = output.metadata
    print(json.dumps(json_results, indent=2, ensure_ascii=False))
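
# The printed JSON roughly follows this shape (keys, names, and scores below
# are illustrative, not real output):
#
#   {
#     "ragas.faithfulness": {
#       "adapter": "ragas",
#       "metrics": [{"name": "faithfulness", "score": 0.92, "details": null}],
#       "metadata": {...}
#     }
#   }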


def calculate_test_case_stats(all_results):
    """Calculate statistics by test case and adapter.metric"""
    adapter_metric_scores = {}
    for case_idx, case_results in enumerate(all_results):
        for _metric_key, (adapter_name, output) in case_results.items():
            for metric_result in output.metrics:
                # Use adapter.metric format to keep them separate
                full_metric_name = f"{adapter_name}.{metric_result.name}"
                if full_metric_name not in adapter_metric_scores:
                    adapter_metric_scores[full_metric_name] = []
                if metric_result.score is not None:
                    adapter_metric_scores[full_metric_name].append({"case": case_idx + 1, "score": metric_result.score})
    return adapter_metric_scores
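
# Example of the returned mapping (values illustrative): per-case scores are
# grouped under "adapter.metric" keys so RAGAS and DeepEval stay separate:
#
#   {"ragas.faithfulness": [{"case": 1, "score": 0.91}, {"case": 2, "score": 0.84}]}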


def display_final_summary(all_results, metrics, num_cases):
    """Display final summary by test case and keep adapters separate"""
    print(f"\n{'='*100}")
    print(f"FINAL SUMMARY - {num_cases} Test Cases")
    print(f"{'='*100}")

    # Show which metrics ran in both frameworks
    ragas_metrics = [m for m in metrics if m.startswith("ragas.")]
    deepeval_metrics = [m for m in metrics if m.startswith("deepeval.")]
    print(f"Total evaluations: {len(metrics)} ({len(ragas_metrics)} RAGAS + {len(deepeval_metrics)} DeepEval)")
    print(f"Unique concepts: {len({m.split('.', 1)[1] for m in metrics})}")
    print(f"{'='*100}")

    adapter_metric_scores = calculate_test_case_stats(all_results)
    print(f"{'Adapter.Metric':<40} {'Cases':<8} {'Avg Score':<12} {'Min':<8} {'Max':<8}")
    print("-" * 85)
    for full_metric_name, score_data in sorted(adapter_metric_scores.items()):
        if score_data:
            scores = [item["score"] for item in score_data]
            avg_score = sum(scores) / len(scores)
            min_score = min(scores)
            max_score = max(scores)
            print(f"{full_metric_name:<40} {len(scores):<8} {avg_score:<12.3f} {min_score:<8.3f} {max_score:<8.3f}")
        else:
            print(f"{full_metric_name:<40} {'0':<8} {'N/A':<12} {'N/A':<8} {'N/A':<8}")

    # Test case summary
    print(f"\n{'='*60}")
    print("TEST CASE SUMMARY")
    print(f"{'='*60}")
    for case_idx in range(num_cases):
        case_num = case_idx + 1
        print(f"\nTest Case {case_num}:")
        case_metrics = {}
        if case_idx < len(all_results):
            case_results = all_results[case_idx]
            for _metric_key, (adapter_name, output) in case_results.items():
                for metric_result in output.metrics:
                    full_metric_name = f"{adapter_name}.{metric_result.name}"
                    if metric_result.score is not None:
                        case_metrics[full_metric_name] = metric_result.score
        if case_metrics:
            for metric_name, score in sorted(case_metrics.items()):
                print(f"  {metric_name:<35}: {score:.3f}")
        else:
            print("  No results available")


def main():
    """Main demo function"""
    # Load test data
    test_data = load_test_data()
    if not test_data:
        return

    # Get user preferences
    preferences = get_user_preferences()
    if preferences[0] is None:
        return
    num_cases, selected_metrics = preferences

    # Convert unique metrics to framework-specific format
    metrics = convert_to_framework_metrics(selected_metrics, test_data)

    print("\nConfiguration:")
    print("  LLM Provider: Config-driven")
    print(f"  Test cases: {num_cases}")
    print(f"  Selected metrics: {len(selected_metrics)} unique ({', '.join(selected_metrics)})")
    print(f"  Framework evaluations: {len(metrics)} total ({', '.join(metrics)})")

    # Show which metrics will run in both frameworks
    ragas_metrics = [m for m in metrics if m.startswith("ragas.")]
    deepeval_metrics = [m for m in metrics if m.startswith("deepeval.")]
    print("\nFramework breakdown:")
    print(f"  RAGAS evaluations: {len(ragas_metrics)} ({', '.join(ragas_metrics)})")
    print(f"  DeepEval evaluations: {len(deepeval_metrics)} ({', '.join(deepeval_metrics)})")

    # Show overlapping metrics
    overlapping = []
    for metric in selected_metrics:
        if metric in ["faithfulness", "answer_relevancy", "context_recall"]:
            overlapping.append(metric)
    if overlapping:
        print(f"\nOverlapping metrics (run in both frameworks): {', '.join(overlapping)}")
        print(f"Expected total: {len(selected_metrics) + len(overlapping)} evaluations")

    # Initialize framework with config path
    print("\nInitializing GenEval Framework...")
    try:
        config_path = str(project_root / "config" / "llm_config.yaml")
        framework = GenEvalFramework(config_path=config_path)
        print("Framework initialized successfully")
    except Exception as e:
        print(f"Framework initialization failed: {e}")
        print("Please check your config file and API keys.")
        print("Exiting demo - please configure your LLM settings and try again")
        return

    # Run evaluations
    all_results = []
    for i in range(num_cases):
        test_case = test_data["test_cases"][i]

        # Display test case info
        display_test_case_info(test_case, i + 1, num_cases)

        try:
            # Run actual evaluation
            print(f"\nRunning evaluation with {len(metrics)} metrics...")
            results = framework.evaluate(
                question=test_case["user_input"],
                response=test_case["response"],
                reference=test_case["reference"],
                retrieval_context=test_case["retrieved_contexts"],
                metrics=metrics,
            )
            # Display results
            display_evaluation_results(results)
            all_results.append(results)
        except Exception as e:
            print(f"Error evaluating case {i+1}: {e}")
            continue

    # Display final summary if we have results
    if all_results:
        display_final_summary(all_results, metrics, num_cases)
        print("\nDemo completed!")
        print(f"Evaluated {len(all_results)} test cases successfully")


if __name__ == "__main__":
    main()
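
# Typical usage (assumes the provider API key, e.g. OPENAI_API_KEY, is set in
# the environment or a .env file, and that config/llm_config.yaml exists):
#
#   python demo_interactive.py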