
Commit 40e1837

add justification type to all evaluators and tests for that
1 parent fec883e commit 40e1837

15 files changed: 626 additions & 47 deletions

src/uipath/_cli/_evals/_models/_output.py

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ class EvaluationResultDto(BaseModel):
     model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

     score: float
-    details: Optional[str] = None
+    details: Optional[str | BaseModel] = None
     evaluation_time: Optional[float] = None

     @model_serializer(mode="wrap")
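
With details widened to accept either a string or a Pydantic model, evaluators can attach structured justifications to a result instead of pre-formatted text. A minimal sketch of both forms; the import path simply mirrors the file above, and OrderJustification is a hypothetical stand-in for the structured justifications introduced later in this commit:

from pydantic import BaseModel

from uipath._cli._evals._models._output import EvaluationResultDto  # assumed module path, per the file above


class OrderJustification(BaseModel):  # hypothetical example model
    expected_tool_calls_order: list[str]
    actual_tool_calls_order: list[str]
    lcs: list[str]


# Plain-string details (previous behaviour, still supported)
text_result = EvaluationResultDto(score=1.0, details="exact match")

# Structured details (enabled by this change)
model_result = EvaluationResultDto(
    score=0.67,
    details=OrderJustification(
        expected_tool_calls_order=["search", "fetch", "summarize"],
        actual_tool_calls_order=["search", "summarize"],
        lcs=["search", "summarize"],
    ),
)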

src/uipath/eval/_helpers/coded_evaluators_helpers.py

Lines changed: 35 additions & 21 deletions
@@ -92,7 +92,7 @@ def tool_calls_order_score(
     actual_tool_calls_names: Sequence[str],
     expected_tool_calls_names: Sequence[str],
     strict: bool = False,
-) -> tuple[float, str]:
+) -> tuple[float, dict[str, Any]]:
     """The function calculates a score based on LCS applied to the order of the tool calls.

     It calculates the longest common subsequence between the actual tool calls
@@ -107,18 +107,22 @@ def tool_calls_order_score(
     Returns:
         tuple[float, str]: Ratio of the LCS length to the number of expected, and the LCS string
     """
-    justification_template = f"Expected tool calls: {expected_tool_calls_names}\nActual tool calls: {actual_tool_calls_names}"
-    if not strict:
-        justification_template += "\nLongest common subsequence: {lcs}"
+    justification = {
+        "actual_tool_calls_order": actual_tool_calls_names,
+        "expected_tool_calls_order": expected_tool_calls_names,
+        "lcs": [],
+    }
+
     if expected_tool_calls_names == actual_tool_calls_names:
-        return 1.0, justification_template.format(lcs=actual_tool_calls_names)
+        justification["lcs"] = actual_tool_calls_names
+        return 1.0, justification
     elif (
         not expected_tool_calls_names
         or not actual_tool_calls_names
         or strict
         and actual_tool_calls_names != expected_tool_calls_names
     ):
-        return 0.0, justification_template.format(lcs="")
+        return 0.0, justification

     # Calculate LCS with full DP table for efficient reconstruction
     m, n = len(actual_tool_calls_names), len(expected_tool_calls_names)
@@ -147,14 +151,16 @@ def tool_calls_order_score(

     lcs.reverse()  # Reverse to get correct order
     lcs_length = len(lcs)
-    return lcs_length / n, justification_template.format(lcs=" ".join(lcs))
+    justification["lcs"] = lcs
+    return lcs_length / n, justification


 def tool_calls_count_score(
     actual_tool_calls_count: Mapping[str, int],
     expected_tool_calls_count: Mapping[str, tuple[str, int]],
     strict: bool = False,
-) -> tuple[float, str]:
+    justification_key: str = "explained_tool_calls_count",
+) -> tuple[float, dict[str, Any]]:
     """Check if the expected tool calls are correctly called, where expected args must be a subset of actual args.

     Args:
@@ -166,34 +172,38 @@ def tool_calls_count_score(
         tuple[float, str]: Score based on the number of matches, and the justification.
     """
     if not expected_tool_calls_count and not actual_tool_calls_count:
-        return 1.0, "Both expected and actual tool calls are empty"
+        return 1.0, {justification_key: "Both expected and actual tool calls are empty"}
     elif not expected_tool_calls_count or not actual_tool_calls_count:
-        return 0.0, "Either expected or actual tool calls are empty"
+        return 0.0, {
+            justification_key: "Either expected or actual tool calls are empty"
+        }

     score = 0.0
-    justifications = []
+    justifications = {justification_key: {}}
     for tool_name, (
         expected_comparator,
         expected_count,
     ) in expected_tool_calls_count.items():
         actual_count = actual_tool_calls_count.get(tool_name, 0.0)
         comparator = f"__{COMPARATOR_MAPPINGS[expected_comparator]}__"
         to_add = float(getattr(actual_count, comparator)(expected_count))
-        justifications.append(
-            f"{tool_name}: Actual count: {actual_count}, Expected count: {expected_count}, Score: {to_add}"
+
+        justifications[justification_key][tool_name] = (
+            f"Actual: {actual_count}, Expected: {expected_count}, Score: {to_add}"
         )
         if strict and to_add == 0.0:
-            return 0.0, justifications[-1]
+            return 0.0, justifications
         score += to_add
-    return score / len(expected_tool_calls_count), "\n".join(justifications)
+    return score / len(expected_tool_calls_count), justifications


 def tool_calls_args_score(
     actual_tool_calls: list[ToolCall],
     expected_tool_calls: list[ToolCall],
     strict: bool = False,
     subset: bool = False,
-) -> tuple[float, str]:
+    justification_key: str = "explained_tool_calls_args",
+) -> tuple[float, dict[str, Any]]:
     """Check if the expected tool calls are correctly called, where expected args must be a subset of actual args.

     This function does not check the order of the tool calls!
@@ -208,13 +218,15 @@ def tool_calls_args_score(
         tuple[float, str]: Score based on the number of matches, and the justification
     """
     if not expected_tool_calls and not actual_tool_calls:
-        return 1.0, "Both expected and actual tool calls are empty"
+        return 1.0, {justification_key: "Both expected and actual tool calls are empty"}
     elif not expected_tool_calls or not actual_tool_calls:
-        return 0.0, "Either expected or actual tool calls are empty"
+        return 0.0, {
+            justification_key: "Either expected or actual tool calls are empty"
+        }

     cnt = 0
     visited: set[int] = set()
-    justifications = []
+    justifications = {justification_key: {}}
     for expected_tool_call in expected_tool_calls:
         for idx, call in enumerate(actual_tool_calls):
             if call.name == expected_tool_call.name and idx not in visited:
@@ -237,7 +249,9 @@ def tool_calls_args_score(
                     # Only possible in exact mode when key is missing
                     args_match = False

-                justifications.append(f"{call.name}: Args match: {args_match}")
+                justifications[justification_key][call.name] = (
+                    f"Actual: {call.args}, Expected: {expected_tool_call.args}, Score: {float(args_match)}"
+                )
                 if args_match:
                     cnt += 1
                     visited.add(idx)
@@ -247,7 +261,7 @@ def tool_calls_args_score(
         cnt / len(expected_tool_calls)
        if not strict
        else float(cnt == len(expected_tool_calls))
-    ), "\n".join(justifications)
+    ), justifications


 def tool_output_score(
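
The helper functions now return machine-readable dicts in place of pre-formatted justification strings. A rough usage sketch of tool_calls_order_score under the logic above; the import path mirrors the file name, and the values in the comments are what that logic should produce for these inputs, not captured output:

from uipath.eval._helpers.coded_evaluators_helpers import tool_calls_order_score  # assumed module path

score, justification = tool_calls_order_score(
    actual_tool_calls_names=["search", "summarize"],
    expected_tool_calls_names=["search", "fetch", "summarize"],
)

# The LCS is ["search", "summarize"], so score == 2 / 3 and justification is a dict:
# {
#     "actual_tool_calls_order": ["search", "summarize"],
#     "expected_tool_calls_order": ["search", "fetch", "summarize"],
#     "lcs": ["search", "summarize"],
# }

tool_calls_count_score and tool_calls_args_score follow the same pattern, namespacing their per-tool explanations under the new justification_key parameter ("explained_tool_calls_count" and "explained_tool_calls_args" by default).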

src/uipath/eval/coded_evaluators/base_evaluator.py

Lines changed: 101 additions & 2 deletions
@@ -5,7 +5,7 @@
 import time
 from abc import ABC, abstractmethod
 from collections.abc import Callable
-from typing import Any, Generic, TypeVar, get_args
+from typing import Any, Generic, TypeVar, Union, cast, get_args

 from pydantic import BaseModel, ConfigDict, Field, model_validator

@@ -47,11 +47,18 @@ class BaseEvaluatorConfig(BaseModel):
     default_evaluation_criteria: BaseEvaluationCriteria | None = None


+class BaseEvaluatorJustification(BaseModel):
+    """Base class for all evaluator justifications."""
+
+    pass
+
+
 T = TypeVar("T", bound=BaseEvaluationCriteria)
 C = TypeVar("C", bound=BaseEvaluatorConfig)
+J = TypeVar("J", bound=Union[str, None, BaseEvaluatorJustification])


-class BaseEvaluator(BaseModel, Generic[T, C], ABC):
+class BaseEvaluator(BaseModel, Generic[T, C, J], ABC):
     """Abstract base class for all evaluators."""

     model_config = ConfigDict(arbitrary_types_allowed=True)
@@ -61,6 +68,9 @@ class BaseEvaluator(BaseModel, Generic[T, C], ABC):
     evaluation_criteria_type: type[T] = Field(
         description="The type used for evaluation criteria validation and creation"
     )
+    justification_type: type[J] = Field(
+        description="The type used for justification validation and creation"
+    )
     evaluator_config: C = Field(
         exclude=True, description="The validated config object instance"
     )
@@ -101,6 +111,10 @@ def validate_model(cls, values: Any) -> Any:
         config_type = cls._extract_config_type()
         values["config_type"] = config_type

+        # Always extract and set justification_type
+        justification_type = cls._extract_justification_type()
+        values["justification_type"] = justification_type
+
         # Validate and create the config object if config dict is provided
         if config_dict := values.get("config"):
             try:
@@ -182,6 +196,33 @@ def _extract_config_type(cls) -> type[BaseEvaluatorConfig]:
                 f"Ensure the class properly inherits from BaseEvaluator with correct Generic parameters."
             )

+    @classmethod
+    def _extract_justification_type(cls) -> type[J]:
+        """Extract the justification type from Pydantic model fields.
+
+        Returns:
+            The justification type (str, None, or BaseEvaluatorJustification subclass)
+        """
+        if cls.__name__ == "BaseEvaluator":
+            return cast(type[J], type(None))
+
+        if hasattr(cls, "model_fields") and "justification_type" in cls.model_fields:
+            field_info = cls.model_fields["justification_type"]
+            if hasattr(field_info, "annotation"):
+                annotation = field_info.annotation
+                if args := get_args(annotation):
+                    justification_type = args[0]
+                    # Support str, None, or BaseEvaluatorJustification subclasses
+                    if justification_type is str or justification_type is type(None):
+                        return cast(type[J], justification_type)
+                    elif isinstance(justification_type, type) and issubclass(
+                        justification_type, BaseEvaluatorJustification
+                    ):
+                        return cast(type[J], justification_type)
+
+        # Default to None if we can't determine the type
+        return cast(type[J], type(None))
+
     def validate_evaluation_criteria(self, criteria: Any) -> T:
         """Validate and convert input to the correct evaluation criteria type.

@@ -213,6 +254,64 @@ def validate_evaluation_criteria(self, criteria: Any) -> T:
                 f"Cannot convert {type(criteria)} to {self.evaluation_criteria_type}: {e}"
             ) from e

+    def validate_justification(self, justification: Any) -> J:
+        """Validate and convert input to the correct justification type.
+
+        Args:
+            justification: The justification to validate (str, None, dict, BaseEvaluatorJustification, or other)
+
+        Returns:
+            The validated justification of the correct type
+        """
+        # The key insight: J is constrained to be one of str, None, or BaseEvaluatorJustification
+        # At instantiation time, J gets bound to exactly one of these types
+        # We need to handle each case and ensure the return matches the bound type
+
+        # Handle None type - when J is bound to None (the literal None type)
+        if self.justification_type is type(None):
+            # When J is None, we can only return None
+            return cast(J, justification if justification is None else None)
+
+        # Handle str type - when J is bound to str
+        if self.justification_type is str:
+            # When J is str, we must return a str
+            if justification is None:
+                return cast(J, "")
+            return cast(J, str(justification))
+
+        # Handle BaseEvaluatorJustification subclasses - when J is bound to a specific subclass
+        if isinstance(self.justification_type, type) and issubclass(
+            self.justification_type, BaseEvaluatorJustification
+        ):
+            # When J is a BaseEvaluatorJustification subclass, we must return that type
+            if justification is None:
+                raise ValueError(
+                    f"None is not allowed for justification type {self.justification_type}"
+                )
+
+            if isinstance(justification, self.justification_type):
+                return cast(J, justification)
+            elif isinstance(justification, dict):
+                return cast(J, self.justification_type.model_validate(justification))
+            elif hasattr(justification, "__dict__"):
+                return cast(
+                    J, self.justification_type.model_validate(justification.__dict__)
+                )
+            else:
+                try:
+                    return cast(
+                        J, self.justification_type.model_validate(justification)
+                    )
+                except Exception as e:
+                    raise ValueError(
+                        f"Cannot convert {type(justification)} to {self.justification_type}: {e}"
+                    ) from e
+
+        # Fallback: try to return as-is or raise error
+        raise ValueError(
+            f"Unsupported justification type {self.justification_type} for input {type(justification)}"
+        )
+
     @classmethod
     def get_evaluation_criteria_schema(cls) -> dict[str, Any]:
         """Get the JSON schema for the evaluation criteria type.

src/uipath/eval/coded_evaluators/contains_evaluator.py

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ class ContainsEvaluatorConfig(BaseEvaluatorConfig):


 class ContainsEvaluator(
-    BaseEvaluator[ContainsEvaluationCriteria, ContainsEvaluatorConfig]
+    BaseEvaluator[ContainsEvaluationCriteria, ContainsEvaluatorConfig, None]
 ):
     """Evaluator that checks if the actual output contains the expected output.

src/uipath/eval/coded_evaluators/exact_match_evaluator.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ class ExactMatchEvaluatorConfig(OutputEvaluatorConfig):
     negated: bool = False


-class ExactMatchEvaluator(OutputEvaluator[ExactMatchEvaluatorConfig]):
+class ExactMatchEvaluator(OutputEvaluator[ExactMatchEvaluatorConfig, type(None)]):
     """Evaluator that performs exact structural matching between expected and actual outputs.

     This evaluator returns True if the actual output exactly matches the expected output
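
ContainsEvaluator and ExactMatchEvaluator keep no structured justification, so they bind the new parameter to None (written as None and type(None) respectively above). For that binding, the first branch of validate_justification in the base class simply drops whatever it is given. A small sketch of that behaviour, not library code:

def _validate_none_justification(justification):
    # Mirrors the type(None) branch above: any non-None value is dropped.
    return justification if justification is None else None


assert _validate_none_justification(None) is None
assert _validate_none_justification("ignored text") is None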

src/uipath/eval/coded_evaluators/json_similarity_evaluator.py

Lines changed: 3 additions & 2 deletions
@@ -20,7 +20,7 @@ class JsonSimilarityEvaluatorConfig(OutputEvaluatorConfig):
     target_output_key: str = Field(default="*", frozen=True, exclude=True)


-class JsonSimilarityEvaluator(OutputEvaluator[JsonSimilarityEvaluatorConfig]):
+class JsonSimilarityEvaluator(OutputEvaluator[JsonSimilarityEvaluatorConfig, str]):
     """Deterministic evaluator that scores structural JSON similarity between expected and actual output.

     Compares expected versus actual JSON-like structures and returns a
@@ -51,9 +51,10 @@ async def evaluate(
             self._get_expected_output(evaluation_criteria),
             self._get_actual_output(agent_execution),
         )
+        validated_justification = self.validate_justification(justification)
         return NumericEvaluationResult(
             score=score,
-            details=justification,
+            details=validated_justification,
         )

     def _compare_json(self, expected: Any, actual: Any) -> tuple[float, str]:
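
JsonSimilarityEvaluator binds the justification parameter to str, so the str branch of validate_justification runs before the justification is stored in details: None is normalised to an empty string and anything else goes through str(). A reduced sketch of that branch, not library code:

def _validate_str_justification(justification):
    if justification is None:
        return ""  # None becomes an empty string rather than a missing value
    return str(justification)


assert _validate_str_justification(None) == ""
assert _validate_str_justification("7 of 9 leaf values match") == "7 of 9 leaf values match"

LLMJudgeMixin (below) makes the same choice, binding J to str so that the judge's free-text justification is validated the same way before it is attached to the result.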

src/uipath/eval/coded_evaluators/llm_as_judge_evaluator.py

Lines changed: 5 additions & 2 deletions
@@ -36,7 +36,7 @@ class BaseLLMJudgeEvaluatorConfig(BaseEvaluatorConfig):
 C = TypeVar("C", bound=BaseLLMJudgeEvaluatorConfig)


-class LLMJudgeMixin(BaseEvaluator[T, C]):
+class LLMJudgeMixin(BaseEvaluator[T, C, str]):
     """Mixin that provides common LLM judge functionality."""

     system_prompt: str = LLMJudgePromptTemplates.LLM_JUDGE_SYSTEM_PROMPT
@@ -94,10 +94,13 @@ async def evaluate(
         )

         llm_response = await self._get_llm_response(evaluation_prompt)
+        validated_justification = self.validate_justification(
+            llm_response.justification
+        )

         return NumericEvaluationResult(
             score=round(llm_response.score / 100.0, 2),
-            details=llm_response.justification,
+            details=validated_justification,
         )

     def _create_evaluation_prompt(

src/uipath/eval/coded_evaluators/llm_judge_output_evaluator.py

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ class LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig(LLMJudgeOutputEvaluatorC


 class BaseLLMOutputEvaluator(
-    OutputEvaluator[OC],
+    OutputEvaluator[OC, str],
     LLMJudgeMixin[OutputEvaluationCriteria, OC],
 ):
     """Base class for LLM judge output evaluators that contains all shared functionality.
