@@ -174,14 +174,6 @@ class HallucinationDetectionOutput(LLMOutput):
     3. **Clearly contradicted by the documents** - Claims that directly contradict the documents → FLAG
     4. **Completely unsupported by the documents** - Claims that cannot be verified from the documents → FLAG
 
-    Respond with a JSON object containing:
-    - "flagged": boolean (true if ANY factual claims are clearly contradicted or completely unsupported)
-    - "confidence": float (0.0 to 1.0, your confidence that the input is hallucinated)
-    - "reasoning": string (detailed explanation of your analysis)
-    - "hallucination_type": string (type of issue, if detected: "factual_error", "unsupported_claim", or "none" if supported)
-    - "hallucinated_statements": array of strings (specific factual statements that may be hallucinated)
-    - "verified_statements": array of strings (specific factual statements that are supported by the documents)
-
     **CRITICAL GUIDELINES**:
     - Flag content if ANY factual claims are unsupported or contradicted (even if some claims are supported)
     - Allow conversational, opinion-based, or general content to pass through
@@ -196,6 +188,30 @@ class HallucinationDetectionOutput(LLMOutput):
196188).strip ()
197189
198190
191+ # Instruction for output format when reasoning is enabled
192+ REASONING_OUTPUT_INSTRUCTION = textwrap .dedent (
193+ """
194+ Respond with a JSON object containing:
195+ - "flagged": boolean (true if ANY factual claims are clearly contradicted or completely unsupported)
196+ - "confidence": float (0.0 to 1.0, your confidence that the input is hallucinated)
197+ - "reasoning": string (detailed explanation of your analysis)
198+ - "hallucination_type": string (type of issue, if detected: "factual_error", "unsupported_claim", or "none" if supported)
199+ - "hallucinated_statements": array of strings (specific factual statements that may be hallucinated)
200+ - "verified_statements": array of strings (specific factual statements that are supported by the documents)
201+ """
202+ ).strip ()
203+
204+
205+ # Instruction for output format when reasoning is disabled
206+ BASE_OUTPUT_INSTRUCTION = textwrap .dedent (
207+ """
208+ Respond with a JSON object containing:
209+ - "flagged": boolean (true if ANY factual claims are clearly contradicted or completely unsupported)
210+ - "confidence": float (0.0 to 1.0, your confidence that the input is hallucinated)
211+ """
212+ ).strip ()
213+
214+
199215async def hallucination_detection (
200216 ctx : GuardrailLLMContextProto ,
201217 candidate : str ,
@@ -232,11 +248,16 @@ async def hallucination_detection(
     )
 
     try:
-        # Create the validation query
-        validation_query = f"{VALIDATION_PROMPT}\n\nText to validate:\n{candidate}"
-
-        # Use HallucinationDetectionOutput (with reasoning fields) if enabled, otherwise base LLMOutput
-        output_format = HallucinationDetectionOutput if config.include_reasoning else LLMOutput
+        # Build the prompt based on whether reasoning is requested
+        if config.include_reasoning:
+            output_instruction = REASONING_OUTPUT_INSTRUCTION
+            output_format = HallucinationDetectionOutput
+        else:
+            output_instruction = BASE_OUTPUT_INSTRUCTION
+            output_format = LLMOutput
+
+        # Create the validation query with appropriate output instructions
+        validation_query = f"{VALIDATION_PROMPT}\n\n{output_instruction}\n\nText to validate:\n{candidate}"
 
         # Use the Responses API with file search and structured output
         response = await _invoke_openai_callable(
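
For reference, a minimal sketch of the prompt-assembly logic this diff introduces. It is not part of the PR: `build_validation_query`, `DemoConfig`, and the trimmed prompt and instruction strings are illustrative stand-ins for the module's actual `VALIDATION_PROMPT`, config object, and constants.

```python
# Minimal sketch (assumed names, trimmed strings): how the two instruction
# constants and the include_reasoning flag combine into the validation query.
import textwrap
from dataclasses import dataclass

VALIDATION_PROMPT = "Check whether the text below is supported by the documents."  # trimmed stand-in

REASONING_OUTPUT_INSTRUCTION = textwrap.dedent(
    """
    Respond with a JSON object containing:
    - "flagged": boolean
    - "confidence": float (0.0 to 1.0)
    - "reasoning": string
    - "hallucination_type": string
    - "hallucinated_statements": array of strings
    - "verified_statements": array of strings
    """
).strip()

BASE_OUTPUT_INSTRUCTION = textwrap.dedent(
    """
    Respond with a JSON object containing:
    - "flagged": boolean
    - "confidence": float (0.0 to 1.0)
    """
).strip()


@dataclass
class DemoConfig:
    """Hypothetical stand-in for the guardrail's config object."""

    include_reasoning: bool = False


def build_validation_query(candidate: str, config: DemoConfig) -> str:
    """Assemble the prompt the same way the patched function does."""
    if config.include_reasoning:
        output_instruction = REASONING_OUTPUT_INSTRUCTION
    else:
        output_instruction = BASE_OUTPUT_INSTRUCTION
    return f"{VALIDATION_PROMPT}\n\n{output_instruction}\n\nText to validate:\n{candidate}"


if __name__ == "__main__":
    # With reasoning enabled the model is asked for the full six-field JSON object;
    # without it, only "flagged" and "confidence" are requested.
    print(build_validation_query("The Eiffel Tower is in Berlin.", DemoConfig(include_reasoning=True)))
    print(build_validation_query("The Eiffel Tower is in Berlin.", DemoConfig(include_reasoning=False)))
```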