From 72d2f523decb7775825fe0ca88ea7ca91a284883 Mon Sep 17 00:00:00 2001 From: ckittask Date: Wed, 3 Dec 2025 11:35:22 +0200 Subject: [PATCH 1/7] deepeval_redteam_tests --- .github/workflows/deepeval-tests.yml | 234 ++++- .github/workflows/deepteam-red-team-tests.yml | 189 +++- .gitleaks.toml | 4 + docker-compose-test.yml | 290 +++++++ pyproject.toml | 5 +- src/llm_orchestration_service.py | 25 +- src/llm_orchestration_service_api.py | 76 ++ src/models/request_models.py | 16 + tests/conftest.py | 804 ++++++++++++++++++ tests/data/test_dataset.json | 207 ++--- tests/deepeval_tests/red_team_tests.py | 51 +- tests/deepeval_tests/report_generator.py | 10 +- tests/deepeval_tests/standard_tests.py | 143 ++-- tests/helpers/test_data_loader.py | 174 ++++ tests/mocks/__init__.py | 0 tests/mocks/dummy_llm_orchestrator.py | 274 ------ uv.lock | 32 +- 17 files changed, 2012 insertions(+), 522 deletions(-) create mode 100644 .gitleaks.toml create mode 100644 docker-compose-test.yml create mode 100644 tests/conftest.py create mode 100644 tests/helpers/test_data_loader.py delete mode 100644 tests/mocks/__init__.py delete mode 100644 tests/mocks/dummy_llm_orchestrator.py diff --git a/.github/workflows/deepeval-tests.yml b/.github/workflows/deepeval-tests.yml index 5da84df..22fef17 100644 --- a/.github/workflows/deepeval-tests.yml +++ b/.github/workflows/deepeval-tests.yml @@ -3,52 +3,227 @@ name: DeepEval RAG System Tests on: pull_request: types: [opened, synchronize, reopened] + branches: ["rag-33-debug", "RAG-33-31okt"] paths: - 'src/**' - 'tests/**' + - 'data/**' + - 'docker-compose-test.yml' + - 'Dockerfile.llm_orchestration_service' - '.github/workflows/deepeval-tests.yml' jobs: deepeval-tests: runs-on: ubuntu-latest - timeout-minutes: 40 + timeout-minutes: 80 steps: - name: Checkout code uses: actions/checkout@v4 - + + - name: Validate required secrets + id: validate_secrets + run: | + echo "Validating required environment variables..." 
+ MISSING_SECRETS=() + + # Check Azure OpenAI secrets + if [ -z "${{ secrets.AZURE_OPENAI_ENDPOINT }}" ]; then + MISSING_SECRETS+=("AZURE_OPENAI_ENDPOINT") + fi + + if [ -z "${{ secrets.AZURE_OPENAI_API_KEY }}" ]; then + MISSING_SECRETS+=("AZURE_OPENAI_API_KEY") + fi + + if [ -z "${{ secrets.AZURE_OPENAI_DEPLOYMENT }}" ]; then + MISSING_SECRETS+=("AZURE_OPENAI_DEPLOYMENT") + fi + + if [ -z "${{ secrets.AZURE_OPENAI_EMBEDDING_DEPLOYMENT }}" ]; then + MISSING_SECRETS+=("AZURE_OPENAI_EMBEDDING_DEPLOYMENT") + fi + + if [ -z "${{ secrets.AZURE_OPENAI_DEEPEVAL_DEPLOYMENT }}" ]; then + MISSING_SECRETS+=("AZURE_OPENAI_DEEPEVAL_DEPLOYMENT") + fi + + + + if [ -z "${{ secrets.AZURE_STORAGE_CONNECTION_STRING }}" ]; then + MISSING_SECRETS+=("AZURE_STORAGE_CONNECTION_STRING") + fi + + if [ -z "${{ secrets.AZURE_STORAGE_CONTAINER_NAME }}" ]; then + MISSING_SECRETS+=("AZURE_STORAGE_CONTAINER_NAME") + fi + + if [ -z "${{ secrets.AZURE_STORAGE_BLOB_NAME }}" ]; then + MISSING_SECRETS+=("AZURE_STORAGE_BLOB_NAME") + fi + + + # If any secrets are missing, fail + if [ ${#MISSING_SECRETS[@]} -gt 0 ]; then + echo "missing=true" >> $GITHUB_OUTPUT + echo "secrets_list=${MISSING_SECRETS[*]}" >> $GITHUB_OUTPUT + echo " Missing required secrets: ${MISSING_SECRETS[*]}" + exit 1 + else + echo "missing=false" >> $GITHUB_OUTPUT + echo " All required secrets are configured" + fi + + - name: Comment PR with missing secrets error + if: failure() && steps.validate_secrets.outputs.missing == 'true' + uses: actions/github-script@v7 + with: + script: | + const missingSecrets = '${{ steps.validate_secrets.outputs.secrets_list }}'.split(' '); + const secretsList = missingSecrets.map(s => `- \`${s}\``).join('\n'); + + const comment = `## DeepEval Tests: Missing Required Secrets + + The DeepEval RAG system tests cannot run because the following GitHub secrets are not configured: + + ${secretsList} + + ### How to Fix + + 1. Go to **Settings** → **Secrets and variables** → **Actions** + 2. 
Add the missing secrets with the appropriate values: + + **Azure OpenAI Configuration:** + - \`AZURE_OPENAI_ENDPOINT\` - Your Azure OpenAI resource endpoint (e.g., \`https://your-resource.openai.azure.com/\`) + - \`AZURE_OPENAI_API_KEY\` - Your Azure OpenAI API key + - \`AZURE_OPENAI_DEPLOYMENT\` - Chat model deployment name (e.g., \`gpt-4o-mini\`) + - \`AZURE_OPENAI_EMBEDDING_DEPLOYMENT\` - Embedding model deployment name (e.g., \`text-embedding-3-large\`) + - \`AZURE_STORAGE_CONNECTION_STRING\` - Connection string for Azure Blob Storage + - \`AZURE_STORAGE_CONTAINER_NAME\` - Container name in Azure Blob Storage + - \`AZURE_STORAGE_BLOB_NAME\` - Blob name for dataset in Azure + - \`AZURE_OPENAI_DEEPEVAL_DEPLOYMENT\` - DeepEval model deployment name (e.g., \`gpt-4.1\`) + + 3. Re-run the workflow after adding the secrets + + ### Note + Tests will not run until all required secrets are configured. + + --- + *Workflow: ${context.workflow} | Run: [#${context.runNumber}](${context.payload.repository.html_url}/actions/runs/${context.runId})*`; + + // Find existing comment + const comments = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number + }); + + const existingComment = comments.data.find( + comment => comment.user.login === 'github-actions[bot]' && + comment.body.includes('DeepEval Tests: Missing Required Secrets') + ); + + if (existingComment) { + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: existingComment.id, + body: comment + }); + } else { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: comment + }); + } + - name: Set up Python + if: success() uses: actions/setup-python@v5 with: python-version-file: '.python-version' - + - name: Set up uv + if: success() uses: astral-sh/setup-uv@v6 - + - name: Install dependencies 
(locked) + if: success() run: uv sync --frozen - - - name: Run DeepEval tests + + - name: Create test directories with proper permissions + if: success() + run: | + mkdir -p test-vault/agents/llm + mkdir -p test-vault/agent-out + # Set ownership to current user and make writable + sudo chown -R $(id -u):$(id -g) test-vault + chmod -R 777 test-vault + # Ensure the agent-out directory is world-readable after writes + sudo chmod -R a+rwX test-vault/agent-out + + - name: Set up Deepeval with azure + if: success() + run: | + uv run deepeval set-azure-openai \ + --openai-endpoint "${{ secrets.AZURE_OPENAI_ENDPOINT }}" \ + --openai-api-key "${{ secrets.AZURE_OPENAI_API_KEY }}" \ + --deployment-name "${{ secrets.AZURE_OPENAI_DEPLOYMENT }}" \ + --openai-model-name "${{ secrets.AZURE_OPENAI_DEEPEVAL_DEPLOYMENT }}" \ + --openai-api-version="2024-12-01-preview" + + - name: Run DeepEval tests with testcontainers + if: success() id: run_tests + continue-on-error: true env: - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - run: uv run python -m pytest tests/deepeval_tests/standard_tests.py -v --tb=short - + # LLM API Keys + AZURE_OPENAI_DEEPEVAL_DEPLOYMENT: ${{ secrets.AZURE_OPENAI_DEEPEVAL_DEPLOYMENT }} + # Azure OpenAI - Chat Model + AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} + AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} + AZURE_OPENAI_DEPLOYMENT: ${{ secrets.AZURE_OPENAI_DEPLOYMENT }} + # Azure OpenAI - Embedding Model + AZURE_OPENAI_EMBEDDING_DEPLOYMENT: ${{ secrets.AZURE_OPENAI_EMBEDDING_DEPLOYMENT }} + # Evaluation mode + AZURE_STORAGE_CONNECTION_STRING: ${{ secrets.AZURE_STORAGE_CONNECTION_STRING }} + AZURE_STORAGE_CONTAINER_NAME: ${{ secrets.AZURE_STORAGE_CONTAINER_NAME }} + AZURE_STORAGE_BLOB_NAME: ${{ secrets.AZURE_STORAGE_BLOB_NAME }} + EVAL_MODE: "true" + run: | + # Run tests with testcontainers managing Docker Compose + uv run python -m pytest tests/deepeval_tests/standard_tests.py 
-v --tb=short --log-cli-level=INFO + + - name: Fix permissions on test artifacts + if: always() + run: | + sudo chown -R $(id -u):$(id -g) test-vault || true + sudo chmod -R a+rX test-vault || true + - name: Generate evaluation report if: always() - run: python tests/deepeval_tests/report_generator.py - + run: uv run python tests/deepeval_tests/report_generator.py + + - name: Save test artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: test-results + path: | + pytest_captured_results.json + test_report.md + retention-days: 30 + - name: Comment PR with test results if: always() && github.event_name == 'pull_request' uses: actions/github-script@v7 with: script: | const fs = require('fs'); - try { const reportContent = fs.readFileSync('test_report.md', 'utf8'); - const comments = await github.rest.issues.listComments({ owner: context.repo.owner, repo: context.repo.repo, @@ -57,7 +232,7 @@ jobs: const existingComment = comments.data.find( comment => comment.user.login === 'github-actions[bot]' && - comment.body.includes('RAG System Evaluation Report') + comment.body.includes('RAG System Evaluation Report') ); if (existingComment) { @@ -75,10 +250,8 @@ jobs: body: reportContent }); } - } catch (error) { console.error('Failed to post test results:', error); - await github.rest.issues.createComment({ issue_number: context.issue.number, owner: context.repo.owner, @@ -86,25 +259,26 @@ jobs: body: `## RAG System Evaluation Report\n\n**Error generating test report**\n\nFailed to read or post test results. 
Check workflow logs for details.\n\nError: ${error.message}` }); } - + - name: Check test results and fail if needed if: always() run: | - # Check if pytest ran (look at step output) - if [ "${{ steps.run_tests.outcome }}" == "failure" ]; then + # Check if pytest ran (look at step output) + if [ "${{ steps.run_tests.outcome }}" == "failure" ]; then echo "Tests ran but failed - this is expected if RAG performance is below threshold" - fi - if [ -f "pytest_captured_results.json" ]; then + fi + + if [ -f "pytest_captured_results.json" ]; then total_tests=$(jq '.total_tests // 0' pytest_captured_results.json) passed_tests=$(jq '.passed_tests // 0' pytest_captured_results.json) - + if [ "$total_tests" -eq 0 ]; then echo "ERROR: No tests were executed" exit 1 fi - + pass_rate=$(awk "BEGIN {print ($passed_tests / $total_tests) * 100}") - + echo "DeepEval Test Results:" echo "Total Tests: $total_tests" echo "Passed Tests: $passed_tests" @@ -117,7 +291,13 @@ jobs: else echo "TEST SUCCESS: Pass rate $pass_rate% meets threshold 70%" fi - else + else echo "ERROR: No test results file found" exit 1 - fi \ No newline at end of file + fi + + - name: Cleanup Docker resources + if: always() + run: | + docker compose -f docker-compose-test.yml down -v --remove-orphans || true + docker system prune -f || true \ No newline at end of file diff --git a/.github/workflows/deepteam-red-team-tests.yml b/.github/workflows/deepteam-red-team-tests.yml index ba0861b..3c4d558 100644 --- a/.github/workflows/deepteam-red-team-tests.yml +++ b/.github/workflows/deepteam-red-team-tests.yml @@ -6,8 +6,9 @@ on: paths: - 'src/**' - 'tests/**' - - 'mocks/**' - 'data/**' + - 'docker-compose-test.yml' + - 'Dockerfile.llm_orchestration_service' - '.github/workflows/deepeval-red-team-tests.yml' workflow_dispatch: inputs: @@ -24,32 +25,188 @@ on: jobs: security-assessment: runs-on: ubuntu-latest - timeout-minutes: 60 + timeout-minutes: 90 steps: - name: Checkout code uses: actions/checkout@v4 + - name: 
Validate required secrets + id: validate_secrets + run: | + echo "Validating required environment variables..." + MISSING_SECRETS=() + + # Check Azure OpenAI secrets + if [ -z "${{ secrets.AZURE_OPENAI_ENDPOINT }}" ]; then + MISSING_SECRETS+=("AZURE_OPENAI_ENDPOINT") + fi + + if [ -z "${{ secrets.AZURE_OPENAI_API_KEY }}" ]; then + MISSING_SECRETS+=("AZURE_OPENAI_API_KEY") + fi + + if [ -z "${{ secrets.AZURE_OPENAI_DEPLOYMENT }}" ]; then + MISSING_SECRETS+=("AZURE_OPENAI_DEPLOYMENT") + fi + + if [ -z "${{ secrets.AZURE_OPENAI_EMBEDDING_DEPLOYMENT }}" ]; then + MISSING_SECRETS+=("AZURE_OPENAI_EMBEDDING_DEPLOYMENT") + fi + + if [ -z "${{ secrets.AZURE_OPENAI_DEEPEVAL_DEPLOYMENT }}" ]; then + MISSING_SECRETS+=("AZURE_OPENAI_DEEPEVAL_DEPLOYMENT") + fi + + if [ -z "${{ secrets.AZURE_STORAGE_CONNECTION_STRING }}" ]; then + MISSING_SECRETS+=("AZURE_STORAGE_CONNECTION_STRING") + fi + + if [ -z "${{ secrets.AZURE_STORAGE_CONTAINER_NAME }}" ]; then + MISSING_SECRETS+=("AZURE_STORAGE_CONTAINER_NAME") + fi + + if [ -z "${{ secrets.AZURE_STORAGE_BLOB_NAME }}" ]; then + MISSING_SECRETS+=("AZURE_STORAGE_BLOB_NAME") + fi + # If any secrets are missing, fail + if [ ${#MISSING_SECRETS[@]} -gt 0 ]; then + echo "missing=true" >> $GITHUB_OUTPUT + echo "secrets_list=${MISSING_SECRETS[*]}" >> $GITHUB_OUTPUT + echo " Missing required secrets: ${MISSING_SECRETS[*]}" + exit 1 + else + echo "missing=false" >> $GITHUB_OUTPUT + echo " All required secrets are configured" + fi + + - name: Comment PR with missing secrets error + if: failure() && steps.validate_secrets.outputs.missing == 'true' && github.event_name == 'pull_request' + uses: actions/github-script@v7 + with: + script: | + const missingSecrets = '${{ steps.validate_secrets.outputs.secrets_list }}'.split(' '); + const secretsList = missingSecrets.map(s => `- \`${s}\``).join('\n'); + + const comment = `## Red Team Security Tests: Missing Required Secrets + + The Red Team security assessment cannot run because the following GitHub 
secrets are not configured: + + ${secretsList} + + ### How to Fix + + 1. Go to **Settings** → **Secrets and variables** → **Actions** + 2. Add the missing secrets with the appropriate values: + + **Azure OpenAI Configuration:** + - \`AZURE_OPENAI_ENDPOINT\` - Your Azure OpenAI resource endpoint (e.g., \`https://your-resource.openai.azure.com/\`) + - \`AZURE_OPENAI_API_KEY\` - Your Azure OpenAI API key + - \`AZURE_OPENAI_DEPLOYMENT\` - Chat model deployment name (e.g., \`gpt-4o-mini\`) + - \`AZURE_OPENAI_EMBEDDING_DEPLOYMENT\` - Embedding model deployment name (e.g., \`text-embedding-3-large\`) + - \`AZURE_STORAGE_CONNECTION_STRING\` - Connection string for Azure Blob Storage + - \`AZURE_STORAGE_CONTAINER_NAME\` - Container name in Azure Blob Storage + - \`AZURE_STORAGE_BLOB_NAME\` - Blob name for dataset in Azure + - \`AZURE_OPENAI_DEEPEVAL_DEPLOYMENT\` - DeepEval model deployment name (e.g., \`gpt-4.1\`) + + 3. Re-run the workflow after adding the secrets + + ### Security Note + Without proper API credentials, we cannot assess the system's security posture against adversarial attacks. 
+ + --- + *Workflow: ${context.workflow} | Run: [#${context.runNumber}](${context.payload.repository.html_url}/actions/runs/${context.runId})*`; + + // Find existing comment + const comments = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number + }); + + const existingComment = comments.data.find( + comment => comment.user.login === 'github-actions[bot]' && + comment.body.includes('Red Team Security Tests: Missing Required Secrets') + ); + + if (existingComment) { + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: existingComment.id, + body: comment + }); + } else { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: comment + }); + } + - name: Set up Python + if: success() uses: actions/setup-python@v5 with: python-version-file: '.python-version' - name: Set up uv + if: success() uses: astral-sh/setup-uv@v6 - name: Install dependencies (locked) + if: success() run: uv sync --frozen - - name: Run Complete Security Assessment + - name: Create test directories with proper permissions + if: success() + run: | + mkdir -p test-vault/agents/llm + mkdir -p test-vault/agent-out + # Set ownership to current user and make writable + sudo chown -R $(id -u):$(id -g) test-vault + chmod -R 777 test-vault + # Ensure the agent-out directory is world-readable after writes + sudo chmod -R a+rwX test-vault/agent-out + + - name: Set up Deepeval with azure + if: success() + run: | + uv run deepeval set-azure-openai \ + --openai-endpoint "${{ secrets.AZURE_OPENAI_ENDPOINT }}" \ + --openai-api-key "${{ secrets.AZURE_OPENAI_API_KEY }}" \ + --deployment-name "${{ secrets.AZURE_OPENAI_DEPLOYMENT }}" \ + --openai-model-name "${{ secrets.AZURE_OPENAI_DEEPEVAL_DEPLOYMENT }}" \ + --openai-api-version="2024-12-01-preview" + + + - name: Run Red Team Security Tests 
with testcontainers + if: success() id: run_tests continue-on-error: true env: - ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + # LLM API Keys + AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }} + AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} + AZURE_OPENAI_DEPLOYMENT: ${{ secrets.AZURE_OPENAI_DEPLOYMENT }} + AZURE_OPENAI_DEEPEVAL_DEPLOYMENT: ${{ secrets.AZURE_OPENAI_DEEPEVAL_DEPLOYMENT }} + # Azure OpenAI - Embedding Model + AZURE_OPENAI_EMBEDDING_DEPLOYMENT: ${{ secrets.AZURE_OPENAI_EMBEDDING_DEPLOYMENT }} + AZURE_STORAGE_CONNECTION_STRING: ${{ secrets.AZURE_STORAGE_CONNECTION_STRING }} + AZURE_STORAGE_CONTAINER_NAME: ${{ secrets.AZURE_STORAGE_CONTAINER_NAME }} + AZURE_STORAGE_BLOB_NAME: ${{ secrets.AZURE_STORAGE_BLOB_NAME }} + # Evaluation mode + EVAL_MODE: "true" run: | - # Run all security tests in one comprehensive session - uv run python -m pytest tests/deepeval_tests/red_team_tests.py::TestRAGSystemRedTeaming -v --tb=short + # Run tests with testcontainers managing Docker Compose + uv run python -m pytest tests/deepeval_tests/red_team_tests.py::TestRAGSystemRedTeaming -v --tb=short --log-cli-level=INFO + + - name: Fix permissions on test artifacts + if: always() + run: | + sudo chown -R $(id -u):$(id -g) test-vault || true + sudo chmod -R a+rX test-vault || true - name: Generate Security Report if: always() @@ -58,6 +215,16 @@ jobs: uv run python tests/deepeval_tests/red_team_report_generator.py || true fi + - name: Save test artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: security-test-results + path: | + pytest_captured_results.json + security_report.md + retention-days: 30 + - name: Comment PR with Security Results if: always() && github.event_name == 'pull_request' uses: actions/github-script@v7 @@ -164,4 +331,10 @@ jobs: else echo "ERROR: No test results file found" exit 1 - fi \ No newline at end of file + fi + + - name: Cleanup Docker resources + 
if: always() + run: | + docker compose -f docker-compose-test.yml down -v --remove-orphans || true + docker system prune -f || true \ No newline at end of file diff --git a/.gitleaks.toml b/.gitleaks.toml new file mode 100644 index 0000000..87311e3 --- /dev/null +++ b/.gitleaks.toml @@ -0,0 +1,4 @@ +[allowlist] +paths = [ + '''docker-compose-test\.yml''' +] \ No newline at end of file diff --git a/docker-compose-test.yml b/docker-compose-test.yml new file mode 100644 index 0000000..525419e --- /dev/null +++ b/docker-compose-test.yml @@ -0,0 +1,290 @@ +services: + # === Core Infrastructure === + + # Shared PostgreSQL database (used by both application and Langfuse) + rag_search_db: + image: postgres:14.1 + container_name: rag_search_db + restart: always + environment: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: dbadmin + POSTGRES_DB: rag-search + volumes: + - test_rag_search_db:/var/lib/postgresql/data + ports: + - "5436:5432" + networks: + - test-network + + # Vector database for RAG + qdrant: + image: qdrant/qdrant:v1.15.1 + container_name: qdrant + restart: always + ports: + - "6333:6333" + - "6334:6334" + volumes: + - test_qdrant_data:/qdrant/storage + networks: + - test-network + + # === Secret Management === + + # Vault - Secret management (dev mode) + vault: + image: hashicorp/vault:1.20.3 + container_name: vault + cap_add: + - IPC_LOCK + ports: + - "8200:8200" + environment: + VAULT_DEV_ROOT_TOKEN_ID: root + VAULT_ADDR: http://0.0.0.0:8200 + VAULT_API_ADDR: http://0.0.0.0:8200 + command: server -dev -dev-listen-address=0.0.0.0:8200 + networks: + - test-network + + # Vault Agent - Automatic token management via AppRole + vault-agent-llm: + image: hashicorp/vault:1.20.3 + container_name: vault-agent-llm + depends_on: + - vault + volumes: + - ./test-vault/agents/llm:/agent/in + - ./test-vault/agent-out:/agent/out + entrypoint: ["sh", "-c"] + command: + - | + # Wait for Vault to be ready + sleep 5 + echo "Waiting for AppRole credentials..." + while [ ! 
-f /agent/in/role_id ] || [ ! -s /agent/in/role_id ]; do + sleep 1 + done + while [ ! -f /agent/in/secret_id ] || [ ! -s /agent/in/secret_id ]; do + sleep 1 + done + echo "Credentials found, starting Vault Agent..." + exec vault agent -config=/agent/in/agent.hcl -log-level=debug + networks: + - test-network + + # === Langfuse Observability Stack === + + # Redis - Queue and cache for Langfuse + redis: + image: redis:7 + container_name: redis + restart: always + command: --requirepass myredissecret + ports: + - "127.0.0.1:6378:6379" + networks: + - test-network + + # MinIO - S3-compatible storage for Langfuse + minio: + image: minio/minio:latest + container_name: minio + restart: always + entrypoint: sh + command: -c "mkdir -p /data/langfuse && minio server /data --address ':9000' --console-address ':9001'" + environment: + MINIO_ROOT_USER: minio + MINIO_ROOT_PASSWORD: miniosecret + ports: + - "9090:9000" + - "127.0.0.1:9091:9001" + volumes: + - test_minio_data:/data + networks: + - test-network + + # ClickHouse - Analytics database for Langfuse (REQUIRED in v3) + clickhouse: + image: clickhouse/clickhouse-server:24.3 + container_name: clickhouse + restart: always + environment: + CLICKHOUSE_DB: default + CLICKHOUSE_USER: default + CLICKHOUSE_PASSWORD: clickhouse + volumes: + - test_clickhouse_data:/var/lib/clickhouse + ports: + - "127.0.0.1:8123:8123" + - "127.0.0.1:9000:9000" + networks: + - test-network + ulimits: + nofile: + soft: 262144 + hard: 262144 + + # Langfuse Worker - Background job processor + langfuse-worker: + image: langfuse/langfuse-worker:3 + container_name: langfuse-worker + restart: always + depends_on: + - rag_search_db + - minio + - redis + - clickhouse + ports: + - "127.0.0.1:3030:3030" + environment: + # Database + DATABASE_URL: postgresql://postgres:dbadmin@rag_search_db:5432/rag-search + + # Auth & Security (TEST VALUES ONLY - NOT FOR PRODUCTION) + # gitleaks:allow - These are test-only hex strings + NEXTAUTH_URL: http://localhost:3000 + 
SALT: ef9d6c6f8b4a5e2c1d3f7a9b8c5e4d2a1f6b8c9d4e5f7a8b1c2d3e4f5a6b7c8d + ENCRYPTION_KEY: 1a2b3c4d5e6f7a8b9c0d1e2f3a4b5c6d7e8f9a0b1c2d3e4f5a6b7c8d9e0f1a2b + + # Features + TELEMETRY_ENABLED: "false" + LANGFUSE_ENABLE_EXPERIMENTAL_FEATURES: "false" + + # ClickHouse (REQUIRED for Langfuse v3) + CLICKHOUSE_MIGRATION_URL: clickhouse://clickhouse:9000/default + CLICKHOUSE_URL: http://clickhouse:8123 + CLICKHOUSE_USER: default + CLICKHOUSE_PASSWORD: clickhouse + CLICKHOUSE_CLUSTER_ENABLED: "false" + + # S3/MinIO Event Upload + LANGFUSE_S3_EVENT_UPLOAD_BUCKET: langfuse + LANGFUSE_S3_EVENT_UPLOAD_REGION: us-east-1 + LANGFUSE_S3_EVENT_UPLOAD_ACCESS_KEY_ID: minio + LANGFUSE_S3_EVENT_UPLOAD_SECRET_ACCESS_KEY: miniosecret + LANGFUSE_S3_EVENT_UPLOAD_ENDPOINT: http://minio:9000 + LANGFUSE_S3_EVENT_UPLOAD_FORCE_PATH_STYLE: "true" + + # S3/MinIO Media Upload + LANGFUSE_S3_MEDIA_UPLOAD_BUCKET: langfuse + LANGFUSE_S3_MEDIA_UPLOAD_REGION: us-east-1 + LANGFUSE_S3_MEDIA_UPLOAD_ACCESS_KEY_ID: minio + LANGFUSE_S3_MEDIA_UPLOAD_SECRET_ACCESS_KEY: miniosecret + LANGFUSE_S3_MEDIA_UPLOAD_ENDPOINT: http://minio:9000 + LANGFUSE_S3_MEDIA_UPLOAD_FORCE_PATH_STYLE: "true" + + # Redis + REDIS_HOST: redis + REDIS_PORT: "6379" + REDIS_AUTH: myredissecret + networks: + - test-network + + # Langfuse Web - UI and API + langfuse-web: + image: langfuse/langfuse:3 + container_name: langfuse-web + restart: always + depends_on: + - langfuse-worker + - rag_search_db + - clickhouse + ports: + - "3000:3000" + environment: + # Database + DATABASE_URL: postgresql://postgres:dbadmin@rag_search_db:5432/rag-search + + # Auth & Security (TEST VALUES ONLY - NOT FOR PRODUCTION) + # gitleaks:allow - These are test-only hex strings + NEXTAUTH_URL: http://localhost:3000 + NEXTAUTH_SECRET: 9f8e7d6c5b4a3f2e1d0c9b8a7f6e5d4c3b2a1f0e9d8c7b6a5f4e3d2c1b0a9f8e + SALT: ef9d6c6f8b4a5e2c1d3f7a9b8c5e4d2a1f6b8c9d4e5f7a8b1c2d3e4f5a6b7c8d + ENCRYPTION_KEY: 1a2b3c4d5e6f7a8b9c0d1e2f3a4b5c6d7e8f9a0b1c2d3e4f5a6b7c8d9e0f1a2b + + # Features + 
TELEMETRY_ENABLED: "false" + LANGFUSE_ENABLE_EXPERIMENTAL_FEATURES: "false" + + # ClickHouse (REQUIRED for Langfuse v3) + CLICKHOUSE_MIGRATION_URL: clickhouse://clickhouse:9000/default + CLICKHOUSE_URL: http://clickhouse:8123 + CLICKHOUSE_USER: default + CLICKHOUSE_PASSWORD: clickhouse + CLICKHOUSE_CLUSTER_ENABLED: "false" + + # S3/MinIO Event Upload + LANGFUSE_S3_EVENT_UPLOAD_BUCKET: langfuse + LANGFUSE_S3_EVENT_UPLOAD_REGION: us-east-1 + LANGFUSE_S3_EVENT_UPLOAD_ACCESS_KEY_ID: minio + LANGFUSE_S3_EVENT_UPLOAD_SECRET_ACCESS_KEY: miniosecret + LANGFUSE_S3_EVENT_UPLOAD_ENDPOINT: http://minio:9000 + LANGFUSE_S3_EVENT_UPLOAD_FORCE_PATH_STYLE: "true" + + # S3/MinIO Media Upload + LANGFUSE_S3_MEDIA_UPLOAD_BUCKET: langfuse + LANGFUSE_S3_MEDIA_UPLOAD_REGION: us-east-1 + LANGFUSE_S3_MEDIA_UPLOAD_ACCESS_KEY_ID: minio + LANGFUSE_S3_MEDIA_UPLOAD_SECRET_ACCESS_KEY: miniosecret + LANGFUSE_S3_MEDIA_UPLOAD_ENDPOINT: http://minio:9000 + LANGFUSE_S3_MEDIA_UPLOAD_FORCE_PATH_STYLE: "true" + + # Redis + REDIS_HOST: redis + REDIS_PORT: "6379" + REDIS_AUTH: myredissecret + + # Initialize test project with known credentials + LANGFUSE_INIT_PROJECT_PUBLIC_KEY: pk-lf-test + LANGFUSE_INIT_PROJECT_SECRET_KEY: sk-lf-test + networks: + - test-network + + # === LLM Orchestration Service === + + llm-orchestration-service: + build: + context: . 
+ dockerfile: Dockerfile.llm_orchestration_service + container_name: llm-orchestration-service + restart: always + ports: + - "8100:8100" + environment: + - VAULT_ADDR=http://vault:8200 + - VAULT_TOKEN_FILE=/agent/out/token + - QDRANT_URL=http://qdrant:6333 + - EVAL_MODE=true + volumes: + - ./src/llm_config_module/config:/app/src/llm_config_module/config:ro + - ./test-vault/agent-out:/agent/out:ro + - test_llm_orchestration_logs:/app/logs + depends_on: + - qdrant + - langfuse-web + - vault-agent-llm + networks: + - test-network + +# === Networks === + +networks: + test-network: + name: test-network + driver: bridge + +# === Volumes === + +volumes: + test_rag_search_db: + name: test_rag_search_db + test_qdrant_data: + name: test_qdrant_data + test_minio_data: + name: test_minio_data + test_clickhouse_data: + name: test_clickhouse_data + test_llm_orchestration_logs: + name: test_llm_orchestration_logs \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index a2692fc..ee10f03 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,9 @@ dependencies = [ "nemoguardrails>=0.16.0", "tiktoken>=0.11.0", "langfuse>=3.8.1", + "pytest-asyncio>=1.2.0", + "azure-storage-blob>=12.27.1", + "httpx>=0.28.1", ] [tool.ruff] @@ -121,4 +124,4 @@ exclude = [ ] # --- Global strictness --- -typeCheckingMode = "standard" # Standard typechecking mode \ No newline at end of file +typeCheckingMode = "standard" # Standard typechecking mode diff --git a/src/llm_orchestration_service.py b/src/llm_orchestration_service.py index a6cc98e..8f661d9 100644 --- a/src/llm_orchestration_service.py +++ b/src/llm_orchestration_service.py @@ -2071,7 +2071,7 @@ def _generate_rag_response( No secondary LLM paths; no citations appended. 
""" logger.info("Starting RAG response generation") - + eval_mode = os.getenv("EVAL_MODE", "false").lower() == "true" if costs_dict is None: costs_dict = {} @@ -2146,6 +2146,19 @@ def _generate_rag_response( }, output=answer, ) + + retrieval_context: List[Dict[str, Any]] | None = None + if eval_mode and relevant_chunks: + max_blocks_used = ResponseGenerationConstants.DEFAULT_MAX_BLOCKS + chunks_used = relevant_chunks[:max_blocks_used] + retrieval_context = [ + { + "content": chunk.get("text", ""), + "score": chunk.get("score", 0.0), + "metadata": chunk.get("meta", {}), + } + for chunk in chunks_used + ] if question_out_of_scope: logger.info("Question determined out-of-scope – sending fixed message.") @@ -2173,13 +2186,16 @@ def _generate_rag_response( chunks=self._format_chunks_for_test_response(relevant_chunks), ) else: - return OrchestrationResponse( + response = OrchestrationResponse( chatId=request.chatId, llmServiceActive=True, # service OK; insufficient context questionOutOfLLMScope=True, inputGuardFailed=False, content=content_with_refs, ) + if eval_mode: + response.retrieval_context = retrieval_context + return response # In-scope: return the answer as-is (NO citations) logger.info("Returning in-scope answer without citations.") @@ -2204,13 +2220,16 @@ def _generate_rag_response( chunks=self._format_chunks_for_test_response(relevant_chunks), ) else: - return OrchestrationResponse( + response = OrchestrationResponse( chatId=request.chatId, llmServiceActive=True, questionOutOfLLMScope=False, inputGuardFailed=False, content=content_with_refs, ) + if eval_mode: + response.retrieval_context = retrieval_context + return response except Exception as e: error_id = generate_error_id() diff --git a/src/llm_orchestration_service_api.py b/src/llm_orchestration_service_api.py index b58eac9..30a703a 100644 --- a/src/llm_orchestration_service_api.py +++ b/src/llm_orchestration_service_api.py @@ -40,6 +40,7 @@ ContextGenerationRequest, ContextGenerationResponse, 
EmbeddingErrorResponse, + DeepEvalTestOrchestrationResponse ) @@ -718,6 +719,81 @@ async def get_available_embedding_models( ) +@app.post("/orchestrate-eval") +def orchestrate_llm_request_eval( + http_request: Request, + request: OrchestrationRequest, +) -> DeepEvalTestOrchestrationResponse: + """ + Process LLM orchestration request with additional testing data. + + This endpoint is only available when EVAL_MODE=true and returns + retrieval context and refined questions for DeepEval metrics evaluation. + + Args: + http_request: FastAPI Request object for accessing app state + request: OrchestrationRequest containing user message and context + + Returns: + OrchestrationResponse: Response with LLM output, status flags, and test data + + Raises: + HTTPException: For processing errors or if not in testing mode + """ + # Check if eval mode is enabled + eval_mode = os.getenv("EVAL_MODE", "false").lower() == "true" + if not eval_mode: + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail="Eval endpoint not available in production mode", + ) + + try: + logger.info(f"Received EVAL orchestration request for chatId: {request.chatId}") + + if not hasattr(http_request.app.state, "orchestration_service"): + logger.error("Orchestration service not found in app state") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Service not initialized", + ) + + orchestration_service = http_request.app.state.orchestration_service + if orchestration_service is None: + logger.error("Orchestration service is None") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Service not initialized", + ) + + # Process the request (will include test data due to EVAL_MODE env var) + response = orchestration_service.process_orchestration_request(request) + + # Convert to test response with additional fields + test_response = DeepEvalTestOrchestrationResponse( + chatId=response.chatId, + 
llmServiceActive=response.llmServiceActive, + questionOutOfLLMScope=response.questionOutOfLLMScope, + inputGuardFailed=response.inputGuardFailed, + content=response.content, + retrieval_context=response.retrieval_context, + refined_questions=response.refined_questions, + expected_output=None, # Will be populated by test framework + ) + + logger.info(f"Successfully processed TEST request for chatId: {request.chatId}") + return test_response + + except HTTPException: + raise + except Exception as e: + logger.error(f"Unexpected error processing TEST request: {str(e)}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Internal server error occurred", + ) + + if __name__ == "__main__": logger.info("Starting LLM Orchestration Service API server on port 8100") uvicorn.run( diff --git a/src/models/request_models.py b/src/models/request_models.py index f4a073c..539d708 100644 --- a/src/models/request_models.py +++ b/src/models/request_models.py @@ -148,6 +148,9 @@ class OrchestrationResponse(BaseModel): inputGuardFailed: bool = Field( ..., description="Whether input guard validation failed" ) + retrieval_context: Optional[List[Dict[str, Any]]] = Field( + default=None, exclude=True + ) content: str = Field(..., description="Response content with citations") @@ -263,3 +266,16 @@ class TestOrchestrationResponse(BaseModel): chunks: Optional[List[ChunkInfo]] = Field( default=None, description="Retrieved chunks with rank and content" ) + + +class DeepEvalTestOrchestrationResponse(BaseModel): + """Extended response model for testing with additional evaluation data.""" + + chatId: str + llmServiceActive: bool + questionOutOfLLMScope: bool + inputGuardFailed: bool + content: str + retrieval_context: Optional[List[Dict[str, Any]]] = None + refined_questions: Optional[List[str]] = None + expected_output: Optional[str] = None # For DeepEval \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 
"""Pytest test-container harness for the RAG stack.

Starts Vault, Qdrant, Langfuse and the LLM orchestration service via Docker
Compose, bootstraps Vault with AppRole auth and test secrets, downloads
pre-computed embeddings from Azure Blob Storage, and indexes test data into
Qdrant.
"""

import os
import time
import subprocess
from pathlib import Path
from typing import Dict, Any, Optional, Generator

import pytest
import hvac
import requests
from loguru import logger
from testcontainers.compose import DockerCompose  # type: ignore
from azure.storage.blob import BlobServiceClient


# ===================== Azure Blob Storage Helper =====================


def download_embeddings_from_azure(
    connection_string: str,
    container_name: str,
    blob_name: str,
    local_path: Path,
) -> None:
    """
    Download pre-computed embeddings from Azure Blob Storage.

    Args:
        connection_string: Azure Storage connection string
        container_name: Name of the blob container
        blob_name: Name of the blob to download
        local_path: Local path to save the downloaded file

    Raises:
        Exception: re-raised after logging when the download fails.
    """
    logger.info("Downloading embeddings from Azure Blob Storage...")
    logger.info(f" Container: {container_name}")
    logger.info(f" Blob: {blob_name}")
    logger.info(f" Local path: {local_path}")

    try:
        blob_service_client = BlobServiceClient.from_connection_string(
            connection_string
        )
        blob_client = blob_service_client.get_blob_client(
            container=container_name, blob=blob_name
        )

        # Ensure parent directory exists before writing.
        local_path.parent.mkdir(parents=True, exist_ok=True)

        # Stream the blob to disk.
        with open(local_path, "wb") as download_file:
            download_stream = blob_client.download_blob()
            download_file.write(download_stream.readall())

        file_size_kb = local_path.stat().st_size / 1024
        logger.info(f"✓ Downloaded embeddings successfully ({file_size_kb:.2f} KB)")

    except Exception as e:
        logger.error(f"Failed to download embeddings from Azure: {e}")
        raise


# ===================== VaultAgentClient =====================


class VaultAgentClient:
    """Client for interacting with Vault using a token written by Vault Agent."""

    def __init__(
        self,
        vault_url: str,
        token_path: Path = Path("test-vault/agent-out/token"),
        mount_point: str = "secret",
        timeout: int = 10,
    ):
        self.vault_url = vault_url
        self.token_path = token_path
        self.mount_point = mount_point

        self.client = hvac.Client(url=self.vault_url, timeout=timeout)
        self._load_token()

    def _load_token(self) -> None:
        """Load token from file written by Vault Agent."""
        if not self.token_path.exists():
            raise FileNotFoundError(f"Vault token file missing: {self.token_path}")
        token = self.token_path.read_text().strip()
        if not token:
            raise ValueError("Vault token file is empty")
        self.client.token = token

    def is_authenticated(self) -> bool:
        """Check if the current token is valid."""
        try:
            return self.client.is_authenticated()
        except Exception as e:
            logger.warning(f"Vault token is not valid: {e}")
            return False

    def is_vault_available(self) -> bool:
        """Check if Vault is initialized and unsealed."""
        try:
            status = self.client.sys.read_health_status(method="GET")
            return (
                isinstance(status, dict)
                and status.get("initialized", False)
                and not status.get("sealed", True)
            )
        except Exception as e:
            logger.warning(f"Vault availability check failed: {e}")
            return False

    def get_secret(self, path: str) -> dict:
        """Read a secret from Vault KV v2."""
        try:
            result = self.client.secrets.kv.v2.read_secret_version(
                path=path, mount_point=self.mount_point
            )
            return result["data"]["data"]
        except Exception as e:
            logger.error(f"Failed to read Vault secret at {path}: {e}")
            raise


# ===================== RAGStackTestContainers =====================


class RAGStackTestContainers:
    """Manages test containers for the RAG stack: Vault, Qdrant, Langfuse, and the LLM orchestration service."""

    def __init__(self, compose_file_name: str = "docker-compose-test.yml"):
        self.project_root = Path(__file__).parent.parent
        self.compose_file_path = self.project_root / compose_file_name
        self.compose: Optional[DockerCompose] = None
        self.services_info: Dict[str, Dict[str, Any]] = {}

        if not self.compose_file_path.exists():
            raise FileNotFoundError(
                f"Docker compose file not found: {self.compose_file_path}"
            )

    def start(self) -> None:
        """Start all test containers and bootstrap Vault."""
        logger.info("Starting RAG Stack testcontainers...")
        os.environ["EVAL_MODE"] = "true"

        # Download embeddings from Azure before starting containers
        self._download_embeddings_from_azure()

        # Prepare Vault Agent directories
        agent_in = self.project_root / "test-vault" / "agents" / "llm"
        agent_out = self.project_root / "test-vault" / "agent-out"
        agent_in.mkdir(parents=True, exist_ok=True)
        agent_out.mkdir(parents=True, exist_ok=True)

        # Clean up any stale files from previous runs
        for f in ["role_id", "secret_id", "token", "pidfile", "dummy"]:
            (agent_in / f).unlink(missing_ok=True)
            (agent_out / f).unlink(missing_ok=True)

        # Start all Docker Compose services
        logger.info("Starting Docker Compose services...")
        self.compose = DockerCompose(
            str(self.project_root),
            compose_file_name=self.compose_file_path.name,
            pull=False,
        )
        self.compose.start()

        # Get Vault connection details
        vault_url = self._get_vault_url()
        logger.info(f"Vault URL: {vault_url}")

        # Wait for Vault to be ready
        self._wait_for_vault_ready(vault_url)

        # Configure Vault with AppRole, policies, and test secrets
        self._bootstrap_vault_dev(agent_in, vault_url)

        # Verify credentials were written successfully
        role_id = (agent_in / "role_id").read_text().strip()
        secret_id = (agent_in / "secret_id").read_text().strip()
        logger.info(
            f"AppRole credentials written: role_id={role_id[:8]}..., secret_id={secret_id[:8]}..."
        )

        # Wait for Vault Agent to authenticate and write token
        logger.info("Waiting for vault-agent to authenticate...")
        self._wait_for_valid_token(agent_out / "token", vault_url, max_attempts=20)

        logger.info("Vault Agent authenticated successfully")

        # Wait for other services to be ready
        self._wait_for_services()
        self._collect_service_info()

        # Index test data into Qdrant
        self._index_test_data()

        logger.info("RAG Stack testcontainers ready")

    def stop(self) -> None:
        """Stop all test containers."""
        if self.compose:
            logger.info("Stopping RAG Stack testcontainers...")
            self.compose.stop()
            logger.info("Testcontainers stopped")

    def _download_embeddings_from_azure(self) -> None:
        """Download embeddings from Azure Blob Storage if configured."""
        connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
        container_name = os.getenv("AZURE_STORAGE_CONTAINER_NAME", "test-embeddings")
        blob_name = os.getenv("AZURE_STORAGE_BLOB_NAME", "test_embeddings.json")

        # Local path where embeddings should be saved
        embeddings_file = self.project_root / "tests" / "data" / "test_embeddings.json"

        # Require Azure configuration for CI/CD
        if not connection_string:
            raise ValueError(
                "AZURE_STORAGE_CONNECTION_STRING is required to download embeddings. "
                "Either set this environment variable or ensure test_embeddings.json "
                f"exists at {embeddings_file}"
            )

        logger.info("=" * 80)
        logger.info("DOWNLOADING EMBEDDINGS FROM AZURE BLOB STORAGE")
        logger.info("=" * 80)

        try:
            download_embeddings_from_azure(
                connection_string=connection_string,
                container_name=container_name,
                blob_name=blob_name,
                local_path=embeddings_file,
            )
            logger.info("Embeddings download complete")
        except Exception as e:
            logger.error(f"Failed to download embeddings from Azure: {e}")
            raise

    def _get_vault_url(self) -> str:
        """Get the mapped Vault URL accessible from the host."""
        if not self.compose:
            raise RuntimeError("Docker Compose not initialized")
        host = self.compose.get_service_host("vault", 8200)
        port = self.compose.get_service_port("vault", 8200)
        return f"http://{host}:{port}"

    def _wait_for_vault_ready(self, vault_url: str, timeout: int = 60) -> None:
        """Wait for Vault to be initialized and unsealed.

        Raises:
            TimeoutError: when Vault is not healthy within ``timeout`` seconds.
        """
        logger.info("Waiting for Vault to be available...")
        client = hvac.Client(url=vault_url, token="root", timeout=10)

        start = time.time()
        while time.time() - start < timeout:
            try:
                status = client.sys.read_health_status(method="GET")
                if status.get("initialized", False) and not status.get("sealed", True):
                    logger.info("Vault is available and unsealed")
                    return
            except Exception:
                pass
            time.sleep(2)

        # Fixed: the message previously hardcoded "60s" even when a caller
        # passed a different timeout.
        raise TimeoutError(f"Vault did not become available within {timeout}s")

    def _bootstrap_vault_dev(self, agent_in: Path, vault_url: str) -> None:
        """
        Bootstrap Vault dev instance with:
        - AppRole auth method
        - Policy for LLM orchestration service
        - AppRole role and credentials
        - Test secrets (LLM connections, Langfuse, embeddings, guardrails)
        """
        logger.info("Bootstrapping Vault with AppRole and test secrets...")
        client = hvac.Client(url=vault_url, token="root")

        # Enable AppRole authentication method
        if "approle/" not in client.sys.list_auth_methods():
            client.sys.enable_auth_method("approle")
            logger.info("AppRole enabled")

        # Create policy with permissions for all secret paths
        policy = """
path "secret/metadata/llm/*" { capabilities = ["list"] }
path "secret/data/llm/*" { capabilities = ["read"] }
path "secret/metadata/langfuse/*" { capabilities = ["list"] }
path "secret/data/langfuse/*" { capabilities = ["read"] }
path "secret/metadata/embeddings/*" { capabilities = ["list"] }
path "secret/data/embeddings/*" { capabilities = ["read"] }
path "secret/metadata/guardrails/*" { capabilities = ["list"] }
path "secret/data/guardrails/*" { capabilities = ["read"] }
path "auth/token/lookup-self" { capabilities = ["read"] }
path "auth/token/renew-self" { capabilities = ["update"] }
"""
        client.sys.create_or_update_policy("llm-orchestration", policy)
        logger.info("Policy 'llm-orchestration' created")

        # Create AppRole role with service token type
        role_name = "llm-orchestration-service"
        client.write(
            f"auth/approle/role/{role_name}",
            **{
                "token_policies": "llm-orchestration",
                "secret_id_ttl": "24h",
                "token_ttl": "1h",
                "token_max_ttl": "24h",
                "secret_id_num_uses": 0,
                "bind_secret_id": True,
                "token_no_default_policy": True,
                "token_type": "service",
            },
        )
        logger.info(f"AppRole '{role_name}' created")

        # Generate credentials for the AppRole
        role_id = client.read(f"auth/approle/role/{role_name}/role-id")["data"][
            "role_id"
        ]
        secret_id = client.write(f"auth/approle/role/{role_name}/secret-id")["data"][
            "secret_id"
        ]

        # Write credentials to files that Vault Agent will read
        (agent_in / "role_id").write_text(role_id, encoding="utf-8")
        (agent_in / "secret_id").write_text(secret_id, encoding="utf-8")
        logger.info("AppRole credentials written to agent-in/")

        # Write test secrets
        self._write_test_secrets(client)

    def _write_test_secrets(self, client: hvac.Client) -> None:
        """Write all test secrets to Vault with correct path structure.

        Raises:
            ValueError: if required env vars are missing or the secrets read
            back from Vault do not match what was written.
        """
        logger.info("=" * 80)
        logger.info("VAULT SECRET BOOTSTRAP - ENVIRONMENT VARIABLES DEBUG")
        logger.info("=" * 80)

        azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
        azure_api_key = os.getenv("AZURE_OPENAI_API_KEY")
        azure_deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT")
        azure_embedding_deployment = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")

        # Validate critical environment variables
        missing_vars = []
        if not azure_endpoint:
            missing_vars.append("AZURE_OPENAI_ENDPOINT")
        if not azure_api_key:
            missing_vars.append("AZURE_OPENAI_API_KEY")
        if not azure_embedding_deployment:
            missing_vars.append("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")

        if missing_vars:
            error_msg = f"CRITICAL: Missing required environment variables: {', '.join(missing_vars)}"
            logger.error(error_msg)
            raise ValueError(error_msg)

        logger.info("All required environment variables are set")
        logger.info("=" * 80)

        # ------------------------------------------------------------
        # CHAT MODEL SECRET (LLM path)
        # ------------------------------------------------------------
        logger.info("")
        logger.info("Writing LLM connection secret (chat model)...")
        llm_secret = {
            "connection_id": "evalconnection-1",
            "endpoint": azure_endpoint,
            "api_key": azure_api_key,
            "deployment_name": azure_deployment or "gpt-4o-mini",
            "environment": "development",
            "model": "gpt-4o-mini",
            "model_type": "chat",
            "api_version": "2024-02-15-preview",
            "tags": "azure,test,chat",
        }

        logger.info(f" → chat deployment: {llm_secret['deployment_name']}")
        logger.info(f" → endpoint: {llm_secret['endpoint']}")
        logger.info(f" → connection_id: {llm_secret['connection_id']}")

        client.secrets.kv.v2.create_or_update_secret(
            mount_point="secret",
            path="llm/connections/azure_openai/development/evalconnection-1",
            secret=llm_secret,
        )
        logger.info(
            "LLM connection secret written to llm/connections/azure_openai/development/evalconnection-1"
        )

        # ------------------------------------------------------------
        # EMBEDDING MODEL SECRET (Embeddings path)
        # ------------------------------------------------------------
        logger.info("")
        logger.info("Writing embedding model secret...")
        embedding_secret = {
            "connection_id": "evalconnection-1",
            "endpoint": azure_endpoint,
            "api_key": azure_api_key,
            "deployment_name": azure_embedding_deployment,  # embedding deployment
            "environment": "development",
            "model": "text-embedding-3-large",
            "model_type": "embedding",
            "api_version": "2024-02-15-preview",
            "max_tokens": 2048,
            "vector_size": 3072,
            "tags": "azure,embedding,test",
        }

        logger.info(f" → model: {embedding_secret['model']}")
        logger.info(f" → connection_id: {embedding_secret['connection_id']}")
        logger.info(
            " → Vault path: embeddings/connections/azure_openai/development/evalconnection-1"
        )

        client.secrets.kv.v2.create_or_update_secret(
            mount_point="secret",
            path="embeddings/connections/azure_openai/development/evalconnection-1",
            secret=embedding_secret,
        )
        logger.info(
            "Embedding secret written to embeddings/connections/azure_openai/development/evalconnection-1"
        )

        # ------------------------------------------------------------
        # VERIFY SECRETS WERE WRITTEN CORRECTLY
        # ------------------------------------------------------------
        logger.info("")
        logger.info("Verifying secrets in Vault...")
        try:
            # Verify LLM path
            verify_llm = client.secrets.kv.v2.read_secret_version(
                path="llm/connections/azure_openai/development/evalconnection-1",
                mount_point="secret",
            )
            llm_data = verify_llm["data"]["data"]
            logger.info("LLM path verified:")
            logger.info(f" • connection_id: {llm_data.get('connection_id')}")

            # Verify embeddings path
            verify_embedding = client.secrets.kv.v2.read_secret_version(
                path="embeddings/connections/azure_openai/development/evalconnection-1",
                mount_point="secret",
            )
            embedding_data = verify_embedding["data"]["data"]
            logger.info("Embeddings path verified:")
            logger.info(f" • model: {embedding_data.get('model')}")
            logger.info(f" • connection_id: {embedding_data.get('connection_id')}")

            # Critical validation: read-back must match what we wrote.
            if embedding_data.get("deployment_name") != azure_embedding_deployment:
                error_msg = (
                    "VAULT SECRET MISMATCH! "
                    f"Expected deployment_name='{azure_embedding_deployment}' "
                    f"but Vault has '{embedding_data.get('deployment_name')}'"
                )
                logger.error(error_msg)
                raise ValueError(error_msg)

            if embedding_data.get("connection_id") != "evalconnection-1":
                error_msg = (
                    "VAULT SECRET MISMATCH! "
                    "Expected connection_id='evalconnection-1' "
                    f"but Vault has '{embedding_data.get('connection_id')}'"
                )
                logger.error(error_msg)
                raise ValueError(error_msg)

            logger.info("Secret verification PASSED")

        except Exception as e:
            logger.error(f"Failed to verify secrets: {e}")
            raise

        # ------------------------------------------------------------
        # LANGFUSE CONFIGURATION
        # ------------------------------------------------------------
        logger.info("")
        logger.info("Writing Langfuse configuration secret...")
        langfuse_secret = {
            "public_key": "pk-lf-test",
            "secret_key": "sk-lf-test",
            "host": "http://langfuse-web:3000",
        }
        client.secrets.kv.v2.create_or_update_secret(
            mount_point="secret", path="langfuse/config", secret=langfuse_secret
        )
        logger.info("Langfuse configuration secret written")

        logger.info("=" * 80)
        logger.info("ALL SECRETS WRITTEN SUCCESSFULLY")
        logger.info("=" * 80)

    def _capture_service_logs(self) -> None:
        """Capture logs from all services before cleanup."""
        services = ["llm-orchestration-service", "vault", "qdrant", "langfuse-web"]

        for service in services:
            try:
                logger.info(f"\n{'=' * 60}")
                logger.info(f"LOGS: {service}")
                logger.info("=" * 60)

                result = subprocess.run(
                    [
                        "docker",
                        "compose",
                        "-f",
                        str(self.compose_file_path),
                        "logs",
                        "--tail",
                        "200",
                        service,
                    ],
                    capture_output=True,
                    text=True,
                    timeout=10,
                    cwd=str(self.project_root),
                )

                if result.stdout:
                    logger.info(result.stdout)
                if result.stderr:
                    logger.error(result.stderr)

            except Exception as e:
                logger.error(f"Failed to capture logs for {service}: {e}")

    def _wait_for_valid_token(
        self, token_path: Path, vault_url: str, max_attempts: int = 20
    ) -> None:
        """Wait for Vault Agent to write a valid token and verify it works.

        Raises:
            TimeoutError: if no valid token appears within ``max_attempts``.
        """
        for attempt in range(max_attempts):
            if token_path.exists() and token_path.stat().st_size > 0:
                try:
                    # Fix permissions before reading
                    self._fix_token_file_permissions(token_path)

                    token = token_path.read_text().strip()

                    client = hvac.Client(url=vault_url, token=token)
                    try:
                        client.lookup_token()

                        if client.is_authenticated():
                            logger.info(f"Valid token obtained (attempt {attempt + 1})")
                            self._verify_token_permissions(client)
                            return
                    except Exception as e:
                        if attempt < max_attempts - 1:
                            logger.debug(
                                f"Token validation error (attempt {attempt + 1}): {type(e).__name__}"
                            )
                except PermissionError as e:
                    logger.warning(
                        f"Permission error reading token file (attempt {attempt + 1}): {e}"
                    )
                    # Try to fix permissions again
                    self._fix_token_file_permissions(token_path, force=True)

            time.sleep(2)

        logger.error("Failed to obtain valid Vault token")
        self._check_agent_logs()
        raise TimeoutError(
            f"Failed to obtain valid Vault token after {max_attempts} attempts"
        )

    def _fix_token_file_permissions(
        self, token_path: Path, force: bool = False
    ) -> None:
        """Fix permissions on token file to make it readable by host user."""
        try:
            # Requires the Docker CLI to be accessible from the host.
            if force:
                logger.info(
                    "Attempting to fix token file permissions using docker exec..."
                )
                result = subprocess.run(
                    [
                        "docker",
                        "exec",
                        "vault-agent-llm",
                        "chmod",
                        "644",
                        "/agent/out/token",
                    ],
                    capture_output=True,
                    text=True,
                    timeout=5,
                )
                if result.returncode == 0:
                    logger.info(
                        "Successfully fixed token file permissions via docker exec"
                    )
                else:
                    logger.warning(
                        f"Failed to fix permissions via docker exec: {result.stderr}"
                    )

            # Also try direct chmod (may not work in all environments)
            try:
                os.chmod(token_path, 0o644)
            except Exception as chmod_error:
                logger.debug(
                    f"Direct chmod failed (expected in some environments): {chmod_error}"
                )

        except Exception as e:
            logger.debug(f"Could not fix token file permissions: {e}")

    def _verify_token_permissions(self, client: hvac.Client) -> None:
        """Verify the token has correct permissions to read secrets."""
        try:
            client.secrets.kv.v2.read_secret_version(
                path="llm/connections/azure_openai/development/evalconnection-1",
                mount_point="secret",
            )
            logger.info("Token has correct permissions to read secrets")
        except Exception as e:
            logger.error(f"Token cannot read secrets: {e}")
            raise

    def _check_agent_logs(self) -> None:
        """Check vault-agent logs for debugging authentication issues."""
        # Fixed: a timeout keeps a wedged docker CLI from hanging the suite.
        result = subprocess.run(
            ["docker", "logs", "--tail", "50", "vault-agent-llm"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        logger.error(f"Vault Agent Logs:\n{result.stdout}\n{result.stderr}")

    def _wait_for_services(self, total_timeout: int = 300) -> None:
        """Wait for all services to be healthy."""
        services = [
            ("qdrant", 6333, self._check_qdrant, 60),
            ("langfuse-web", 3000, self._check_langfuse, 120),
            ("llm-orchestration-service", 8100, self._check_orchestration, 180),
        ]
        start = time.time()
        for name, port, check, timeout in services:
            self._wait_single(name, port, check, timeout, start, total_timeout)

    def _wait_single(
        self,
        name: str,
        port: int,
        check: Any,
        timeout: int,
        global_start: float,
        total_timeout: int,
    ) -> None:
        """Wait for a single service to be ready.

        Raises:
            TimeoutError: if the service is not ready within ``timeout``.
        """
        if self.compose is None:
            return

        logger.info(f"Waiting for {name}...")
        start = time.time()
        while time.time() - start < timeout:
            try:
                host = self.compose.get_service_host(name, port)
                mapped_port = self.compose.get_service_port(name, port)
                if check(host, mapped_port):
                    logger.info(f"{name} ready at {host}:{mapped_port}")
                    self.services_info[name] = {
                        "host": host,
                        "port": mapped_port,
                        "url": f"http://{host}:{mapped_port}",
                    }
                    return
            except Exception:
                pass
            time.sleep(3)
        raise TimeoutError(f"Timeout waiting for {name}")

    def _check_qdrant(self, host: str, port: int) -> bool:
        """Check if Qdrant is ready."""
        try:
            r = requests.get(f"http://{host}:{port}/collections", timeout=5)
            return r.status_code == 200
        except Exception:
            return False

    def _check_langfuse(self, host: str, port: int) -> bool:
        """Check if Langfuse is ready."""
        try:
            r = requests.get(f"http://{host}:{port}/api/public/health", timeout=5)
            return r.status_code == 200
        except Exception:
            return False

    def _check_orchestration(self, host: str, port: int) -> bool:
        """Check if LLM orchestration service is healthy."""
        try:
            r = requests.get(f"http://{host}:{port}/health", timeout=5)
            return r.status_code == 200 and r.json().get("status") == "healthy"
        except Exception:
            return False

    def _collect_service_info(self) -> None:
        """Collect service connection information."""
        if self.compose:
            self.services_info["vault"] = {
                "host": self.compose.get_service_host("vault", 8200),
                "port": self.compose.get_service_port("vault", 8200),
                "url": self._get_vault_url(),
            }

    def get_orchestration_service_url(self) -> str:
        """Get the URL for the LLM orchestration service."""
        return self.services_info["llm-orchestration-service"]["url"]

    def get_qdrant_url(self) -> str:
        """Get the URL for Qdrant."""
        return self.services_info["qdrant"]["url"]

    def get_vault_url(self) -> str:
        """Get the URL for Vault."""
        return self.services_info["vault"]["url"]

    def get_langfuse_url(self) -> str:
        """Get the URL for Langfuse."""
        return self.services_info.get("langfuse-web", {}).get(
            "url", "http://localhost:3000"
        )

    def is_service_available(self, service_name: str) -> bool:
        """Check if a service is available."""
        return service_name in self.services_info

    def _index_test_data(self) -> None:
        """Index test documents into Qdrant for retrieval testing."""
        logger.info("Indexing test data into Qdrant contextual collections...")

        try:
            from tests.helpers.test_data_loader import load_test_data_into_qdrant

            load_test_data_into_qdrant(
                orchestration_url=self.get_orchestration_service_url(),
                qdrant_url=self.get_qdrant_url(),
            )

            logger.info("Test data indexing complete")

        except Exception as e:
            logger.error(f"Failed to index test data: {e}")
            raise


# ===================== Pytest Fixtures =====================


@pytest.fixture(scope="session")
def rag_stack() -> Generator[RAGStackTestContainers, None, None]:
    """
    Session-scoped fixture that starts all test containers once per test session.
    Containers are automatically stopped after all tests complete.
    """
    stack = RAGStackTestContainers()
    try:
        stack.start()
        yield stack
    except Exception as startup_error:
        # Capture logs before cleanup so startup failures are diagnosable.
        logger.error(f"RAG stack startup failed: {startup_error}")
        try:
            stack._capture_service_logs()
        except Exception as log_error:
            # Fixed: original shadowed the outer exception variable `e` and
            # had a redundant `pass` after the log call.
            logger.error(f"Could not capture logs after startup failure: {log_error}")
        raise
    finally:
        logger.info("=" * 80)
        logger.info("CAPTURING SERVICE LOGS BEFORE CLEANUP")
        logger.info("=" * 80)
        try:
            stack._capture_service_logs()
        except Exception as log_error:
            logger.error(f"Could not capture logs: {log_error}")
        stack.stop()
+ """ + stack = RAGStackTestContainers() + try: + stack.start() + yield stack + except Exception as e: + # If startup fails, capture logs before cleanup + logger.error(f"RAG stack startup failed: {e}") + try: + stack._capture_service_logs() + except Exception as e: + logger.error(f"Could not capture logs after startup failure: {e}") + pass + raise + finally: + logger.info("=" * 80) + logger.info("CAPTURING SERVICE LOGS BEFORE CLEANUP") + logger.info("=" * 80) + try: + stack._capture_service_logs() + except Exception as e: + logger.error(f"Could not capture logs: {e}") + stack.stop() + + +@pytest.fixture(scope="function") +def orchestration_client(rag_stack: RAGStackTestContainers): + """ + Function-scoped fixture that provides the orchestration service URL. + Tests can use either requests (sync) or httpx (async). + """ + class OrchestrationClient: + def __init__(self, base_url: str): + self.base_url = base_url + + return OrchestrationClient(rag_stack.get_orchestration_service_url()) \ No newline at end of file diff --git a/tests/data/test_dataset.json b/tests/data/test_dataset.json index 259ba59..0b09b6e 100644 --- a/tests/data/test_dataset.json +++ b/tests/data/test_dataset.json @@ -1,183 +1,116 @@ [ { - "input": "How flexible will pensions become in 2021?", - "expected_output": "In 2021, pensions will become more flexible allowing people to choose the most suitable time for retirement, partially withdraw their pension, or stop pension payments if they wish, effectively creating their own personal pension plan.", - "retrieval_context": [ - "In 2021, the pension will become more flexible. People will be able to choose the most suitable time for their retirement, partially withdraw their pension or stop payment of their pension if they wish, in effect creating their own personal pension plan." 
- ], - "category": "pension_information", - "language": "en" - }, - { - "input": "Когда изменятся расчеты пенсионного возраста?", - "expected_output": "Начиная с 2027 года расчеты пенсионного возраста будут основываться на ожидаемой продолжительности жизни 65-летних людей. Пенсионная система таким образом будет соответствовать демографическим изменениям.", - "retrieval_context": [ - "Starting in 2027, retirement age calculations will be based on the life expectancy of 65-year-olds. The pension system will thus be in line with demographic developments." - ], - "category": "pension_information", - "language": "ru" + "input": "Mida teha kui mobiil-ID kasutamisel kinnituskood ei ilmu mobiilile?", + "expected_output": "Veendu, et sinu telefon on mobiilvõrgu levialas ja mobiilne andmeside on sisse lülitatud. Lisaks tuleb kontrollida, kas mobiilsidevõrgus pole hetkel katkestusi. Mõnikord võib abiks olla telefoni taaskäivitamine ja/või selle võrguseadete lähtestamine.", + "category": "mobile_id_usage", + "language": "et" }, { - "input": "Kui palju raha maksti peredele 2021. aastal?", - "expected_output": "2021. aastal maksti peredele kokku umbes 653 miljonit eurot toetusi, sealhulgas umbes 310 miljonit eurot peretoetuste eest ja 280 miljonit eurot lapsetoetuste eest.", - "retrieval_context": [ - "In 2021, a total of approximately 653 million euros in benefits were paid to families. Approximately 310 million euros for family benefits; Approximately 280 million euros for parental benefit." - ], - "category": "family_benefits", + "input": "Mida teha, kui minu telefon Mobiil-ID-ga varastatakse?", + "expected_output": "Kui sinu telefon Mobiil-ID-ga on varastatud, tuleks kohe teavitada oma mobiilsideoperaatorit, et peatada mobiil-ID sertifikaadid. See aitab kaitsta sind identiteedivarguse eest. 
Kui leiad oma telefoni uuesti, saad jätkata vana SIM-kaardi kasutamist, kuid mobiil-ID funktsionaalsuse taastamiseks on vajalik tellida uus Mobiil-ID SIM-kaart operaatorilt.", + "category": "digital_identity_security", "language": "et" }, { - "input": "Сколько семей получает поддержку для многодетных семей?", - "expected_output": "23,687 семей и 78,296 детей получают поддержку для многодетных семей, включая 117 семей с семью или более детьми.", - "retrieval_context": [ - "23,687 families and 78,296 children receive support for families with many children, including 117 families with seven or more children." - ], - "category": "family_benefits", - "language": "ru" + "input": "Mida teha, kui mu ID-kaart on kadunud", + "expected_output": "Kui sinu ID-kaart on kadunud, tuleks tegutseda järgmiselt: Mõtle läbi, kas kaart võib olla turvalises kohas – näiteks kontorilaual või mujal, kus seda hiljuti kasutasid. Peata ID-kaardiga seotud sertifikaadid, helistades numbril +372 677 3377. Sertifikaatide peatamine tähendab, et kaarti ei saa kasutada elektroonilistes kanalites, kuid see kehtib endiselt füüsilise isikut tõendava dokumendina (nt reisidokumendina). Kui oled kindel, et kaart on varastatud, teavita politseid ja lase ID-kaart kehtetuks tunnistada. Kui kaardi kadumine toimub välismaal, saad Eesti välisesindusest taotleda ajutise tagasipöördumise tunnistuse. Kui vajad täiendavat abi, võid pöörduda Politsei- ja Piirivalveameti teenindusse.", + "category": "digital_identity_security", + "language": "et" }, { - "input": "How many single parents receive support?", - "expected_output": "8,804 parents and 10,222 children receive single parent support.", - "retrieval_context": [ - "8,804 parents and 1,0222 children receive single parent support." 
- ], - "category": "single_parent_support", - "language": "en" + "input": "Millised mobiilioperaatorid pakuvad Mobiil-ID-d Eestis?", + "expected_output": "Mobiil-ID SIM-kaarte Eestis väljustavad kolm mobiilioperaatorit: Telia, Elisa ja Tele2. Mobiil-ID SIM-kaarte saab taotleda nende operaatorite esindustes. Igal operaatoril on Mobiil-ID väljastamiseks oma reeglid, näiteks vanusepiirangud, teenusetasud ja kasutustingimused, kuid riiklikud nõuded on kõigile ühesugused. Kõik operaatorid peavad järgima samu riiklikke nõudeid, tagades Mobiil-ID ohutuse ja ühesuguse toimimise sõltumata kasutatavast operaatorist.", + "category": "digital_identity", + "language": "et" }, { - "input": "Какие уровни бедности среди семей с одним родителем?", - "expected_output": "Семьи с одним родителем (в основном матери) находятся в группе наивысшего риска бедности: 5,3% живут в абсолютной бедности и 27,3% в относительной бедности.", - "retrieval_context": [ - "Single-parent (mostly mother) families are at the highest risk of poverty, of whom 5.3% live in absolute poverty and 27.3% in relative poverty." - ], - "category": "single_parent_support", - "language": "ru" + "input": "Kuidas taotleda uut ID-kaardi koodiümbrikku?", + "expected_output": "Uue koodiümbriku taotlemiseks tuleb pöörduda Politsei- ja Piirivalveameti teenindusse või Eesti Vabariigi välisesindusse. Kaasa tuleb võtta isikut tõendav dokument. Võimalusel valida mobiilne pilet enne teeninduse külastamist. Maksa riigilõiv, mis rakendub koodiümbriku väljastamisel. Riigilõivu määrad leiad Politsei- ja Piirivalveameti kodulehelt. 
Kui sinu ID-kaart on väljastatud pärast 17.11.2025, saad PUK-koodi vaadata ka Politsei- ja Piirivalveameti iseteenindusest, valides teenuse “ID-kaardi PUK-i vaatamine“", + "category": "id_card_usage", + "language": "et" + }, + { + "input": "Mis on eIDAS määrus?", + "expected_output": "eIDAS määrus (electronic IDentification, Authentication and trust Services) on Euroopa Liidus kehtiv e-identimise ja e-tehingute määrus, mille eesmärk on lihtsustada piiriülest elektrooniliste teenuste tarbimist ühtsustatud standardite ja tegutsemispõhimõtete kaudu. Määrus võeti vastu 23. juulil 2014 ja alates 1. juulist 2016 peavad Euroopa Liidu riigid tunnustama teineteise e-allkirju ning Eesti kodanike digiallkirju peab aktsepteerima ka teiste EL liikmesriikide avaliku sektori asutused.", + "category": "digital_identity", + "language": "et" + }, + { + "input": "Why am I getting an error when trying to sign documents in DigiDoc4?", + "expected_output": "This error occurs when your computer's clock time differs from the validation confirmation time. The system needs synchronized time to verify digital signatures properly. To fix this: Windows: Click the Start menu (Windows logo) in the bottom left corner, type \"Control Panel\" and open it. 
Then go to \"Date and Time\" settings and ensure your date, time, and time zone are correct. macOS: Open Spotlight Search and type \"System Settings\". Select \"General\", then adjust settings in \"Language & Region\" and \"Date & Time\". Ubuntu: Click \"Show applications\" icon in the bottom left corner, search for \"Date & time\" and open it. Before making changes, disable automatic configuration first. If the error persists: Disconnect any active VPN connection, restart your device, and disable antivirus/security software temporarily. If problems continue, contact ID support through their web form.", + "category": "digital_identity", + "language": "en" }, { - "input": "Where is ticket refunded without charge?", - "expected_output": "The ticket cost is refunded to the Elron travel card without service charge only if the refund request is submitted through the Elron homepage refund form.", - "retrieval_context": [ - "Pileti maksumus tagastatakse Elroni sõidukaardile teenustasuta ainult juhul, kui tagastussoov esitatakse Elroni kodulehe tagastusvormi kaudu." - ], - "category": "train_services", + "input": "What is an EORI number and do I need one?", + "expected_output": "An EORI (Economic Operators Registration and Identification) number is mandatory for businesses engaged in foreign trade when importing goods, transit shipments, exporting, or performing other customs operations. If you're doing any cross-border trade within or outside the EU, you need an EORI number. To get one: In Estonia, the Tax and Customs Board issues EORI numbers. You can apply electronically through the e-MTA service portal at https://maasikas.emta.ee/login/?lang=et. After logging in, select \"Customs\" → \"Rights and obligations\" → \"Economic Operators Registration and Identification number (EORI)\". You need access rights to submit an application - see the access rights section for details. 
If you don't have access to e-MTA, you can submit your application at a Tax and Customs Board service location.", + "category": "emta", "language": "en" }, { - "input": "Что сказала министр Кармен Йоллер о дезинформации в области здравоохранения?", - "expected_output": "Министр социальных дел Эстонии Кармен Йоллер заявила, что Европа должна действовать более совместно и скоординированно, чтобы остановить распространение дезинформации, связанной со здоровьем.", - "retrieval_context": [ - "Europe must act more jointly and in a more coordinated way to stop the spread of health-related misinformation, said Estonia's Minister of Social Affairs, Karmen Joller." - ], - "category": "health_cooperation", - "language": "ru" + "input": "How do I check if a business partner is trustworthy before doing business with them?", + "expected_output": "You should verify your business partner's background through multiple public databases and sources. This helps you avoid tax fraud chains and ensure your partner can fulfill their obligations. Here's how to check them: Essential checks: VAT registration: Check if they have a valid Estonian VAT number (KMKR) at https://apps.emta.ee/saqu/public/kmkrnr?lang=et. Without a valid VAT number, the seller cannot add VAT to transactions and the buyer cannot deduct input VAT. EU VAT number: For EU partners, verify their VAT number at https://ec.europa.eu/taxation_customs/vies/#/vat-validation. Check that the name matches the VAT number. Tax debt: Check for tax debts at https://apps.emta.ee/saqu/public/taxdebt?lang=et. Tax debt may indicate payment difficulties or problems meeting deadlines. Public data query: Get comprehensive company information at https://maasikas.emta.ee/rating/search including address, declared turnover, paid taxes, number of employees, outstanding declarations, and business bans. Business Register: Check company details at https://ariregister.rik.ee/ including registration code, legal form, share capital, and annual reports. 
Additional verification: Request tax data certificates directly through e-MTA - partners can send these showing their labor, salary, turnover, and tax debt data Check paid taxes tables to see how much they've paid in taxes and their declared turnover Search online for articles, forum discussions, payment default registers, and court decisions about the partner Check the Economic Activities Register (MTR) at https://mtr.ttja.ee/ for licenses and permits Use credit info databases like Inforegister, Creditinfo Estonia, or Äripäeva Infopank for credit ratings and payment behavior", + "category": "emta", + "language": "en" }, { - "input": "What agreement did Estonia sign with Ukraine?", - "expected_output": "Estonian Minister of Social Affairs Karmen Joller and Ukrainian Minister of Health Viktor Liashko signed the next stage of a health cooperation agreement to reinforce health collaboration and support Ukraine's healthcare system recovery.", - "retrieval_context": [ - "Estonian Minister of Social Affairs Karmen Joller and Ukrainian Minister of Health Viktor Liashko today signed the next stage of a health cooperation agreement.", - "The aim of the agreement is to reinforce health collaboration, support Ukraine's healthcare system recovery." - ], - "category": "health_cooperation", - "language": "en" + "input": "Kas ma saan kontrollida, kas minu äripartneril on karistusregister?", + "expected_output": "Jah, saate kontrollida karistusregistrit aadressil http://www.rik.ee/et/karistusregister 4 euro eest. Register sisaldab andmeid karistatud isikute (sh juriidiliste isikute) ja nende karistuste kohta. Kui registrist nähtub varasemaid maksualaseid süütegusid, on see riskitegur, et nendega äri tehes võite sattuda maksupettuse ahelasse. 
Korduvad rikkumised võivad viidata madalale seaduskuulekusele ja muuta tehingud riskantseks.", + "category": "emta", + "language": "et" }, { - "input": "Какая цель соглашения о сотрудничестве с Украиной?", - "expected_output": "Цель соглашения - укрепить сотрудничество в области здравоохранения и поддержать восстановление системы здравоохранения Украины.", - "retrieval_context": [ - "The aim of the agreement is to reinforce health collaboration, support Ukraine's healthcare system recovery." - ], - "category": "health_cooperation", + "input": "Как проверить, заслуживает ли деловой партнер доверия, перед началом сотрудничества?", + "expected_output": "Вам следует проверить информацию о вашем деловом партнере через несколько публичных баз данных и источников. Это поможет избежать цепочек налогового мошенничества и убедиться, что партнер может выполнить свои обязательства. Как их проверить: Основные проверки: Регистрация НДС: Проверьте, есть ли у них действительный эстонский номер плательщика НДС (KMKR) на сайте https://apps.emta.ee/saqu/public/kmkrnr?lang=et. Без действительного номера плательщика НДС продавец не имеет права добавлять НДС к сделке, а покупатель не может вычесть входящий НДС. Номер НДС ЕС: Для партнеров из ЕС проверьте номер НДС на сайте https://ec.europa.eu/taxation_customs/vies/#/vat-validation. Убедитесь, что название соответствует номеру НДС. Налоговая задолженность: Проверьте наличие налоговых долгов на сайте https://apps.emta.ee/saqu/public/taxdebt?lang=et. Налоговая задолженность может указывать на платежные трудности или проблемы с соблюдением сроков. Запрос публичных данных: Получите подробную информацию о компании на сайте https://maasikas.emta.ee/rating/search, включая адрес, декларированный оборот, уплаченные налоги, количество сотрудников, непредставленные декларации и запреты на деятельность. 
Коммерческий регистр: Проверьте данные компании на сайте https://ariregister.rik.ee/, включая регистрационный код, юридическую форму, уставный капитал и годовые отчеты. Дополнительная проверка: Запросите справки о налоговых данных напрямую через e-MTA - партнеры могут отправлять их, показывая данные о рабочей силе, зарплате, обороте и налоговой задолженности Проверьте таблицы уплаченных налогов, чтобы увидеть, сколько они заплатили налогов и декларировали оборот Поищите в интернете статьи, темы обсуждений на форумах, результаты регистра платежных нарушений и судебные решения о партнере Проверьте Регистр экономической деятельности (MTR) на сайте https://mtr.ttja.ee/ на наличие лицензий и разрешений Используйте базы данных кредитной информации, такие как Inforegister, Creditinfo Estonia или Äripäeva Infopank для кредитных рейтингов и платежного поведения", + "category": "emta", "language": "ru" }, { - "input": "Kuidas tellida sõiduplaani muudatuste teateid?", - "expected_output": "Saad tellida sõiduplaanimuudatuste ja hilinemiste teated e-postile.", - "retrieval_context": [ - "Telli sõiduplaanimuudatuste ja hilinemiste teated e-postile." - ], - "category": "train_services", + "input": "Kus saab ID-kaarti isikut tõendava dokumendina kasutada?", + "expected_output": "ID-kaarti saab kasutada isikut tõendava dokumendina, digitaalsete teenuste kasutamiseks, nagu internetipank ja e-maksuamet, samuti digitaalse allkirja andmiseks ja dokumentide krüpteerimiseks. Samuti on võimalik seda kasutada Euroopa Liidu ja Euroopa Majanduspiirkonna piires reisidokumendina.", + "category": "id_card_usage", "language": "et" }, { - "input": "Как подписаться на уведомления об изменениях в расписании поездов?", - "expected_output": "Вы можете подписаться на уведомления об изменениях расписания и опозданиях по электронной почте.", - "retrieval_context": [ - "Telli sõiduplaanimuudatuste ja hilinemiste teated e-postile." 
- ], - "category": "train_services", - "language": "ru" + "input": "Kuidas aktiveerida Mobiil-ID?", + "expected_output": "Mobiil-ID aktiveerimine toimub operaatorite iseteeninduses (Telia, Elisa, Tele2).", + "category": "digital_identity", + "language": "et" }, { - "input": "What are the contact details of the Ministry of Social Affairs?", - "expected_output": "Ministry of Social Affairs is located at Suur-Ameerika 1, 10122 Tallinn, phone +372 626 9301, email [email protected]. Open Monday-Thursday 8.30-17.15 and Friday 8.30-16.00.", - "retrieval_context": [ - "Ministry of Social Affairs Suur-Ameerika 1, 10122 Tallinn +372 626 9301 [email protected] Open Mon -Thu 8.30-17.15 and Fri 8.30-16.00" - ], - "category": "contact_information", - "language": "en" + "input": "Mis on Eesti sotsiaaluuring ja miks ma peaksin osalema?", + "expected_output": "Eesti sotsiaaluuring aitab hinnata leibkondade ja isikute sissetulekute jaotust, elamistingimusi ning sotsiaalset tõrjutust. Samasugust uuringut korraldatakse kõikides Euroopa Liidu riikides. Teie osalemine aitab kujundada Eesti sotsiaalpoliitikat ja arendada teenuseid. Miks see on oluline: Need andmed aitavad riigijuhtidel, teadlastel ja arvamusliidridel mõista inimeste toimetulekut, vaesust, ilmajäetust, sissetulekut, elamistingimusi ja arstiabi kättesaadavust. Iga uuringusse sattunud inimene esindab umbes sadat Eesti elanikku, seega on teie vastused olulised. Kuidas osaleda: Kui teid valitakse, saate teavituskirja e-postile (rahvastikuregistris märgitud aadressile) või postiga. Saate küsimustikule vastata veebis aadressil https://estat.stat.ee, logides sisse ID-kaardi, mobiil-ID või Smart-ID-ga, või osaleda telefoniintervjuus. Uuring koosneb kahest osast: leibkonnaküsimustik (vastab põhiisik) ja isikuküsimustik (võib vastata iga vähemalt 15-aastane leibkonnaliige). Osalemine on vabatahtlik, kuid Statistikaamet kaitseb kõigi vastajate andmeid riikliku statistika seaduse ja isikuandmete kaitse seaduse alusel. 
Iga leibkonda küsitletakse neli aastat järjest 12 kuu tagant.", + "category": "statistics", + "language": "et" }, { - "input": "Каковы контактные данные Министерства социальных дел?", - "expected_output": "Министерство социальных дел находится по адресу Суур-Амеэрика 1, 10122 Таллинн, телефон +372 626 9301, электронная почта [email protected]. Открыто понедельник-четверг 8.30-17.15 и пятница 8.30-16.00.", - "retrieval_context": [ - "Ministry of Social Affairs Suur-Ameerika 1, 10122 Tallinn +372 626 9301 [email protected] Open Mon -Thu 8.30-17.15 and Fri 8.30-16.00" - ], - "category": "contact_information", - "language": "ru" + "input": "Kas ma saan kodus elektritöid ise teha või vajan spetsialisti?", + "expected_output": "Enamik elektritöid nõuab ohutuse huvides kvalifitseeritud spetsialisti. Valesti tehtud elektritöö võib põhjustada elektrilöögi ning tulekahjuohtu. Siiski saate lihtsamaid töid ise teha, kui teil on vastavad teadmised. Mida VÕITE ise teha: Vahetada lüliteid, pistikupesi, lambipesi ja kaitsmeid (kuid MITTE paigaldada uusi) Parandada ja asendada juhtmelüliteid, lambipesi, pikendusjuhtmeid ja juhtmepistikuid Milleks PEATE palkama spetsialisti: Uute elektripaigaldiste ehitamine Uute pistikupesade ja lülitite paigaldamine Kohtkindlate kodumasinate ühendamine ja lahti ühendamine Kaitsekontaktita (maandamata) pistikupesade vahetamine kaitsekontaktiga (maandatud) pistikupesade vastu Elektritöö ettevõtjad peavad olema esitanud majandustegevuse registrisse majandustegevuseteatise ning neil peab olema tööde eest vastutav kompetentne elektritöö juht.", + "category": "ttja", + "language": "et" }, { - "input": "Сколько родителей-одиночек получают поддержку в Эстонии?", - "expected_output": "8,804 родителя и 10,222 ребенка получают поддержку для родителей-одиночек.", - "retrieval_context": [ - "8,804 parents and 1,0222 children receive single parent support." 
- ], - "category": "single_parent_support", - "language": "ru" + "input": "What is an electrical installation audit and when do I need one?", + "expected_output": "An electrical installation audit checks whether your electrical system meets safety requirements and is safe to use. During the audit, the auditor visually assesses the installation's condition, reviews documentation and test/measurement results, and performs additional control measurements if necessary. When you need an audit: Before commissioning: Required before putting a new or renovated building's electrical installation into use Periodic audits: Regular checks at intervals depending on the installation type and age. While not mandatory for residential spaces (private houses, apartments, summer cottages), you should still periodically check them to ensure safety and functionality How to get an audit: Only contractors with appropriate accreditation can perform audits Results and documents are digitally formatted in TTJA's information system at https://jvis.ttja.ee, where they're always accessible to the electrical installation owner For residential electrical system checks, contact a competent electrical professional or auditor who will perform necessary operations and provide feedback on the system's condition and safety.", + "category": "ttja", + "language": "en" }, { - "input": "Когда Министерство социальных дел начало искать решения для поддержки семей с одним родителем?", - "expected_output": "С января 2022 года Министерство социальных дел ищет решения для поддержки семей с одним родителем.", - "retrieval_context": [ - "Since January 2022, the Ministry of Social Affairs has been looking for solutions to support single-parent families." - ], - "category": "single_parent_support", - "language": "ru" + "input": "Mis on .asice ja .bdoc failide vahe?", + "expected_output": "Mõlemad on BDOC digiallkirjaformaadi alamformaadid. 
.asice (ASiC-E LT) on BDOC allkiri ajatempliga ja on parima rahvusvahelise ühilduvusega. DigiDoc4 kasutab kõikide uute dokumentide allkirjastamisel automaatselt .asice vormingut. .bdoc formaati kasutati varem, kuid uued allkirjad luuakse .asice formaadis.", + "category": "digital_signature", + "language": "et" }, { - "input": "Какова была численность населения Эстонии согласно прогнозам?", - "expected_output": "Согласно прогнозам, население Эстонии сократится с 1,31 миллиона до 1,11 миллиона к 2060 году. Количество людей в возрасте 18-63 лет уменьшится на 256,000 человек, или на 32%.", - "retrieval_context": [ - "According to forecasts, the population of Estonia will decrease from 1.31 million to 1.11 million by 2060. The number of people aged 18-63 will decrease by 256,000, or 32%." - ], - "category": "pension_information", - "language": "ru" + "input": "How long is the e-residency digi-ID valid for?", + "expected_output": "The e-residency digi-ID is valid for 5 years", + "category": "digital_identity", + "language": "en" }, { - "input": "Какая была новая инновационная программа стоимостью 12 миллионов евро?", - "expected_output": "На Фестивале социальных технологий была представлена новая инновационная программа стоимостью 12 миллионов евро, направленная на поддержку самостоятельной жизни пожилых людей и людей с ограниченными возможностями с помощью технологических решений.", - "retrieval_context": [ - "New €12 million innovation programme unveiled at Welfare Technology Festival aimed at supporting independent living for older adults and people with disabilities through technology-driven solutions." 
- ], - "category": "health_cooperation", + "input": "Предоставляет ли электронное резидентство эстонское гражданство или налоговое резидентство?", + "expected_output": "Нет, электронное резидентство не предоставляет эстонское гражданство или налоговое резидентство.", + "category": "digital_identity", "language": "ru" } ] \ No newline at end of file diff --git a/tests/deepeval_tests/red_team_tests.py b/tests/deepeval_tests/red_team_tests.py index 0413913..b71f1ec 100644 --- a/tests/deepeval_tests/red_team_tests.py +++ b/tests/deepeval_tests/red_team_tests.py @@ -4,6 +4,7 @@ import sys import datetime import pytest +import requests from deepteam import red_team from deepteam.attacks.single_turn import ( @@ -35,7 +36,6 @@ ) sys.path.insert(0, str(Path(__file__).parent.parent)) -from mocks.dummy_llm_orchestrator import process_query class ComprehensiveResultCollector: @@ -192,11 +192,9 @@ def setup_class(cls): ] cls.multi_turn_attacks = [ - LinearJailbreaking(turns=3, weight=1), # Reduced turns for faster execution + LinearJailbreaking(turns=3, weight=1), SequentialJailbreak(type="dialogue", persona="student", weight=1), - CrescendoJailbreaking( - max_rounds=5, max_backtracks=5, weight=1 - ), # Reduced complexity + CrescendoJailbreaking(max_rounds=5, max_backtracks=5, weight=1), ] # All vulnerabilities organized by category @@ -230,17 +228,46 @@ def setup_class(cls): f"Loaded {len(cls.all_vulnerabilities)} vulnerabilities for comprehensive testing" ) + # Store orchestration URL for use in tests + cls.orchestration_url = None + + def setup_method(self, method): + """Setup method called before each test - captures orchestration client.""" + # This will be set by the orchestration_client fixture + pass + async def model_callback(self, input_text: str, language: str = "en") -> str: - """Model callback function for DeepTeam red teaming.""" + """Model callback function for DeepTeam red teaming - uses orchestration API.""" try: if not input_text or len(input_text.strip()) == 
0: return "I cannot process empty queries." - result = process_query( - question=input_text, provider="anthropic", include_contexts=True + # Use the orchestration service endpoint + api_request = { + "chatId": f"redteam-{datetime.datetime.now().timestamp()}", + "message": input_text, + "authorId": "redteam-tester", + "conversationHistory": [], + "url": "https://test.example.com", + "environment": "test", + "connection_id": "evalconnection-1", + } + + response = requests.post( + f"{self.orchestration_url}/orchestrate-eval", + json=api_request, + timeout=60, ) - return result["response"] + response.raise_for_status() + result = response.json() + + return result.get("content", "No response generated") + + except requests.exceptions.RequestException as e: + print(f"API request failed: {e}") + return f"Error processing query: {str(e)}" except Exception as e: + print(f"Model callback error: {e}") return f"Error processing query: {str(e)}" def _evaluate_results(self, red_team_results: Any) -> bool: @@ -265,10 +292,14 @@ def _evaluate_results(self, red_team_results: Any) -> bool: print(f"Evaluation error: {e}") return False - def test_comprehensive_security_assessment(self): + def test_comprehensive_security_assessment(self, orchestration_client): """Run comprehensive security assessment covering all attack vectors and vulnerabilities.""" + # Get orchestration URL from fixture + self.orchestration_url = orchestration_client.base_url + print("\n" + "=" * 80) print("STARTING COMPREHENSIVE RAG SYSTEM SECURITY ASSESSMENT") + print(f"Orchestration URL: {self.orchestration_url}") print("=" * 80) start_time = datetime.datetime.now() diff --git a/tests/deepeval_tests/report_generator.py b/tests/deepeval_tests/report_generator.py index 2321cbe..0db45ac 100644 --- a/tests/deepeval_tests/report_generator.py +++ b/tests/deepeval_tests/report_generator.py @@ -155,22 +155,18 @@ def generate_failure_analysis(results: Dict[str, Any]) -> str: analysis += "| Test | Query | Metric | Score | 
Issue |\n" analysis += "|------|--------|--------|-------|-------|\n" - for failure in failed_results[:10]: # Limit to first 10 failures + for failure in failed_results: # Limit to first 10 failures query_preview = ( failure["input"][:50] + "..." if len(failure["input"]) > 50 else failure["input"] ) reason_preview = ( - failure["reason"][:100] + "..." - if len(failure["reason"]) > 100 - else failure["reason"] - ) + failure["reason"] ) analysis += f"| {failure['test_case']} | {query_preview} | {failure['metric']} | {failure['score']:.2f} | {reason_preview} |\n" - if len(failed_results) > 10: - analysis += f"\n*({len(failed_results) - 10} additional failures not shown)*\n" + analysis += "\n" return analysis diff --git a/tests/deepeval_tests/standard_tests.py b/tests/deepeval_tests/standard_tests.py index a30e284..19d4b33 100644 --- a/tests/deepeval_tests/standard_tests.py +++ b/tests/deepeval_tests/standard_tests.py @@ -4,6 +4,7 @@ from pathlib import Path import sys import datetime +import requests from deepeval.test_case import LLMTestCase from deepeval.metrics.answer_relevancy.answer_relevancy import AnswerRelevancyMetric from deepeval.metrics import ( @@ -12,9 +13,11 @@ ContextualRelevancyMetric, FaithfulnessMetric, ) +import asyncio +import httpx + sys.path.insert(0, str(Path(__file__).parent.parent)) -from mocks.dummy_llm_orchestrator import process_query class StandardResultCollector: @@ -107,8 +110,11 @@ def save_results_fixture(): standard_results_collector.save_results("pytest_captured_results.json") +import httpx # Replace requests with httpx +import asyncio + class TestRAGSystem: - """Test suite for RAG system evaluation using DeepEval metrics.""" + """Test suite for RAG system evaluation using DeepEval metrics via API.""" @classmethod def setup_class(cls): @@ -129,23 +135,6 @@ def setup_class(cls): print(f"Loaded {len(cls.test_data)} test cases") - def create_test_case( - self, data_item: Dict[str, Any], provider: str = "anthropic" - ) -> LLMTestCase: - 
"""Create a DeepEval test case from data item.""" - # Generate actual output using the dummy orchestrator - result = process_query( - question=data_item["input"], provider=provider, include_contexts=True - ) - - llm_test_case = LLMTestCase( - input=data_item["input"], - actual_output=result["response"], - expected_output=data_item["expected_output"], - retrieval_context=result["retrieval_context"], - ) - return llm_test_case - @pytest.mark.parametrize( "test_item", [ @@ -159,20 +148,76 @@ def create_test_case( ) ], ) - def test_all_metrics(self, test_item: Dict[str, Any]): - """Test all metrics for each test case and collect results.""" - test_case = self.create_test_case(test_item) + @pytest.mark.asyncio + async def test_all_metrics(self, test_item: Dict[str, Any], orchestration_client): + """Async version of DeepEval test with parallel metric execution.""" - # Get test case index for consistent numbering + orchestration_url = orchestration_client.base_url test_case_num = self.test_data.index(test_item) + 1 - print(f"\nTesting case {test_case_num}: {test_item['input'][:50]}...") - # Initialize metrics results - metrics_results = {} - failed_assertions = [] + # --- USE ASYNC HTTP CLIENT --- + result = None + async with httpx.AsyncClient(timeout=60.0) as client: + try: + response = await client.post( + f"{orchestration_url}/orchestrate-eval", + json={ + "chatId": f"test-{test_item.get('id', 'unknown')}", + "message": test_item["input"], + "authorId": "deepeval-tester", + "conversationHistory": [], + "url": "https://test.example.com", + "environment": "development", + "connection_id": "evalconnection-1", + }, + ) + response.raise_for_status() + result = response.json() + except httpx.RequestError as e: + result = {"content": f"API Error: {str(e)}", "retrieval_context": []} + except Exception as e: + result = {"content": f"Unexpected error: {str(e)}", "retrieval_context": []} + if result is None: + result = {"content": "No response received", "retrieval_context": []} 
+ # --- DEBUG LOGGING --- + print("=" * 80) + print(f"TEST CASE {test_case_num} API RESPONSE DEBUG") + print("=" * 80) + print(f"Response keys: {list(result.keys())}") + for key, value in result.items(): + + print(key, value) + print(f"Content length: {len(result.get('content', ''))}") + print(f"Retrieval context: {len(result.get('retrieval_context', []))} chunks") + + if result.get('retrieval_context'): + for chunk in result['retrieval_context']: + print(chunk.keys()) + context = chunk.get('content', '') if isinstance(chunk, dict) else str(chunk) + meta = chunk.get('metadata', {}) if isinstance(chunk, dict) else {} + fused_score = meta.get('fused_score', 'N/A') + bm25_score = meta.get('bm25_score', 'N/A') + semantic_score = meta.get('semantic_score', 'N/A') + print(f"Chunk (fused: {fused_score}, bm25: {bm25_score}, semantic: {semantic_score}):\n {context}\n\n") + else: + print("WARNING: No retrieval context returned!") + print("=" * 80) - # Define all metrics to test + retrieval_context = result.get("retrieval_context") or [] + retrieval_context = [ + c.get("content", "") if isinstance(c, dict) else str(c) + for c in retrieval_context + ] + + llm_test_case = LLMTestCase( + input=test_item["input"], + actual_output=result.get("content", ""), + expected_output=test_item["expected_output"], + retrieval_context=retrieval_context, + ) + + # --- Run metrics concurrently --- metrics = [ ("contextual_precision", self.contextual_precision), ("contextual_recall", self.contextual_recall), @@ -181,38 +226,28 @@ def test_all_metrics(self, test_item: Dict[str, Any]): ("faithfulness", self.faithfulness), ] - # Test each metric and collect results - for metric_name, metric in metrics: + async def run_metric(metric_name, metric): try: - metric.measure(test_case) + await asyncio.to_thread(metric.measure, llm_test_case) score = metric.score - passed = score >= 0.7 - reason = metric.reason - - metrics_results[metric_name] = { + return metric_name, { "score": score, - "passed": 
passed, - "reason": reason, + "passed": score >= 0.4, + "reason": metric.reason, } - - print(f" {metric_name}: {score:.3f} ({'PASS' if passed else 'FAIL'})") - - # Collect failed assertions but don't raise immediately - if not passed: - failed_assertions.append( - f"{metric_name} failed for query: '{test_item['input']}'. " - f"Score: {score}, Reason: {reason}" - ) - except Exception as e: - metrics_results[metric_name] = { + return metric_name, { "score": 0.0, "passed": False, "reason": f"Error: {str(e)}", } - failed_assertions.append(f"{metric_name} error: {str(e)}") - # Always add results to collector, regardless of pass/fail + metric_results_list = await asyncio.gather( + *(run_metric(name, metric) for name, metric in metrics) + ) + metrics_results = dict(metric_results_list) + + # --- Collect results --- try: standard_results_collector.add_test_result( test_case_num=test_case_num, @@ -224,7 +259,7 @@ def test_all_metrics(self, test_item: Dict[str, Any]): except Exception as e: print(f"Error adding test result: {e}") - # Now raise assertion if any metrics failed (for pytest reporting) - if failed_assertions: - # Just raise the first failure to keep pytest output clean - raise AssertionError(failed_assertions[0]) + # --- Assert --- + failed = [name for name, res in metrics_results.items() if not res["passed"]] + if failed: + pytest.fail(f"Metrics failed: {', '.join(failed)} for input: {test_item['input'][:50]}") diff --git a/tests/helpers/test_data_loader.py b/tests/helpers/test_data_loader.py new file mode 100644 index 0000000..8296798 --- /dev/null +++ b/tests/helpers/test_data_loader.py @@ -0,0 +1,174 @@ +"""Helper module to load test data into Qdrant before running tests.""" +import json +import uuid +from typing import List, Dict, Any, Tuple +from pathlib import Path +from loguru import logger +from datetime import datetime +import httpx + + +def load_test_data_into_qdrant( + orchestration_url: str, + qdrant_url: str, +) -> None: + """Load test documents 
into Qdrant contextual collections for retrieval testing.""" + logger.info("Loading test data into Qdrant contextual collections...") + + # Load pre-computed embeddings + embeddings_file = Path(__file__).parent.parent / "data" / "test_embeddings.json" + + if not embeddings_file.exists(): + raise FileNotFoundError( + f"Pre-computed embeddings not found at {embeddings_file}. " + "Run create_embeddings.py first!" + ) + + logger.info(f"Loading pre-computed embeddings from {embeddings_file}") + chunks_data, model_used = load_precomputed_embeddings(embeddings_file) + + # Index into Qdrant + index_embeddings_to_qdrant( + qdrant_url=qdrant_url, + chunks_data=chunks_data, + model_used=model_used + ) + + +def load_precomputed_embeddings( + embeddings_file: Path +) -> Tuple[List[Dict[str, Any]], str]: + """Load pre-computed embeddings from file.""" + with open(embeddings_file, "r", encoding="utf-8") as f: + data = json.load(f) + + chunks_data = data["chunks"] + model_used = data["model_used"] + + logger.info(f"Loaded {len(chunks_data)} pre-computed chunks") + logger.info(f" Vector size: {data['vector_size']}") + logger.info(f" Model: {model_used}") + logger.info(f" Documents: {data['total_documents']}") + + return chunks_data, model_used + + +def index_embeddings_to_qdrant( + qdrant_url: str, + chunks_data: List[Dict[str, Any]], + model_used: str +) -> None: + """Index embeddings into Qdrant.""" + if not chunks_data: + logger.warning("No chunks to index") + return + + vector_size = chunks_data[0]["vector_dimensions"] + collection_name = _determine_collection_from_model(model_used) + + logger.info(f"Indexing into Qdrant collection: {collection_name}") + + client = httpx.Client(timeout=30.0) + + try: + # Check if collection exists + response = client.get(f"{qdrant_url}/collections/{collection_name}") + + if response.status_code == 404: + logger.info(f"Creating collection '{collection_name}'...") + create_payload = { + "vectors": { + "size": vector_size, + "distance": "Cosine", 
+ }, + "optimizers_config": {"default_segment_number": 2}, + "replication_factor": 1, + } + + response = client.put( + f"{qdrant_url}/collections/{collection_name}", + json=create_payload + ) + + if response.status_code not in [200, 201]: + raise RuntimeError(f"Failed to create collection: {response.text}") + + logger.info(f"Created collection '{collection_name}'") + else: + logger.info(f"Collection '{collection_name}' already exists") + + # Prepare points + points = [] + for chunk in chunks_data: + point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk["chunk_id"])) + + payload = { + "chunk_id": chunk["chunk_id"], + "document_hash": chunk["document_hash"], + "chunk_index": chunk["chunk_index"], + "total_chunks": chunk["total_chunks"], + "original_content": chunk["original_content"], + "contextual_content": chunk["contextual_content"], + "context_only": chunk["context"], + "embedding_model": chunk["embedding_model"], + "vector_dimensions": chunk["vector_dimensions"], + "document_url": chunk["metadata"].get("source", "test_document"), + "dataset_collection": chunk["metadata"].get("dataset_collection", "test_collection"), + "processing_timestamp": datetime.now().isoformat(), + "tokens_count": chunk["tokens_count"], + **chunk["metadata"], + } + + points.append({ + "id": point_id, + "vector": chunk["embedding"], + "payload": payload + }) + + # Upsert points in batches + batch_size = 100 + for i in range(0, len(points), batch_size): + batch = points[i : i + batch_size] + upsert_payload = {"points": batch} + + response = client.put( + f"{qdrant_url}/collections/{collection_name}/points", + json=upsert_payload, + ) + + if response.status_code not in [200, 201]: + raise RuntimeError(f"Failed to upsert points: {response.text}") + + logger.info(f"Indexed batch {i // batch_size + 1} ({len(batch)} points)") + + client.close() + + logger.info(f" Successfully indexed {len(points)} chunks into Qdrant") + + except Exception as e: + logger.error(f"Failed to index to Qdrant: {e}") + 
raise + + +def _determine_collection_from_model(model_name: str) -> str: + """Determine which Qdrant collection to use based on embedding model.""" + model_lower = model_name.lower() + + # Azure OpenAI models -> contextual_chunks_azure + if any( + keyword in model_lower for keyword in ["azure", "text-embedding", "ada-002"] + ): + return "contextual_chunks_azure" + + # AWS Bedrock models -> contextual_chunks_aws + elif any( + keyword in model_lower for keyword in ["titan", "amazon", "aws", "bedrock"] + ): + return "contextual_chunks_aws" + + # Default to Azure collection + else: + logger.warning( + f"Unknown model {model_name}, defaulting to contextual_chunks_azure" + ) + return "contextual_chunks_azure" \ No newline at end of file diff --git a/tests/mocks/__init__.py b/tests/mocks/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/mocks/dummy_llm_orchestrator.py b/tests/mocks/dummy_llm_orchestrator.py deleted file mode 100644 index 12332f9..0000000 --- a/tests/mocks/dummy_llm_orchestrator.py +++ /dev/null @@ -1,274 +0,0 @@ -import os -from typing import List, Dict, Any -from dotenv import load_dotenv -import anthropic - -load_dotenv() - - -class MockQdrantRetriever: - """Mock implementation of Qdrant vector database with predefined test data.""" - - def __init__(self): - self.knowledge_base: Dict[str, List[str]] = { - "pension": [ - "In 2021, the pension will become more flexible. People will be able to choose the most suitable time for their retirement, partially withdraw their pension or stop payment of their pension if they wish, in effect creating their own personal pension plan.", - "Starting in 2027, retirement age calculations will be based on the life expectancy of 65-year-olds. 
The pension system will thus be in line with demographic developments.", - "From 2021, the formula for the state old-age pension will be upgraded - starting in 2021, we will start collecting the so-called joint part.", - ], - "family_benefits": [ - "In 2021, a total of approximately 653 million euros in benefits were paid to families. Approximately 310 million euros for family benefits; Approximately 280 million euros for parental benefit.", - "The Estonian parental benefit system is one of the most generous in the world, both in terms of the length of the period covered by the benefit and the amount of the benefit.", - "23,687 families and 78,296 children receive support for families with many children, including 117 families with seven or more children.", - ], - "single_parent": [ - "8,804 parents and 1,0222 children receive single parent support.", - "Single-parent (mostly mother) families are at the highest risk of poverty, of whom 5.3% live in absolute poverty and 27.3% in relative poverty.", - "Since January 2022, the Ministry of Social Affairs has been looking for solutions to support single-parent families.", - ], - "train_tickets": [ - "Ticket refund is only possible if at least 60 minutes remain until the departure of the trip.", - "The ticket cost is refunded to the Elron travel card without service charge only if the refund request is submitted through the Elron homepage refund form.", - "If ticket refund is requested to a bank account, a service fee of 1 euro is deducted from the refundable amount.", - ], - "health_cooperation": [ - "Europe must act more jointly and in a more coordinated way to stop the spread of health-related misinformation, said Estonia's Minister of Social Affairs, Karmen Joller.", - "Estonian Minister of Social Affairs Karmen Joller and Ukrainian Minister of Health Viktor Liashko today signed the next stage of a health cooperation agreement.", - "The aim of the agreement is to reinforce health collaboration, support Ukraine's 
healthcare system recovery.", - ], - } - - def retrieve(self, query: str, top_k: int = 3) -> List[str]: - """Mock hybrid vector + BM25 search and re-ranking.""" - query_lower = query.lower() - - # Simple keyword matching for mock retrieval - relevant_contexts: list[str] = [] - - # Check for topic keywords in query (expanded multilingual support) - topic_keywords = { - "pension": [ - "pension", - "pensioni", - "pensionieaarvutus", - "retirement", - "vanaduspension", - "пенсия", - "пенсионный", - "возраст", - "расчеты", - "гибк", - ], - "family_benefits": [ - "family", - "benefit", - "toetus", - "pere", - "lapsetoetus", - "parental", - "семья", - "пособие", - "семейный", - "родитель", - "дети", - "поддержка", - "palju", - "raha", - "maksti", - "peredele", - ], - "single_parent": [ - "single", - "parent", - "üksikvanem", - "poverty", - "vaesus", - "одиночек", - "родител", - "бедност", - "поддержка", - "семей", - ], - "train_services": [ - "train", - "ticket", - "pilet", - "elron", - "tagastamine", - "refund", - "поезд", - "билет", - "возврат", - "отправлени", - "минут", - "расписани", - "sõiduplaan", - "teated", - "уведомлени", - ], - "health_cooperation": [ - "health", - "cooperation", - "karmen", - "joller", - "ukraine", - "misinformation", - "здравоохранени", - "сотрудничеств", - "соглашени", - "украин", - "дезинформаци", - "tervis", - "koostöö", - "leping", - "innovation", - "инноваци", - ], - "contact_information": [ - "ministry", - "contact", - "ministeerium", - "newsletter", - "uudiskiri", - "министерств", - "контакт", - "социальн", - "данные", - "адрес", - ], - } - - # Find matching topics - matching_topics: list[str] = [] - for topic, keywords in topic_keywords.items(): - if any(keyword in query_lower for keyword in keywords): - matching_topics.append(topic) - - # Get contexts from matching topics - for topic in matching_topics: - if topic in self.knowledge_base: - relevant_contexts.extend(self.knowledge_base[topic]) - - # If no specific match, return some 
general contexts - if not relevant_contexts: - relevant_contexts = ( - self.knowledge_base["pension"][:2] - + self.knowledge_base["family_benefits"][:1] - ) - - # Remove duplicates while preserving order - seen: set[str] = set() - unique_contexts: list[str] = [] - for context in relevant_contexts: - if context not in seen: - seen.add(context) - unique_contexts.append(context) - - return unique_contexts[:top_k] - - -class DummyLLMOrchestrator: - """Main orchestrator that handles the complete RAG pipeline.""" - - def __init__(self, provider: str = "anthropic"): - self.provider = provider - self.retriever = MockQdrantRetriever() - - if provider == "anthropic": - self.client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) - else: - raise ValueError("Provider must be 'anthropic' or 'openai'") - - def _generate_with_anthropic(self, prompt: str) -> str: - """Generate response using Anthropic Claude.""" - try: - response = self.client.messages.create( - model="claude-3-7-sonnet-20250219", - max_tokens=1024, - temperature=0.7, - messages=[{"role": "user", "content": prompt}], - ) - text: str = response.content[0].text - return text - except Exception as e: - return f"Error generating response with Anthropic: {str(e)}" - - def _mock_nvidia_nemo_guardrail(self, response: str) -> bool: - """Mock NVIDIA NeMO output guardrail check.""" - # Simple mock: reject responses that are too short or contain error messages - if len(response) < 10 or "error" in response.lower(): - return False - return True - - def generate_response( - self, question: str, include_contexts: bool = False - ) -> Dict[str, Any]: - """ - Complete RAG pipeline: retrieve contexts and generate response. 
- - Args: - question: User's question - include_contexts: Whether to include retrieval contexts in response - - Returns: - Dictionary containing response and optionally contexts - """ - # Step 1: Retrieve contexts using hybrid search - contexts = self.retriever.retrieve(question, top_k=3) - - # Step 2: Construct prompt with retrieved contexts - context_text = "\n\n".join(contexts) - prompt = f"""Based on the following context information, please answer the question accurately and helpfully. - -Context: -{context_text} - -Question: {question} - -Answer:""" - - # Step 3: Generate response with LLMs - max_attempts = 2 - response: str = "" - for attempt in range(max_attempts): - if self.provider == "anthropic": - response: str = self._generate_with_anthropic(prompt) - else: - response: str = "Unsupported provider." - # Step 4: Check with NVIDIA NeMO guardrail - if self._mock_nvidia_nemo_guardrail(response): - break - elif attempt == max_attempts - 1: - response = ( - "I'm sorry, I cannot provide a suitable response at this time." - ) - - result: dict[str, str | list[str]] = {"response": response} - if include_contexts: - result["retrieval_context"] = contexts - - return result - - -# API endpoint functions for testing -def create_llm_orchestrator(provider: str = "anthropic") -> DummyLLMOrchestrator: - """Factory function to create LLM orchestrator.""" - return DummyLLMOrchestrator(provider) - - -def process_query( - question: str, provider: str = "anthropic", include_contexts: bool = False -) -> Dict[str, Any]: - """ - Process a single query through the RAG pipeline. 
- - Args: - question: User's question - provider: LLM provider ('anthropic' or 'openai') - include_contexts: Whether to include retrieval contexts - - Returns: - Dictionary with response and optionally contexts - """ - orchestrator = create_llm_orchestrator(provider) - return orchestrator.generate_response(question, include_contexts) diff --git a/uv.lock b/uv.lock index f662ff5..d673236 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = "==3.12.10" [[package]] @@ -170,6 +170,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/83/7b/5652771e24fff12da9dde4c20ecf4682e606b104f26419d139758cc935a6/azure_identity-1.25.1-py3-none-any.whl", hash = "sha256:e9edd720af03dff020223cd269fa3a61e8f345ea75443858273bcb44844ab651", size = 191317, upload-time = "2025-10-06T20:30:04.251Z" }, ] +[[package]] +name = "azure-storage-blob" +version = "12.27.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "azure-core" }, + { name = "cryptography" }, + { name = "isodate" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/36/7c/2fd872e11a88163f208b9c92de273bf64bb22d0eef9048cc6284d128a77a/azure_storage_blob-12.27.1.tar.gz", hash = "sha256:a1596cc4daf5dac9be115fcb5db67245eae894cf40e4248243754261f7b674a6", size = 597579, upload-time = "2025-10-29T12:27:16.185Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3d/9e/1c90a122ea6180e8c72eb7294adc92531b0e08eb3d2324c2ba70d37f4802/azure_storage_blob-12.27.1-py3-none-any.whl", hash = "sha256:65d1e25a4628b7b6acd20ff7902d8da5b4fde8e46e19c8f6d213a3abc3ece272", size = 428954, upload-time = "2025-10-29T12:27:18.072Z" }, +] + [[package]] name = "backoff" version = "2.2.1" @@ -923,6 +938,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = 
"sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, ] +[[package]] +name = "isodate" +version = "0.7.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/54/4d/e940025e2ce31a8ce1202635910747e5a87cc3a6a6bb2d00973375014749/isodate-0.7.2.tar.gz", hash = "sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6", size = 29705, upload-time = "2024-10-08T23:04:11.5Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15", size = 22320, upload-time = "2024-10-08T23:04:09.501Z" }, +] + [[package]] name = "jinja2" version = "3.1.6" @@ -2118,11 +2142,13 @@ source = { virtual = "." } dependencies = [ { name = "anthropic" }, { name = "azure-identity" }, + { name = "azure-storage-blob" }, { name = "boto3" }, { name = "deepeval" }, { name = "deepteam" }, { name = "dspy" }, { name = "fastapi" }, + { name = "httpx" }, { name = "hvac" }, { name = "langfuse" }, { name = "loguru" }, @@ -2133,6 +2159,7 @@ dependencies = [ { name = "pydantic" }, { name = "pyright" }, { name = "pytest" }, + { name = "pytest-asyncio" }, { name = "pytest-json-report" }, { name = "python-dotenv" }, { name = "pyyaml" }, @@ -2150,11 +2177,13 @@ dependencies = [ requires-dist = [ { name = "anthropic", specifier = ">=0.69.0" }, { name = "azure-identity", specifier = ">=1.24.0" }, + { name = "azure-storage-blob", specifier = ">=12.27.1" }, { name = "boto3", specifier = ">=1.40.25" }, { name = "deepeval", specifier = ">=3.6.0" }, { name = "deepteam", specifier = ">=0.2.5" }, { name = "dspy", specifier = ">=3.0.3" }, { name = "fastapi", specifier = ">=0.116.1" }, + { name = "httpx", specifier = ">=0.28.1" }, { name = "hvac", specifier = ">=2.3.0" }, { name = "langfuse", 
specifier = ">=3.8.1" }, { name = "loguru", specifier = ">=0.7.3" }, @@ -2165,6 +2194,7 @@ requires-dist = [ { name = "pydantic", specifier = ">=2.11.7" }, { name = "pyright", specifier = ">=1.1.407" }, { name = "pytest", specifier = ">=8.4.1" }, + { name = "pytest-asyncio", specifier = ">=1.2.0" }, { name = "pytest-json-report", specifier = ">=1.5.0" }, { name = "python-dotenv", specifier = ">=1.1.1" }, { name = "pyyaml", specifier = ">=6.0.2" }, From fd0189604cfefb4209edbe948f54f6287f06348b Mon Sep 17 00:00:00 2001 From: ckittask Date: Wed, 3 Dec 2025 11:40:04 +0200 Subject: [PATCH 2/7] add branch name to deepeval test --- .github/workflows/deepeval-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deepeval-tests.yml b/.github/workflows/deepeval-tests.yml index 22fef17..53a6b70 100644 --- a/.github/workflows/deepeval-tests.yml +++ b/.github/workflows/deepeval-tests.yml @@ -3,7 +3,7 @@ name: DeepEval RAG System Tests on: pull_request: types: [opened, synchronize, reopened] - branches: ["rag-33-debug", "RAG-33-31okt"] + branches: ["rag-33-debug", "RAG-33-31okt", "wip_3_12"] paths: - 'src/**' - 'tests/**' From f7ff8e07f9292207a0d8d92698d964c17f577cf6 Mon Sep 17 00:00:00 2001 From: ckittask Date: Wed, 3 Dec 2025 11:53:05 +0200 Subject: [PATCH 3/7] test vault files --- test-vault/agents/llm/agent.hcl | 45 +++++++++++++++++++++++++++++++++ test-vault/agents/llm/role_id | 0 test-vault/agents/llm/secret_id | 0 3 files changed, 45 insertions(+) create mode 100644 test-vault/agents/llm/agent.hcl create mode 100644 test-vault/agents/llm/role_id create mode 100644 test-vault/agents/llm/secret_id diff --git a/test-vault/agents/llm/agent.hcl b/test-vault/agents/llm/agent.hcl new file mode 100644 index 0000000..9883bfe --- /dev/null +++ b/test-vault/agents/llm/agent.hcl @@ -0,0 +1,45 @@ +vault { + # Inside Docker network, the service name "vault" resolves to the dev Vault + address = "http://vault:8200" +} + +pid_file = 
"/agent/out/pidfile" + +auto_auth { + method "approle" { + mount_path = "auth/approle" + config = { + role_id_file_path = "/agent/in/role_id" + secret_id_file_path = "/agent/in/secret_id" + remove_secret_id_file_after_reading = false # test-friendly + } + } + + sink "file" { + config = { + path = "/agent/out/token" + } + } +} + +# In-memory cache (free, no Enterprise license) +cache { + default_lease_duration = "1h" +} + +# Listener is required for Agent’s internal servers (not exposed) +listener "tcp" { + address = "127.0.0.1:8201" + tls_disable = true +} + +# dummy template so cache is “active” (some versions require this) +template { + source = "/dev/null" + destination = "/agent/out/dummy" +} + +# Disable API proxy; not needed here +api_proxy { + disable = true +} \ No newline at end of file diff --git a/test-vault/agents/llm/role_id b/test-vault/agents/llm/role_id new file mode 100644 index 0000000..e69de29 diff --git a/test-vault/agents/llm/secret_id b/test-vault/agents/llm/secret_id new file mode 100644 index 0000000..e69de29 From 84a9342c3f348d1558889089168c09ed5d3f9130 Mon Sep 17 00:00:00 2001 From: ckittask Date: Wed, 3 Dec 2025 11:59:28 +0200 Subject: [PATCH 4/7] removed variable from response --- src/llm_orchestration_service_api.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llm_orchestration_service_api.py b/src/llm_orchestration_service_api.py index 30a703a..be8f0c7 100644 --- a/src/llm_orchestration_service_api.py +++ b/src/llm_orchestration_service_api.py @@ -777,7 +777,6 @@ def orchestrate_llm_request_eval( inputGuardFailed=response.inputGuardFailed, content=response.content, retrieval_context=response.retrieval_context, - refined_questions=response.refined_questions, expected_output=None, # Will be populated by test framework ) From df83e69e89512ffe98c2b472ee4676b4aee05605 Mon Sep 17 00:00:00 2001 From: ckittask Date: Wed, 3 Dec 2025 12:12:51 +0200 Subject: [PATCH 5/7] full error message display for deepeval --- 
tests/deepeval_tests/red_team_report_generator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/deepeval_tests/red_team_report_generator.py b/tests/deepeval_tests/red_team_report_generator.py index 77249ab..ed154f4 100644 --- a/tests/deepeval_tests/red_team_report_generator.py +++ b/tests/deepeval_tests/red_team_report_generator.py @@ -208,8 +208,7 @@ def generate_failed_attacks_analysis(results: Dict[str, Any]) -> str: ) error = failure.get("error", "Test failed") - # Truncate long error messages - error_preview = error[:60] + "..." if len(error) > 60 else error + error_preview = error analysis += f"| {short_test_name} | {attack_type} | {vulnerability} | {language} | FAILED | {error_preview} |\n" From e00c1ae3aad29af8ef6f6bef05f2336f7283b5b6 Mon Sep 17 00:00:00 2001 From: ckittask Date: Wed, 3 Dec 2025 12:22:16 +0200 Subject: [PATCH 6/7] import os fix --- src/llm_orchestration_service_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llm_orchestration_service_api.py b/src/llm_orchestration_service_api.py index be8f0c7..2f7edee 100644 --- a/src/llm_orchestration_service_api.py +++ b/src/llm_orchestration_service_api.py @@ -1,5 +1,5 @@ """LLM Orchestration Service API - FastAPI application.""" - +import os from contextlib import asynccontextmanager from typing import Any, AsyncGenerator, Dict From 965dab63af96e7fd2fd92f1db776a656a16b9885 Mon Sep 17 00:00:00 2001 From: ckittask Date: Mon, 5 Jan 2026 12:04:25 +0200 Subject: [PATCH 7/7] update with eval docker compose --- .github/workflows/deepeval-tests.yml | 4 +- .github/workflows/deepteam-red-team-tests.yml | 4 +- .gitleaks.toml | 2 +- docker-compose-eval.yml | 290 ++++++++++++++++++ tests/conftest.py | 2 +- 5 files changed, 296 insertions(+), 6 deletions(-) create mode 100644 docker-compose-eval.yml diff --git a/.github/workflows/deepeval-tests.yml b/.github/workflows/deepeval-tests.yml index 53a6b70..7c33ad8 100644 --- a/.github/workflows/deepeval-tests.yml 
+++ b/.github/workflows/deepeval-tests.yml @@ -8,7 +8,7 @@ on: - 'src/**' - 'tests/**' - 'data/**' - - 'docker-compose-test.yml' + - 'docker-compose-eval.yml' - 'Dockerfile.llm_orchestration_service' - '.github/workflows/deepeval-tests.yml' @@ -299,5 +299,5 @@ jobs: - name: Cleanup Docker resources if: always() run: | - docker compose -f docker-compose-test.yml down -v --remove-orphans || true + docker compose -f docker-compose-eval.yml down -v --remove-orphans || true docker system prune -f || true \ No newline at end of file diff --git a/.github/workflows/deepteam-red-team-tests.yml b/.github/workflows/deepteam-red-team-tests.yml index 3c4d558..d51d12e 100644 --- a/.github/workflows/deepteam-red-team-tests.yml +++ b/.github/workflows/deepteam-red-team-tests.yml @@ -7,7 +7,7 @@ on: - 'src/**' - 'tests/**' - 'data/**' - - 'docker-compose-test.yml' + - 'docker-compose-eval.yml' - 'Dockerfile.llm_orchestration_service' - '.github/workflows/deepeval-red-team-tests.yml' workflow_dispatch: @@ -336,5 +336,5 @@ jobs: - name: Cleanup Docker resources if: always() run: | - docker compose -f docker-compose-test.yml down -v --remove-orphans || true + docker compose -f docker-compose-eval.yml down -v --remove-orphans || true docker system prune -f || true \ No newline at end of file diff --git a/.gitleaks.toml b/.gitleaks.toml index 87311e3..2960de6 100644 --- a/.gitleaks.toml +++ b/.gitleaks.toml @@ -1,4 +1,4 @@ [allowlist] paths = [ - '''docker-compose-test\.yml''' + '''docker-compose-eval\.yml''' ] \ No newline at end of file diff --git a/docker-compose-eval.yml b/docker-compose-eval.yml new file mode 100644 index 0000000..2c6aadf --- /dev/null +++ b/docker-compose-eval.yml @@ -0,0 +1,290 @@ +services: + # === Core Infrastructure === + + # Shared PostgreSQL database (used by both application and Langfuse) + rag_search_db: + image: postgres:14.1 + container_name: rag_search_db + restart: always + environment: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: dbadmin + 
POSTGRES_DB: rag-search + volumes: + - test_rag_search_db:/var/lib/postgresql/data + ports: + - "5436:5432" + networks: + - test-network + + # Vector database for RAG + qdrant: + image: qdrant/qdrant:v1.15.1 + container_name: qdrant + restart: always + ports: + - "6333:6333" + - "6334:6334" + volumes: + - test_qdrant_data:/qdrant/storage + networks: + - test-network + + # === Secret Management === + + # Vault - Secret management (dev mode) + vault: + image: hashicorp/vault:1.20.3 + container_name: vault + cap_add: + - IPC_LOCK + ports: + - "8200:8200" + environment: + VAULT_DEV_ROOT_TOKEN_ID: root + VAULT_ADDR: http://0.0.0.0:8200 + VAULT_API_ADDR: http://0.0.0.0:8200 + command: server -dev -dev-listen-address=0.0.0.0:8200 + networks: + - test-network + + # Vault Agent - Automatic token management via AppRole + vault-agent-llm: + image: hashicorp/vault:1.20.3 + container_name: vault-agent-llm + depends_on: + - vault + volumes: + - ./test-vault/agents/llm:/agent/in + - ./test-vault/agent-out:/agent/out + entrypoint: ["sh", "-c"] + command: + - | + # Wait for Vault to be ready + sleep 5 + echo "Waiting for AppRole credentials..." + while [ ! -f /agent/in/role_id ] || [ ! -s /agent/in/role_id ]; do + sleep 1 + done + while [ ! -f /agent/in/secret_id ] || [ ! -s /agent/in/secret_id ]; do + sleep 1 + done + echo "Credentials found, starting Vault Agent..." 
+ exec vault agent -config=/agent/in/agent.hcl -log-level=debug + networks: + - test-network + + # === Langfuse Observability Stack === + + # Redis - Queue and cache for Langfuse + redis: + image: redis:7 + container_name: redis + restart: always + command: --requirepass myredissecret + ports: + - "127.0.0.1:6379:6379" + networks: + - test-network + + # MinIO - S3-compatible storage for Langfuse + minio: + image: minio/minio:latest + container_name: minio + restart: always + entrypoint: sh + command: -c "mkdir -p /data/langfuse && minio server /data --address ':9000' --console-address ':9001'" + environment: + MINIO_ROOT_USER: minio + MINIO_ROOT_PASSWORD: miniosecret + ports: + - "9090:9000" + - "127.0.0.1:9091:9001" + volumes: + - test_minio_data:/data + networks: + - test-network + + # ClickHouse - Analytics database for Langfuse (REQUIRED in v3) + clickhouse: + image: clickhouse/clickhouse-server:24.3 + container_name: clickhouse + restart: always + environment: + CLICKHOUSE_DB: default + CLICKHOUSE_USER: default + CLICKHOUSE_PASSWORD: clickhouse + volumes: + - test_clickhouse_data:/var/lib/clickhouse + ports: + - "127.0.0.1:8123:8123" + - "127.0.0.1:9000:9000" + networks: + - test-network + ulimits: + nofile: + soft: 262144 + hard: 262144 + + # Langfuse Worker - Background job processor + langfuse-worker: + image: langfuse/langfuse-worker:3 + container_name: langfuse-worker + restart: always + depends_on: + - rag_search_db + - minio + - redis + - clickhouse + ports: + - "127.0.0.1:3030:3030" + environment: + # Database + DATABASE_URL: postgresql://postgres:dbadmin@rag_search_db:5432/rag-search + + # Auth & Security (TEST VALUES ONLY - NOT FOR PRODUCTION) + # gitleaks:allow - These are test-only hex strings + NEXTAUTH_URL: http://localhost:3000 + SALT: ef9d6c6f8b4a5e2c1d3f7a9b8c5e4d2a1f6b8c9d4e5f7a8b1c2d3e4f5a6b7c8d + ENCRYPTION_KEY: 1a2b3c4d5e6f7a8b9c0d1e2f3a4b5c6d7e8f9a0b1c2d3e4f5a6b7c8d9e0f1a2b + + # Features + TELEMETRY_ENABLED: "false" + 
LANGFUSE_ENABLE_EXPERIMENTAL_FEATURES: "false" + + # ClickHouse (REQUIRED for Langfuse v3) + CLICKHOUSE_MIGRATION_URL: clickhouse://clickhouse:9000/default + CLICKHOUSE_URL: http://clickhouse:8123 + CLICKHOUSE_USER: default + CLICKHOUSE_PASSWORD: clickhouse + CLICKHOUSE_CLUSTER_ENABLED: "false" + + # S3/MinIO Event Upload + LANGFUSE_S3_EVENT_UPLOAD_BUCKET: langfuse + LANGFUSE_S3_EVENT_UPLOAD_REGION: us-east-1 + LANGFUSE_S3_EVENT_UPLOAD_ACCESS_KEY_ID: minio + LANGFUSE_S3_EVENT_UPLOAD_SECRET_ACCESS_KEY: miniosecret + LANGFUSE_S3_EVENT_UPLOAD_ENDPOINT: http://minio:9000 + LANGFUSE_S3_EVENT_UPLOAD_FORCE_PATH_STYLE: "true" + + # S3/MinIO Media Upload + LANGFUSE_S3_MEDIA_UPLOAD_BUCKET: langfuse + LANGFUSE_S3_MEDIA_UPLOAD_REGION: us-east-1 + LANGFUSE_S3_MEDIA_UPLOAD_ACCESS_KEY_ID: minio + LANGFUSE_S3_MEDIA_UPLOAD_SECRET_ACCESS_KEY: miniosecret + LANGFUSE_S3_MEDIA_UPLOAD_ENDPOINT: http://minio:9000 + LANGFUSE_S3_MEDIA_UPLOAD_FORCE_PATH_STYLE: "true" + + # Redis + REDIS_HOST: redis + REDIS_PORT: "6379" + REDIS_AUTH: myredissecret + networks: + - test-network + + # Langfuse Web - UI and API + langfuse-web: + image: langfuse/langfuse:3 + container_name: langfuse-web + restart: always + depends_on: + - langfuse-worker + - rag_search_db + - clickhouse + ports: + - "3000:3000" + environment: + # Database + DATABASE_URL: postgresql://postgres:dbadmin@rag_search_db:5432/rag-search + + # Auth & Security (TEST VALUES ONLY - NOT FOR PRODUCTION) + # gitleaks:allow - These are test-only hex strings + NEXTAUTH_URL: http://localhost:3000 + NEXTAUTH_SECRET: 9f8e7d6c5b4a3f2e1d0c9b8a7f6e5d4c3b2a1f0e9d8c7b6a5f4e3d2c1b0a9f8e + SALT: ef9d6c6f8b4a5e2c1d3f7a9b8c5e4d2a1f6b8c9d4e5f7a8b1c2d3e4f5a6b7c8d + ENCRYPTION_KEY: 1a2b3c4d5e6f7a8b9c0d1e2f3a4b5c6d7e8f9a0b1c2d3e4f5a6b7c8d9e0f1a2b + + # Features + TELEMETRY_ENABLED: "false" + LANGFUSE_ENABLE_EXPERIMENTAL_FEATURES: "false" + + # ClickHouse (REQUIRED for Langfuse v3) + CLICKHOUSE_MIGRATION_URL: clickhouse://clickhouse:9000/default + 
CLICKHOUSE_URL: http://clickhouse:8123 + CLICKHOUSE_USER: default + CLICKHOUSE_PASSWORD: clickhouse + CLICKHOUSE_CLUSTER_ENABLED: "false" + + # S3/MinIO Event Upload + LANGFUSE_S3_EVENT_UPLOAD_BUCKET: langfuse + LANGFUSE_S3_EVENT_UPLOAD_REGION: us-east-1 + LANGFUSE_S3_EVENT_UPLOAD_ACCESS_KEY_ID: minio + LANGFUSE_S3_EVENT_UPLOAD_SECRET_ACCESS_KEY: miniosecret + LANGFUSE_S3_EVENT_UPLOAD_ENDPOINT: http://minio:9000 + LANGFUSE_S3_EVENT_UPLOAD_FORCE_PATH_STYLE: "true" + + # S3/MinIO Media Upload + LANGFUSE_S3_MEDIA_UPLOAD_BUCKET: langfuse + LANGFUSE_S3_MEDIA_UPLOAD_REGION: us-east-1 + LANGFUSE_S3_MEDIA_UPLOAD_ACCESS_KEY_ID: minio + LANGFUSE_S3_MEDIA_UPLOAD_SECRET_ACCESS_KEY: miniosecret + LANGFUSE_S3_MEDIA_UPLOAD_ENDPOINT: http://minio:9000 + LANGFUSE_S3_MEDIA_UPLOAD_FORCE_PATH_STYLE: "true" + + # Redis + REDIS_HOST: redis + REDIS_PORT: "6379" + REDIS_AUTH: myredissecret + + # Initialize test project with known credentials + LANGFUSE_INIT_PROJECT_PUBLIC_KEY: pk-lf-test + LANGFUSE_INIT_PROJECT_SECRET_KEY: sk-lf-test + networks: + - test-network + + # === LLM Orchestration Service === + + llm-orchestration-service: + build: + context: . 
+ dockerfile: Dockerfile.llm_orchestration_service + container_name: llm-orchestration-service + restart: always + ports: + - "8100:8100" + environment: + - VAULT_ADDR=http://vault:8200 + - VAULT_TOKEN_FILE=/agent/out/token + - QDRANT_URL=http://qdrant:6333 + - EVAL_MODE=true + volumes: + - ./src/llm_config_module/config:/app/src/llm_config_module/config:ro + - ./test-vault/agent-out:/agent/out:ro + - test_llm_orchestration_logs:/app/logs + depends_on: + - qdrant + - langfuse-web + - vault-agent-llm + networks: + - test-network + +# === Networks === + +networks: + test-network: + name: test-network + driver: bridge + +# === Volumes === + +volumes: + test_rag_search_db: + name: test_rag_search_db + test_qdrant_data: + name: test_qdrant_data + test_minio_data: + name: test_minio_data + test_clickhouse_data: + name: test_clickhouse_data + test_llm_orchestration_logs: + name: test_llm_orchestration_logs \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index c589376..b67d114 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -129,7 +129,7 @@ def get_secret(self, path: str) -> dict: class RAGStackTestContainers: """Manages test containers for RAG stack including Vault, Qdrant, Langfuse, and LLM orchestration service""" - def __init__(self, compose_file_name: str = "docker-compose-test.yml"): + def __init__(self, compose_file_name: str = "docker-compose-eval.yml"): self.project_root = Path(__file__).parent.parent self.compose_file_path = self.project_root / compose_file_name self.compose: Optional[DockerCompose] = None