From 3dc08d1600f288bc287ba4e550ebbb7d9a2609fb Mon Sep 17 00:00:00 2001 From: Rafa Date: Mon, 16 Feb 2026 12:33:27 -0500 Subject: [PATCH 1/8] docs(feat): add CSS styles for editorial annotations Add three new CSS classes for inline documentation annotations: - .internal-note: Purple styling for internal development notes - .vapi-validation: Orange styling for questions requiring VAPI validation - .claude-note: Green styling for implementation guidance notes Each class includes automatic label injection via ::before pseudo-elements and dark mode variants for readability. --- fern/assets/styles.css | 68 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/fern/assets/styles.css b/fern/assets/styles.css index 8f1df1b39..de8f6b3fa 100644 --- a/fern/assets/styles.css +++ b/fern/assets/styles.css @@ -282,4 +282,72 @@ html.dark button[data-highlighted] .fern-api-property-meta { /* Fix: Make subtitle white on Simulations pages in dark mode */ :is(.dark) [id*="simulations"] .prose-p\:text-\(color\:--grayscale-a11\) :where(p):not(:where([class~=not-prose],[class~=not-prose] *)) { color: var(--grayscale-12) !important; +} + +/* Internal Note Styles */ +.internal-note { + display: inline-block; + background-color: rgb(209, 184, 213) !important; + color: #6a1b9a !important; + border-radius: 3px; + font-size: 0.9em; + padding: 2px 6px; + margin: 2px 0; +} + +.internal-note::before { + content: "[INTERNAL NOTE] "; + font-weight: 600; +} + +.internal-note.todo::before { + content: "[TODO] "; +} + +.internal-note.assumption::before { + content: "[ASSUMPTION] "; +} + +/* Dark mode: Keep same colors for visibility */ +:is(.dark) .internal-note { + background-color: rgb(209, 184, 213) !important; + color: #6a1b9a !important; +} + +/* VAPI Validation Styles */ +.vapi-validation { + display: inline-block; + background-color: #deedab !important; + color: #341a04 !important; + border-radius: 3px; + font-size: 0.9em; + padding: 2px 6px; + margin: 2px 0; +} + +.vapi-validation::before { + content: "[VAPI VALIDATION NEEDED] "; + font-weight: 600; +} + +/* Claude Validation Styles */ +.claude-note { + display: inline-block; + background-color: #cdefc1 !important; + color: #1e0485 !important; + border-radius: 3px; + font-size: 0.9em; + padding: 2px 6px; + margin: 2px 0; +} + +.claude-note::before { + content: "[NOTES FOR CLAUDE] "; + font-weight: 600; +} + +/* Dark mode: Maintain legibility */ +:is(.dark) .vapi-validation { + background-color: #856404 !important; + color: #fff3cd !important; } \ No newline at end of file From f29547daa7275e64e2cf2d3086a153a67fc77c3c Mon Sep 17 00:00:00 2001 From: Rafa Date: Mon, 16 Feb 2026 12:33:36 -0500 Subject: [PATCH 2/8] docs(feat): add observability guides navigation section Add new "Guides" section under Observability with 7 pages: - Framework (observability-framework.mdx, renamed from overview) - Instrumentation - Testing strategies - Extraction patterns - Monitoring & Operating - Optimization workflows - Production readiness Removed Integration Limitations page from navigation. 
--- fern/docs.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/fern/docs.yml b/fern/docs.yml index f3ffd0cba..a2c247ab6 100644 --- a/fern/docs.yml +++ b/fern/docs.yml @@ -285,6 +285,30 @@ navigation: - section: Observability contents: + - section: Guides + icon: fa-light fa-book + contents: + - page: Framework + path: observability/observability-framework.mdx + icon: fa-light fa-book-open + - page: Instrumentation + path: observability/instrumentation.mdx + icon: fa-light fa-wrench + - page: Testing strategies + path: observability/testing-strategies.mdx + icon: fa-light fa-vial + - page: Extraction patterns + path: observability/extraction-patterns.mdx + icon: fa-light fa-diagram-project + - page: Monitoring + path: observability/monitoring.mdx + icon: fa-light fa-chart-line + - page: Optimization workflows + path: observability/optimization-workflows.mdx + icon: fa-light fa-arrow-trend-up + - page: Production readiness + path: observability/production-readiness.mdx + icon: fa-light fa-check-circle - section: Evals icon: fa-light fa-clipboard-check contents: From d88c7b34f06cd713f92489a72b1c993a36a5d431 Mon Sep 17 00:00:00 2001 From: Rafa Date: Mon, 16 Feb 2026 12:33:47 -0500 Subject: [PATCH 3/8] docs(feat): add observability framework guide MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add top-level framework guide introducing the observability maturity model for voice AI assistants (renamed from overview.mdx). Key sections: - What is observability for voice AI - Five-stage maturity model (INSTRUMENT → TEST → EXTRACT → MONITOR → OPTIMIZE) - Stage descriptions with tool mapping - Progressive adoption guidance - Cross-stage workflow examples Includes VAPI validation questions for framing and terminology. --- .../observability/observability-framework.mdx | 222 ++++++++++++++++++ 1 file changed, 222 insertions(+) create mode 100644 fern/observability/observability-framework.mdx diff --git a/fern/observability/observability-framework.mdx b/fern/observability/observability-framework.mdx new file mode 100644 index 000000000..aa6a4c4d4 --- /dev/null +++ b/fern/observability/observability-framework.mdx @@ -0,0 +1,222 @@ +--- +title: Observability framework +subtitle: A systematic framework for building, testing, and improving voice AI assistants +slug: observability/framework +--- + +## What is observability for voice AI? + +Observability for voice AI means **instrumenting your assistants to capture data**, **testing them before production**, **extracting insights from calls**, **monitoring operational health**, and **using that data to continuously improve**. + +Unlike traditional software observability (logs, metrics, traces), voice AI observability must account for: + +- **Conversational unpredictability** — Users say unexpected things, conversations diverge +- **Multi-system complexity** — Speech recognition, language models, voice synthesis, telephony all working together +- **Quality is subjective** — "Good" voice interactions are harder to measure than HTTP response codes +- **Production is expensive** — Every production call costs money; finding bugs in production is costly + +**The challenge**: How do you know your voice assistant works correctly before deploying it? How do you detect problems in production? How do you improve based on real-world performance? + +**The solution**: A systematic observability strategy that moves you from "deploy and hope" to "test, monitor, and optimize." 
+ +Does this framing resonate with how VAPI thinks about observability? Are there other unique challenges for voice AI observability we should highlight? + +--- + +## Who should use this framework? + +This framework is for teams who: +- Are building production voice AI assistants +- Want to **test before deploying** (not debug in production) +- Need to **prove quality** to stakeholders or customers +- Are scaling from prototype to production +- Want systematic continuous improvement + +If you're just experimenting or building a demo, you might not need the full framework yet - start with [Evals quickstart](/observability/evals-quickstart). + +--- + +## The observability maturity model + +Vapi's observability tools support a 5-stage progression: + +``` +┌──────────────────────────────────────────────────────────────────┐ +│ │ +│ INSTRUMENT → TEST → EXTRACT → MONITOR → OPTIMIZE │ +│ ↑ │ │ +│ │ │ │ +│ └──────────────── feedback loop ───────────────┘ │ +│ │ +└──────────────────────────────────────────────────────────────────┘ +``` + + + **Note**: This ASCII diagram will be replaced with a visual diagram in a future update. + + +### This is a maturity progression, not a linear checklist + +You don't complete one stage and never return to it. Observability is **continuous**: + +- **Instrument** as you build new features +- **Test** after every change +- **Extract** based on evolving analytics needs +- **Monitor** production constantly +- **Optimize** based on what monitoring reveals → loop back to **Instrument** improvements + +**For teams just starting**: Begin with INSTRUMENT + TEST (validate before production). Add EXTRACT + MONITOR as you scale. OPTIMIZE becomes natural once you have data flowing. + +**For experienced teams**: You're likely already monitoring production. This framework helps systematize pre-production testing (TEST stage) and formalize continuous improvement (OPTIMIZE stage). + +Is "maturity model" the right framing? Should we emphasize iteration more explicitly? How do customer segments (startups vs enterprises) typically progress through these stages? + +--- + +## How this framework maps to Vapi tools + +Each stage uses specific Vapi features. Here's a quick reference: + +### Stage 1: INSTRUMENT + +Configure your assistant to capture operational and business metrics. + +**What you'll use**: Built-in Instrumentation, Structured Outputs, Call Analysis + +→ **[Deep dive: Instrumentation guide](/observability/instrumentation)** + +--- + +### Stage 2: TEST + +Validate your assistant works correctly before production deployment. + +**What you'll use**: Evals, Simulations, Test Suites + +→ **[Deep dive: Testing strategies](/observability/testing-strategies)** + +--- + +### Stage 3: EXTRACT + +Choose your data extraction pattern based on technical capability and analytics needs. + +**What you'll use**: Boards, Scorecards, Insights API, Analytics API, Webhooks, Langfuse + +→ **[Deep dive: Extraction patterns](/observability/extraction-patterns)** + +--- + +### Stage 4: MONITOR + +Visualize trends, track operational health, and catch problems early. + +**What you'll use**: Boards, Insights API, Analytics API + +→ **[Deep dive: Monitoring guide](/observability/monitoring)** + +--- + +### Stage 5: OPTIMIZE + +Use observability data to continuously improve your assistant. 
+ +**What you'll use**: Iterative workflow across all stages + +→ **[Deep dive: Optimization workflows](/observability/optimization-workflows)** + +--- + +## Choosing your observability strategy + +### Start simple, scale systematically + +**If you're just getting started**: +1. **INSTRUMENT** with basic Structured Outputs (scalar fields only) +2. **TEST** with Evals (fast, cheap regression testing) +3. Use **Dashboard Native** extraction pattern (Boards for monitoring) +4. **OPTIMIZE** based on what Boards shows you + +**As you scale**: +- Add Simulations for realistic pre-production testing +- Migrate to **Hybrid** pattern (Boards + webhooks) +- Implement programmatic alerting via Insights API +- Build automated regression suites in CI/CD + +**For enterprises**: +- Design comprehensive schemas (domain-segmented) +- Use **Webhook-to-External** pattern (export to data warehouse) +- Integrate with existing BI tools (Tableau, PowerBI) +- Implement observability-driven development workflows + +### Common migration paths + +``` +Dashboard Native → Hybrid → Webhook-to-External +``` + +Most teams start with Dashboard Native (simple, no engineering required), add webhooks for specific analytics needs (Hybrid), and eventually move to full external integration (Webhook-to-External) as observability maturity increases. + + + **Don't over-engineer early**. Start with the simplest pattern that meets your needs. You can always add complexity later. Premature optimization creates maintenance burden. + + +--- + +## Next steps + +### Learn the framework stages + + + + Stage 1: Configure data capture + + + + Stage 2: Validate before production + + + + Stage 3: Choose your data pipeline + + + + Stage 4: Track operational health + + + + Stage 5: Continuously improve + + + +### Supporting resources + + + + Validate you're ready to deploy + + From e96b0443eaa07dfb58975735705d5eb95b865565 Mon Sep 17 00:00:00 2001 From: Rafa Date: Mon, 16 Feb 2026 12:33:59 -0500 Subject: [PATCH 4/8] docs(feat): add instrumentation and testing strategy guides Add two guides covering the "build & validate" stages: **Instrumentation guide:** - Built-in vs custom instrumentation concepts - Purpose and intended outcomes for each type - Tools at a glance (Built-in, Structured Outputs, Call Analysis) - When to use each instrumentation approach **Testing strategies guide:** - Voice AI testing challenges - Tools comparison (Evals vs Simulations vs Test Suites) - Testing pyramid for voice AI - Recommended hybrid testing strategy Both pages use skeleton format with full prose intros and placeholder sections for detailed content, pending VAPI validation. --- fern/observability/instrumentation.mdx | 291 ++++++++++++++++++++++ fern/observability/testing-strategies.mdx | 177 +++++++++++++ 2 files changed, 468 insertions(+) create mode 100644 fern/observability/instrumentation.mdx create mode 100644 fern/observability/testing-strategies.mdx diff --git a/fern/observability/instrumentation.mdx b/fern/observability/instrumentation.mdx new file mode 100644 index 000000000..9f7863ca2 --- /dev/null +++ b/fern/observability/instrumentation.mdx @@ -0,0 +1,291 @@ +--- +title: Instrumentation +subtitle: Configure your assistant to capture operational and business metrics +slug: observability/instrumentation +--- + +## What is instrumentation? + +**Instrumentation** means configuring your assistant to capture data during and after calls. Before you can test, monitor, or optimize, you need to define **what data to capture**. 
+ +## Two types of instrumentation + +Like AWS provides basic server metrics (CPU, memory, network) but requires you to add custom metrics for business logic, **Vapi provides built-in operational instrumentation but requires you to configure business-specific instrumentation**. + +### Built-in (automatic) instrumentation + +**Purpose**: Track technical operation and system health. + +Vapi automatically captures technical and operational data for every call - duration, cost, transcripts, tool calls, and performance metrics. This data helps you understand **how the system is performing** from an infrastructure perspective. + +You get this data with zero configuration. No setup required. + +### Custom (user-defined) instrumentation + +**Purpose**: Track business outcomes and domain-specific metrics. + +Custom instrumentation (via Structured Outputs and Call Analysis) lets you define **what success means for your use case**. Did the assistant book an appointment? Collect customer information? Resolve the user's question? These outcomes are specific to your business logic, so you must explicitly configure what to capture. + +Custom instrumentation requires you to define schemas (via Structured Outputs) that tell Vapi what business data to extract from each call. + +### The key decision + +**What business metrics do you need beyond Vapi's automatic operational data?** + +Think of instrumentation as installing sensors in your assistant: +- What information do you need to evaluate call success? +- What metrics will help you debug failures? +- What data do you need for compliance or reporting? + +The "Instrumentation tools at a glance" section below shows how to configure custom instrumentation. + +--- + +## Instrumentation tools at a glance + +| Tool | What it does | Configuration | +| ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------- | +| **Built-in Instrumentation** | Automatic capture of call metadata (duration, cost, timestamps), transcripts, messages, tool calls, operational metrics. | ✅ Automatic - no configuration needed | +| **Structured Outputs** | AI-powered data extraction using JSON Schema. Define custom schemas for customer info, call outcomes, sentiment analysis, summaries. | ⚙️ Configure schemas on assistant | +| **Call Analysis** | Legacy feature for generating call summaries using AnalysisPlan configuration. | ⚠️ Legacy (use Structured Outputs for new implementations) | + + +Confirm that this list of VAPI features mapped to Instrumentation phase is correct. 
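To make the Structured Outputs row in the table above concrete, here is a minimal sketch of a scalar-only schema for the kind of business metrics this guide discusses (call success, booking outcome, customer info). The field names are illustrative examples rather than a required format; see the Structured Outputs quickstart for how to attach a schema to an assistant.

```typescript
// Illustrative scalar-only schema: every property is a string, number,
// boolean, or enum, so the extracted values stay flat and queryable.
const bookingCallOutcome = {
  type: "object",
  properties: {
    call_successful: {
      type: "boolean",
      description: "Did the call achieve its primary goal?",
    },
    booking_created: {
      type: "boolean",
      description: "Was an appointment booked during the call?",
    },
    customer_name: {
      type: "string",
      description: "Caller's name, if provided",
    },
    intent: {
      type: "string",
      enum: ["book_appointment", "reschedule", "question", "other"],
      description: "Primary reason for the call",
    },
    escalation_needed: {
      type: "boolean",
      description: "Should a human follow up on this call?",
    },
  },
  required: ["call_successful", "intent"],
} as const;

console.log(JSON.stringify(bookingCallOutcome, null, 2));
```

Keeping every field scalar matters later: as the extraction patterns guide explains, flat fields like these can flow to Boards automatically, while nested objects require webhook extraction.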
+ +--- + +## Built-in Instrumentation + +### What Vapi captures automatically + +Every call automatically includes: + +- **Call metadata**: Duration, start/end timestamps, cost breakdown +- **Transcripts**: Full conversation text with speaker labels +- **Messages**: Array of all messages exchanged during the call +- **Tool calls**: Record of function calls, parameters, and responses +- **Operational metrics**: Latency, token usage, provider information + +### Accessing built-in data + +**Via Dashboard**: + +- Navigate to Calls > [Call ID] to view full transcript and metadata +- Use Boards to query operational metrics (cost, duration, volume) + +**Via API**: + +- Call object includes all metadata, transcript, messages, tool calls +- Analytics API provides aggregated operational metrics +- Webhooks deliver complete call data to your endpoints + +### When built-in is sufficient + +Built-in instrumentation covers: + +- ✅ Operational monitoring (cost, latency, volume) +- ✅ Debugging (transcripts, tool call traces) +- ✅ Basic analytics (call patterns, duration trends) + +Built-in does NOT cover: + +- ❌ Business metrics (booking created, lead quality, intent classification) +- ❌ Custom quality scores (sentiment, CSAT, success evaluation) +- ❌ Extracted customer data (name, phone, appointment details) + +**Rule of thumb**: If you need to ask "Did this call achieve its business goal?", you need custom instrumentation. + +--- + +## Structured Outputs + +### What it does + +**Structured Outputs** uses AI to extract and analyze data from calls based on JSON Schema definitions you provide. + +**Key differentiators**: + +- Uses JSON Schema for flexible, custom data extraction +- AI-powered analysis (can evaluate sentiment, satisfaction, success) +- Access to full transcript, messages, tool calls, and metadata +- Post-call automatic execution with validation + +**When to use**: + +- You need custom business metrics (booking created, lead qualified, intent classification) +- You want AI to evaluate quality (sentiment, CSAT, issue resolution) +- You need extracted customer data (name, contact info, preferences) +- You want structured, queryable data (not just summaries) + +**Capabilities**: + +- Extract customer information for CRM integration +- Evaluate call outcomes (success, booking, escalation needed) +- Measure quality metrics (sentiment, satisfaction, resolution) +- Generate structured summaries with specific data points + +**Limitations**: + +- Requires JSON Schema knowledge +- Schema design affects downstream queryability (see [Extraction patterns](/observability/extraction-patterns)) +- AI extraction accuracy depends on call quality and schema clarity + +**When NOT to use**: + +- You only need simple call summaries (consider Call Analysis) +- Built-in operational metrics are sufficient + +→ **[Configure Structured Outputs: Quickstart](/assistants/structured-outputs-quickstart)** +→ **[See schema examples and patterns](/assistants/structured-outputs-examples)** + +--- + +## Call Analysis + +### What it does + +**Call Analysis** generates AI-powered call summaries and evaluations using AnalysisPlan configuration. 
+ +**Key differentiators**: + +- Simpler configuration than Structured Outputs (predefined templates) +- Focused on summaries and success evaluation +- Text-based output (not structured data) + +**When to use**: + +- You need basic call summaries +- Simple success/failure evaluation is sufficient +- You don't need structured, queryable data + +**Capabilities**: + +- Generate call summaries +- Evaluate call success +- Extract structured data using predefined formats + +**Limitations**: + +- Less flexible than Structured Outputs (predefined templates vs custom schemas) +- Text-based output (harder to query programmatically) +- Limited to summary/evaluation use cases + +**When NOT to use**: + +- You need custom structured data extraction → use Structured Outputs +- You need queryable metrics for dashboards → use Structured Outputs +- You need complex, domain-specific analysis → use Structured Outputs + +Confirm Call Analysis status - is this truly legacy/deprecated, or actively supported alongside Structured Outputs? This will help to position accordingly. + +→ **[Configure Call Analysis](/assistants/call-analysis)** + +--- + + + **Instrumentation happens at assistant configuration time**, not during the + call. You define your schemas and analysis plans when building your assistant. + The actual data extraction happens automatically after each call. + + +--- + +## Instrumentation best practices + +### Start simple, iterate + +**Phase 1 - Basic business metrics**: + +- Track call success (did we achieve the goal?) +- Capture essential customer info (name, contact details) + +**Phase 2 - Add quality metrics as you scale**: + +- Measure sentiment and satisfaction +- Categorize issues or intents +- Track resolution outcomes + +**Phase 3 - Domain-specific analysis**: + +- Add compliance tracking +- Industry-specific categorization +- Advanced quality scoring +- Multi-dimensional analysis + +→ **[See implementation examples in Structured Outputs guide](/assistants/structured-outputs-examples)** + +### Design for your extraction pattern + +Your instrumentation choices affect downstream observability: + +**Dashboard Native** (Boards-only monitoring): + +- Design for queryability and visualization +- Consider what metrics you'll want to chart + +**Webhook-to-External** (external analytics platforms): + +- Design for your target system's data model +- Full flexibility in data structure + +**Hybrid** (both Boards and external): + +- Balance queryable operational metrics with rich analytical data + +→ **[Choose your extraction pattern](/observability/extraction-patterns)** before finalizing instrumentation design + +### Test your schemas + +Before deploying to production: + +1. Make 3-5 test calls covering common scenarios +2. Verify structured outputs populate correctly +3. Check data appears in Dashboard (if using Dashboard Native pattern) +4. Validate extraction accuracy (does AI extract the right data?) + +--- + +## What you'll learn in detailed guides + +- [Structured outputs quickstart](/assistants/structured-outputs-quickstart) — Set up your first structured output in 5 minutes +- [Structured outputs examples](/assistants/structured-outputs-examples) — Real-world schemas for common use cases +- [Call analysis](/assistants/call-analysis) — Legacy AnalysisPlan reference +- [Extraction patterns](/observability/extraction-patterns) — Choose Dashboard Native, Webhook, or Hybrid approach + +--- + +## Key takeaway + +**Instrument early**. 
The data you configure now determines what you can test, monitor, and optimize later. Missing instrumentation discovered in production means you're flying blind. + +Start with basic business metrics (call success, customer info), then add quality and domain-specific instrumentation as you scale. + +--- + +## Next steps + + + + Set up your first custom instrumentation + + + + Choose your data extraction strategy + + + + Next stage: Validate your instrumented assistant + + diff --git a/fern/observability/testing-strategies.mdx b/fern/observability/testing-strategies.mdx new file mode 100644 index 000000000..101de212f --- /dev/null +++ b/fern/observability/testing-strategies.mdx @@ -0,0 +1,177 @@ +--- +title: Testing strategies +subtitle: Validate your assistant works correctly before deploying to production +slug: observability/testing-strategies +--- + +## Voice AI Testing Challenges + +**Testing** means validating your assistant works correctly **before deploying to production**. Voice AI testing prevents embarrassing failures, reduces production debugging costs, and builds confidence in your assistant. + +Unlike traditional software testing (unit tests, integration tests), voice AI testing must validate: + +- **LLM behavior** — Does the assistant prompt produce the right responses and reasoning? +- **Routing logic** — Do squad handoffs, assistant transfers, and multi-turn flows work correctly? +- **Tool orchestration** — Do function calls happen at the right time with the right parameters? +- **Edge cases** — How does the system handle interruptions, unclear requests, or unexpected inputs? +- **Regression** — Do changes break existing functionality? + +What other specific validation and/or testing uniqueness have clients reported when working with voice AI testing? + +--- + +## Testing tools at a glance + +| Tool | What it does | Best for | +| ----------------------------- | --------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------- | +| **Evals** | Text-based mock conversations for testing assistant logic. Fast, cheap, deterministic. | Regression testing, CI/CD integration, rapid iteration during development | +| **Simulations (Pre-release)** | AI-powered realistic callers (voice + chat). Slower, more expensive, non-deterministic. | Pre-production validation, voice quality testing, realistic end-to-end scenarios | +| **Test Suites** | Legacy testing feature (voice and chat modes). | ⚠️ Deprecated - migrate to Evals/Simulations | + +Confirm that these are all the current VAPI tools that fit under testing phase of Observability? + +--- + +## Testing pyramid for voice AI + +Visual representation of test distribution: + +``` + /\ + / \ E2E Tests (Simulations) + /----\ 1-2 critical journeys + / \ Full workflow + voice quality + /--------\ + / \ Integration Tests + /------------\ 3-5 cross-component flows + / \ Handoffs, transfers, tool chains +/________________\ + Unit Tests (Evals) + 20-50+ specific scenarios + Logic, routing, edge cases +``` +This ascii diagram will be replaced with a real diagram once page edits are approved + +**Key principle**: Many fast, cheap unit tests at the base; few expensive, realistic E2E tests at the top. 
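To make the base of the pyramid concrete, the sketch below shows the ingredients of a scripted, unit-level test case: exact user messages plus the behaviors to assert. The structure and field names are illustrative assumptions, not Vapi's Evals format; see the Evals quickstart for the actual configuration.

```typescript
// Illustrative shape for a scripted unit-level test case (not Vapi's
// Evals schema): exact user turns in, expected assistant behavior out.
interface ScriptedEvalCase {
  name: string;
  userTurns: string[]; // exact messages the mock user sends
  expectedBehaviors: string[]; // assertions to check against the replies
}

const qualificationFlow: ScriptedEvalCase = {
  name: "Qualifies caller before offering a booking",
  userTurns: [
    "Hi, I'd like to book an appointment.",
    "It's for a cleaning, and I'm a new patient.",
  ],
  expectedBehaviors: [
    "Asks whether the caller is a new or existing patient",
    "Offers available times only after the service type is known",
    "Does not invoke the booking tool before qualification is complete",
  ],
};

console.log(`${qualificationFlow.userTurns.length} scripted turns defined`);
```

Because the user turns are fixed, a case like this runs in seconds and behaves deterministically, which is what makes the base of the pyramid cheap enough to run on every commit.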
+ +### Test type hierarchy + +**Unit tests** (use Evals): + +- Test single-assistant logic in isolation +- Examples: Qualification flow, data validation, routing decisions +- Run frequently (every commit) + +**Integration tests** (use Evals for logic validation, Simulations for voice quality): + +- Test interactions between components +- Examples: Assistant handoffs, context passing, tool call chains +- Use Evals for component interaction logic; use Simulations when voice quality matters +- Run before deployment + +**End-to-end tests** (use Simulations): + +- Test complete user journeys with voice quality +- Examples: Full conversation from greeting to confirmation +- Run before major releases + +--- + +## Evals vs Simulations: When to use which? + +**The key question**: Do you need realistic conversation variability, or do you need fast, deterministic validation? + +| Dimension | Evals | Simulations | +| ------------------------ | ------------------------------------------------------ | ---------------------------------------------------------------------------------- | +| **Realism** | Scripted conversations - you write exact user messages | AI-driven variability - you define a caller persona that adapts like a real person | +| **Speed** | ⚡ Fast - text-only, no TTS/STT overhead | 🐌 Slower - specially if using full voice pipeline (more performant in chat mode) | +| **Test duration** | 3-10 seconds per test | 30-120 seconds per test | +| **Feedback loop** | Immediate (results in seconds) | Delayed (setup + execution + processing takes minutes) | +| **Cost** | 💰 Low - fewer LLM tokens (no simulated caller) | 💰💰 Higher - 2x LLM usage (caller + assistant) | +| **Determinism** | ✅ Deterministic - same input = same test | ❌ Non-deterministic - AI caller persona varies responses | +| **CI/CD integration** | ✅ Great fit - fast, predictable, easy to assert | ⚠️ Harder - variability makes assertions difficult | +| **Voice testing** | ❌ Text-only (no TTS/STT validation) | ✅ Full voice pipeline (pronunciation, interruptions, latency) | +| **Coverage granularity** | Specific scenarios (one behavior per test) | Full user journeys (multiple behaviors) | +| **Parallelization** | Unlimited parallel execution | Limited by rate limits + cost | +| **Setup effort** | 📝 Manual - write conversation scripts | 🤖 AI-assisted - describe scenario, AI generates conversation | +| **Regression testing** | ✅ Ideal - catch exact breakages | ⚠️ Less ideal - variability can mask or false-flag issues | +| **Exploratory testing** | ❌ Limited - scripted paths only | ✅ Excellent - AI explores unexpected paths | + +Do we keep emojis in the table or remove them? + +--- + +## Recommended testing strategy + +Use a **hybrid approach** that leverages both tools: + +1. **Develop with Evals** - Fast feedback, cheap to run, great for iteration +2. **Validate with Simulations** - Before production, run realistic scenarios +3. **Regression suite** - Maintain Evals that run on every deployment +4. **Quarterly exploratory testing** - Run Simulations to discover edge cases, add learnings to Evals suite + +Confirm this strategy aligns with VAPI's recommendations. Are there specific use cases where this breaks down? 
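In contrast to a scripted case, a Simulation is driven by a persona rather than exact messages. The sketch below shows the kind of information such a scenario carries; it is an illustrative structure with assumed field names, not the Simulations API format, so see the Simulations quickstart for the real setup.

```typescript
// Illustrative scenario definition for an AI-driven test caller (not the
// Simulations API format): a persona and goals instead of exact messages.
interface SimulatedCallerScenario {
  name: string;
  persona: string; // who the AI caller pretends to be
  goal: string; // what the caller is trying to accomplish
  behaviors: string[]; // variability the caller should introduce
  successCriteria: string[]; // what the assistant must do for a pass
}

const impatientRescheduler: SimulatedCallerScenario = {
  name: "Impatient caller rescheduling an appointment",
  persona: "Existing patient in a hurry who interrupts long explanations",
  goal: "Move Thursday's cleaning to sometime next week",
  behaviors: [
    "Interrupts the greeting to state the request immediately",
    "Gives the new date and time in a single rushed sentence",
  ],
  successCriteria: [
    "Confirms the existing appointment before changing it",
    "Reads back the new date and time for confirmation",
  ],
};

console.log(`Scenario: ${impatientRescheduler.name}`);
```

Because the simulated caller adapts to whatever the assistant says, each run can take a different path, which is why the comparison table above positions Simulations for exploratory and pre-production validation rather than CI assertions.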
+ +--- + +## What you'll learn in detailed guides + +- [Evals quickstart](/observability/evals-quickstart) — Run your first evaluation in 5 minutes +- [Evals advanced](/observability/evals-advanced) — Advanced testing strategies, CI/CD integration +- [Simulations quickstart](/observability/simulations-quickstart) — Set up realistic voice/chat testing +- [Simulations advanced](/observability/simulations-advanced) — Tool mocks, hooks, complex scenarios + + + **Simulations is currently in pre-release**. API stability and feature + availability may change. Check with Vapi support for current status and GA + timeline. + + +Legacy Test Suites (voice and chat testing) have been replaced by Evals and Simulations. If you're using Test Suites, migrate to Evals (text-based) or Simulations (voice testing). Confirm deprecation status and migration path. + + +--- + +## Key takeaway + +**Test before production**. Finding bugs in production with real customers is expensive and embarrassing. A systematic testing strategy catches issues early when they're cheap to fix. + +Use Evals for fast iteration and regression testing. Use Simulations for realistic pre-production validation. + +--- + +## Next steps + + + + Build your first test suite + + + + Run realistic voice testing + + + + Next stage: Choose your data extraction strategy + + + + Return to observability framework + + From 8d6dc9dab58dcc3cf819b569a24dba3c23686ebb Mon Sep 17 00:00:00 2001 From: Rafa Date: Mon, 16 Feb 2026 12:34:13 -0500 Subject: [PATCH 5/8] docs(feat): add extraction patterns and production readiness guides MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add two guides covering data extraction architecture and deployment validation: **Extraction patterns guide:** - Three extraction patterns at a glance (Dashboard Native, Webhook-to-External, Hybrid) - Pattern descriptions with architectural trade-offs - Feature mapping to patterns (Structured Outputs, Scorecards, APIs, Langfuse) - When to use each pattern - Schema design implications - Migration paths between patterns **Production readiness guide:** - Progressive validation approach (INSTRUMENT+TEST → EXTRACT+MONITOR → OPTIMIZE) - Stage-by-stage checklist with required and recommended items - Production readiness gates (first deploy, scaled deploy, mature observability) - Common readiness mistakes and fixes - Deployment workflow timeline Includes VAPI validation questions throughout for pattern accuracy and naming consistency. --- fern/observability/extraction-patterns.mdx | 364 ++++++++++++++++++ fern/observability/production-readiness.mdx | 406 ++++++++++++++++++++ 2 files changed, 770 insertions(+) create mode 100644 fern/observability/extraction-patterns.mdx create mode 100644 fern/observability/production-readiness.mdx diff --git a/fern/observability/extraction-patterns.mdx b/fern/observability/extraction-patterns.mdx new file mode 100644 index 000000000..9df861e8e --- /dev/null +++ b/fern/observability/extraction-patterns.mdx @@ -0,0 +1,364 @@ +--- +title: Choosing your extraction pattern +subtitle: Understand the three architectural patterns for getting data out of Vapi +slug: observability/extraction-patterns +--- + +## Why extraction is an architectural choice + +Unlike traditional observability platforms (DataDog, New Relic) where data flows automatically from instrumentation to monitoring, **Vapi requires you to choose how data gets extracted** for analysis. 
+ +This design reflects Vapi's architecture: + +- **Scalar Structured Outputs** (strings, numbers, booleans) flow automatically to Boards and Insights API +- **Object Structured Outputs** (nested data) require webhook extraction +- **Scorecard results** don't appear in native analytics (webhook-only) + +**Your extraction pattern choice determines**: + +- What schema types you can use (scalar vs object fields) +- What tools you can use for monitoring (Boards vs external BI) +- How much engineering effort is required +- Whether you can export to existing data infrastructure + +Confirm this framing is accurate and doesn't oversimplify + +--- + +## The three extraction patterns at a glance + +Vapi offers three architectural patterns for extracting observability data from your calls. Each pattern represents a different trade-off between simplicity and flexibility: + +| Pattern | Description | Engineering effort | Data richness | Typical users | +|---------|-------------|-------------------|---------------|---------------| +| **Dashboard Native** | Use Vapi's built-in Boards with scalar Structured Outputs for real-time dashboards | ⚡ Minimal (no infrastructure) | Basic (scalar fields only) | Solo founders, non-technical teams, startups | +| **Webhook-to-External** | Build custom post-call processing that captures data via webhooks and exports to your data warehouse | 🛠️ High (requires backend infrastructure) | Rich (full object schemas, nested data) | Engineering teams, enterprises with existing data platforms | +| **Hybrid** | Combine both approaches - use Boards for operational metrics, webhooks for deep analysis | ⚙️ Medium (partial infrastructure) | Flexible (mix of scalar and object data) | Growing teams balancing simplicity and power | + +**How to choose**: Start with Dashboard Native (fastest setup). Migrate to Hybrid or Webhook-to-External as your analytics needs grow or when you need features like Scorecard visualization or external BI tools. + +--- + +## EXTRACT stage features at a glance + +| Feature | What it extracts | Extraction method | Pattern compatibility | +| ------------------------------- | -------------------------------------------------------------- | ---------------------------------------------------------------- | --------------------------- | +| **Structured Outputs (Scalar)** | Business metrics using scalar fields (individual boolean, strings, numbers) | Automatic → Boards + Insights API | Dashboard Native, Hybrid | +| **Structured Outputs (Object)** | Rich nested data using object/array schemas | Webhooks only | Webhook-to-External, Hybrid | +| **Scorecards** | AI-powered quality evaluation results | Webhooks only (not visible in Boards) | Webhook-to-External, Hybrid | +| **Insights API** | [TBD: What does Insights API extract/provide?] | [TBD: Automatic for scalars? Separate feature?] | [TBD] | +| **Analytics API** | [TBD: What does Analytics API extract/provide?] | [TBD: How does it differ from Insights API?] | [TBD] | +| **Langfuse Integration** | Real-time observability data to external platform | Direct integration (real-time, no webhooks/post-call processing) | All patterns | + + +Confirm this list is complete and accurate? Need help explaining and contrasting Insights API and Analytics API. Are you ok with having Langfuse be included here in extraction phase or should we only mention in monitoring phase? 
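The scalar-versus-object distinction in the table above is easiest to see side by side. Both sketches below are illustrative schemas (field names are examples, not a required format): the first stays flat, so its values can flow to Boards and the Insights API automatically, while the second nests data and is therefore webhook-only.

```typescript
// Flat, scalar-only schema: each value lands as an individual queryable
// field, which is what the Dashboard Native pattern relies on.
const scalarOutcome = {
  type: "object",
  properties: {
    booking_created: { type: "boolean" },
    appointment_date: { type: "string" },
    appointment_service: { type: "string" },
  },
} as const;

// Nested object schema: richer structure, but per the table above this
// shape is only available through webhook extraction.
const objectOutcome = {
  type: "object",
  properties: {
    appointment: {
      type: "object",
      properties: {
        date: { type: "string" },
        time: { type: "string" },
        service: { type: "string" },
      },
    },
    conversation_topics: {
      type: "array",
      items: { type: "string" },
    },
  },
} as const;

console.log(Object.keys(scalarOutcome.properties));
console.log(Object.keys(objectOutcome.properties));
```

The same trade-off reappears in the pattern descriptions and schema design notes below: flatten fields when Boards is the destination, nest them when your warehouse is.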
+ +--- + +## The three extraction patterns + +### Pattern 1: Dashboard Native + +**What it is**: This pattern uses Vapi's built-in Boards platform to automatically visualize scalar Structured Outputs (strings, numbers, booleans) without any external infrastructure. Data flows from your assistant configuration directly to Boards, where you can build real-time dashboards using a drag-and-drop visual builder. + +Validate that Structured Outputs (scalar) are the only instrumentation that will work with native Vapi Boards + +**Architecture**: Structured Outputs (scalar only) → Boards + +**Who it's for**: + +- Non-technical teams or solo founders +- Teams without backend engineering resources +- Startups with simple analytics needs +- Quick operational dashboards (call volume, cost, success rate) + +**How it works**: + +1. Configure Structured Outputs using **scalar fields only** (no nested objects) +2. Data automatically flows to Vapi Boards +3. Build dashboards using drag-and-drop visual builder +4. Monitor via Boards web interface + +**Capabilities**: + +- ✅ Real-time dashboards with no code +- ✅ Built-in formulas and aggregations (Math.js) +- ✅ Global filters and time range controls +- ❌ Can't export to external BI tools (Tableau, PowerBI) +- ❌ Can't use object-type schemas (limits extraction richness) +- ❌ Can't visualize Scorecard results + +**When to use**: + +- You're just starting with observability +- You don't have engineering resources for webhook infrastructure +- Your analytics needs are simple (operational metrics, not complex business intelligence) +- You need visibility fast with minimal setup + +**When NOT to use**: + +- You need to export to external BI tools (Tableau, PowerBI, Looker) → use **Webhook-to-External** +- You're using Scorecards for quality monitoring (results not visible in Boards) → use **Webhook-to-External** or **Hybrid** +- Compliance requires data sovereignty or custom retention → use **Webhook-to-External** +- You need rich nested data schemas (objects, arrays) → use **Webhook-to-External** or **Hybrid** + +**Example use case**: A solopreneur running an AI receptionist for their dental practice. Wants to track: daily call volume, booking rate, missed calls. Uses Boards to see trends and spot issues. + +Pay close attention to this section because a number of assumptions are being made. Corrections and disambiguation needed. + +--- + +### Pattern 2: Webhook-to-External + +**What it is**: This pattern uses Vapi's webhook functionality to send post-call data to a custom endpoint you build and host. You configure a webhook URL at the org, squad, or assistant level, and Vapi sends complete call data (including object-type Structured Outputs and Scorecard results) to your server after each call, where you can process and store it in your data warehouse. + +Naming consistency question: We've used "webhook", "webhook-to-external", and "Webhook-to-External" throughout the docs. Should we standardize on one name for this pattern? Recommendation: "Webhook-to-External" (capitalized, hyphenated) to parallel "Dashboard Native". Confirm preferred naming. + +**Architecture**: Structured Outputs (any type) → Webhooks → Your data warehouse → Your BI tools + +**Who it's for**: + +- Engineering teams with data infrastructure +- Enterprises with existing analytics platforms +- Teams needing custom business intelligence +- Organizations requiring data sovereignty or compliance + +**How it works**: + +1. 
Configure Structured Outputs using **rich object schemas** (nested data, arrays, complex types) +2. Set up webhook endpoint on your servers to receive call data +3. Process webhooks and store in your data warehouse (BigQuery, Snowflake, Postgres) +4. Connect BI tools (Tableau, Looker, Metabase) to your warehouse +5. Build custom analytics on your infrastructure + +**Capabilities**: + +- ✅ Full control over data storage and processing +- ✅ Integration with existing BI and alerting systems +- ✅ Rich nested data schemas (not limited to scalars) +- ✅ Can access Scorecard results via webhooks +- ❌ Requires backend engineering (webhook receiver, database, ETL) +- ❌ Higher operational complexity (hosting, monitoring webhooks) + +**When to use**: + +- You have engineering resources to build webhook infrastructure +- You need to integrate Vapi data with existing business systems (CRM, data warehouse) +- You require custom analytics beyond Vapi's built-in capabilities +- Compliance or data sovereignty requires you to control data storage + +**When NOT to use**: + +- You have no backend engineering team or resources → use **Dashboard Native** +- Your analytics needs are simple and Boards provides sufficient visibility → use **Dashboard Native** +- You want to start simple and may add external integration later → use **Dashboard Native** or **Hybrid** +- You need instant operational dashboards without warehouse ETL delays → consider **Hybrid** instead + +**Example use case**: An enterprise healthcare org using Vapi for patient intake. Needs to: sync extracted patient info to Epic EHR, analyze call quality trends in Tableau, alert on-call staff via PagerDuty. Uses webhooks to export all call data to Snowflake, then integrates downstream systems. + +--- + +### Pattern 3: Hybrid + +**What it is**: This pattern combines Dashboard Native and Webhook-to-External approaches by maintaining two parallel data flows - scalar Structured Outputs go to Boards for real-time operational dashboards, while rich object schemas and Scorecard results are exported via webhooks to your external data warehouse. This allows operations teams to use Boards while analytics teams get full-fidelity data in external BI tools. + +**Architecture**: + +- **Operational track**: Scalar Structured Outputs → Boards (real-time dashboards) +- **Analytics track**: Object Structured Outputs + Scorecards → Webhooks → External warehouse + +**Who it's for**: + +- Teams with some engineering resources +- Organizations balancing simplicity and power +- Teams iterating from simple to complex analytics +- Use cases needing both real-time ops dashboards AND deep analysis + +**How it works**: + +1. Configure **two sets of Structured Outputs**: + - Scalar fields for operational metrics (cost, volume, basic success metrics) + - Object fields for rich analysis (full conversation context, detailed scoring) +2. Scalar data flows to Boards for real-time visibility +3. Object data + Scorecards exported via webhooks for deep analysis +4. 
Operations team uses Boards, analytics team uses external BI + +**Capabilities**: + +- ✅ Best of both worlds: simple dashboards + powerful analytics +- ✅ Incremental complexity (start with Boards, add webhooks later) +- ✅ Team separation (ops uses Boards, analysts use BI tools) +- ❌ More complex schema design (must plan for both tracks) +- ❌ Partial engineering effort (still need webhook infrastructure) + +**When to use**: + +- You're scaling from simple to complex analytics needs +- Different teams have different analytics requirements (ops vs analysts) +- You want real-time operational visibility without waiting for warehouse ETL +- You're not sure yet whether Boards alone will be sufficient long-term + +**When NOT to use**: + +- Your needs clearly fit one pattern—all simple (use **Dashboard Native**) or all complex (use **Webhook-to-External**) +- You want to minimize schema design complexity → use single-pattern approach +- Small team where everyone uses the same analytics tools → use **Dashboard Native** or **Webhook-to-External** consistently +- You're confident Boards will never be sufficient → skip straight to **Webhook-to-External** + +**Example use case**: A growing SaaS company using Vapi for sales qualification calls. Sales ops team monitors daily metrics in Boards (call volume, booking rate). Data team exports full conversation analysis via webhooks to BigQuery for prompt optimization and quarterly business reviews. + +--- + +{/* ## Decision framework: Choosing your pattern + + + + | Capability | Recommended Pattern | + |------------|-------------------| + | No backend engineering | **Dashboard Native** | + | Backend team, no data warehouse | **Dashboard Native** (start here, migrate to Hybrid later) Assumes backend teams without existing warehouse should start simple. Alternative: Could recommend Webhook-to-External with lightweight warehouse (Postgres) if team has capacity. | + | Backend team + data warehouse | **Webhook-to-External** or **Hybrid** | + | Enterprise with existing BI stack | **Webhook-to-External** | + + + + | Need | Recommended Pattern | |------|-------------------| | Simple + operational metrics (volume, cost, success rate) | **Dashboard Native** | | + Need to export to Tableau/PowerBI/Looker | **Webhook-to-External** | | + Real-time ops + deep analysis | **Hybrid** | | Compliance requires data + control | **Webhook-to-External** | | Using Scorecards for quality monitoring + | **Webhook-to-External** or **Hybrid** (Scorecard results not in Boards) | + + + + | Context | Recommended Pattern | + |---------|-------------------| + | Startup / MVP stage | **Dashboard Native** | + | Growing team (10-50 people) | **Hybrid** | + | Enterprise (50+ people) | **Webhook-to-External** or **Hybrid** | + | Must integrate with CRM/ERP | **Webhook-to-External** | + | Need instant visibility, minimal engineering | **Dashboard Native** | + + + +Are these recommendations aligned with how VAPI sees + customer segments? + +--- */} + +--- + + +## Common migration paths + + Are reverse migrations possible/recommended? + (Webhook-to-External → Hybrid or Hybrid → Dashboard Native)? Do teams ever + simplify their extraction approach, or is migration always toward more + complexity? + +### Dashboard Native → Hybrid + +**When to migrate**: You need deeper analysis but want to keep operational dashboards + +**What changes**: Add object-type Structured Outputs + webhook infrastructure. Existing scalar outputs continue flowing to Boards. 
+ +**Impact**: Minimal disruption—operations team keeps using Boards, analytics team gets external warehouse access. + +--- + +### Hybrid → Webhook-to-External + +**When to migrate**: External warehouse becomes single source of truth, Boards no longer provide value + +**What changes**: Migrate all data extraction to webhooks, rebuild operational dashboards in external BI tool (Looker, Tableau, Metabase). + +**Impact**: Medium effort—requires dashboard migration, but unifies analytics platform. + +--- + +### Dashboard Native → Webhook-to-External + +**When to migrate**: Compliance requirement, CRM integration, or sudden need for external data control + +**What changes**: Full replacement—redesign schemas for richness, build webhook infrastructure, rebuild all dashboards externally. + +**Impact**: High effort—complete platform migration, but necessary for regulatory or integration requirements. + +--- + +## Schema design implications + + This section should probably be in Structured Outputs doc pages; not here. + +Your extraction pattern choice **determines how you design Structured Output schemas** in the INSTRUMENT stage. + +### Dashboard Native: Scalar fields only + +**Constraint**: Only scalar types (boolean, string, number) flow to Boards. Nested objects are invisible to dashboards. + +**Design strategy**: Flatten nested data into scalar fields. For example: + +- ✅ `appointment_date` (string), `appointment_time` (string), `appointment_service` (string) +- ❌ `appointment_details` (object with nested date/time/service) + +**Tradeoff**: Simpler schemas, but loses data structure richness. + +--- + +### Webhook-to-External: Full schema flexibility + +**Freedom**: Use rich nested schemas—objects, arrays, complex types. Your data warehouse can query anything. + +**Design strategy**: Structure data naturally. Nested customer objects, conversation analysis arrays, quality metric hierarchies. + +**Tradeoff**: More expressive data model, but requires webhook infrastructure. + +--- + +### Hybrid: Two-schema strategy + +**Operational track** (Boards): Scalar fields for real-time metrics (success rate, call volume, cost) + +**Analytics track** (Webhooks): Rich nested schemas for deep analysis (full conversation context, sentiment timelines, topic extraction) + +**Design strategy**: Duplicate key metrics across both schemas. Operational team gets instant visibility; analytics team gets comprehensive data. + +**Tradeoff**: Schema design complexity (must maintain two structures), but provides best of both worlds. + +→ **[See schema examples and design patterns in Structured Outputs guide](/assistants/structured-outputs-quickstart)** + + + +--- + +## Next steps + + + + Learn how to instrument your assistant with schemas + + + + Build your first dashboard (Dashboard Native pattern) + + + + Return to the observability maturity model + + + + Validate you're ready for production + + diff --git a/fern/observability/production-readiness.mdx b/fern/observability/production-readiness.mdx new file mode 100644 index 000000000..659d03197 --- /dev/null +++ b/fern/observability/production-readiness.mdx @@ -0,0 +1,406 @@ +--- +title: Production readiness checklist +subtitle: Validate your voice AI assistant is ready for production deployment +slug: observability/production-readiness +--- + +## What is production readiness? 
+ +**Production readiness** means your voice assistant has been systematically validated across instrumentation, testing, extraction, and monitoring **before** you deploy it to handle real customer calls. + +Deploying without production readiness means: +- ❌ Finding bugs with real customers (embarrassing, expensive) +- ❌ No visibility into operational health (flying blind) +- ❌ Can't measure success or improvement (no data-driven optimization) +- ❌ Production debugging is reactive, not proactive + +**This checklist helps you avoid those problems** by ensuring you've instrumented, tested, and validated before launch. + + Does VAPI have internal prod readiness criteria we should align with? + +--- + +## How to use this checklist + +### Progressive validation + +You don't need to complete ALL stages before deploying. The checklist is **progressive**: + +1. **INSTRUMENT + TEST = Minimum viable production** — You must instrument and test before deploying +2. **EXTRACT + MONITOR = Production-grade** — Add monitoring once you're handling real traffic +3. **OPTIMIZE = Mature observability** — Continuous improvement based on data + +**For your first deployment**: Complete INSTRUMENT + TEST checklist items. Add EXTRACT + MONITOR within 1-2 weeks of production launch. + +--- + +### Checklist format + +Each stage has: +- ✅ **Required** items (must complete before considering this stage "done") +- 🟡 **Recommended** items (nice-to-have, increases confidence) +- 📊 **Validation** (how to verify the item is complete) + +--- + +## Stage 1: INSTRUMENT ✓ + +### Required items + + + + - [ ] Call success/failure indicated (boolean or enum field) + - [ ] Customer information captured (name, contact info, or relevant identifiers) + - [ ] Business outcome tracked (booking created, question answered, escalation needed) + + **Validation**: Run a test call, verify structured output populates in Dashboard + + + + - [ ] Scalar fields (string, number, boolean) for data you want queryable in Boards + - [ ] Object fields for rich data you'll extract via webhooks (if using Webhook or Hybrid pattern) + - [ ] Schema matches your chosen extraction pattern (see [Extraction patterns](/observability/extraction-patterns)) + + **Validation**: Review schema against extraction pattern requirements + + + + - [ ] Make 3-5 test calls covering common scenarios + - [ ] Verify structured outputs populate correctly + - [ ] Confirm data appears in Dashboard (if using Dashboard Native or Hybrid pattern) + + **Validation**: Check Dashboard > Calls tab, verify structured output values are correct + + + +### Recommended items + +- 🟡 Define Structured Outputs for edge cases (unclear requests, escalations, errors) +- 🟡 Add sentiment analysis or CSAT scoring (if applicable) +- 🟡 Capture metadata useful for debugging (tool calls, LLM reasoning, timestamps) + + Are there other common instrumentation patterns we should recommend? 
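For teams who prefer to verify instrumentation from code rather than the Dashboard, a small script along the lines of this sketch can confirm that a test call produced the expected fields. It assumes the Get Call endpoint (`GET https://api.vapi.ai/call/{id}`) and that extracted results are exposed on the returned call object; verify the exact endpoint and field locations against the current API reference before relying on them.

```typescript
// Minimal check that a test call produced the expected business fields.
// Assumptions to verify against the API reference: the Get Call endpoint
// and the exact property where extracted results appear on the call.
const VAPI_API_KEY = process.env.VAPI_API_KEY ?? "";
const EXPECTED_FIELDS = ["call_successful", "intent", "booking_created"];

async function checkTestCall(callId: string): Promise<void> {
  const res = await fetch(`https://api.vapi.ai/call/${callId}`, {
    headers: { Authorization: `Bearer ${VAPI_API_KEY}` },
  });
  if (!res.ok) throw new Error(`Get Call failed: ${res.status}`);

  const call = await res.json();
  // Assumed location: check whether your extracted data appears under
  // `analysis`, `artifact`, or another property in your account.
  const extracted = call.analysis?.structuredData ?? {};

  for (const field of EXPECTED_FIELDS) {
    console.log(`${field in extracted ? "ok     " : "MISSING"} ${field}`);
  }
}

checkTestCall(process.argv[2] ?? "").catch(console.error);
```

Run it against the IDs of your 3-5 test calls; a missing field is a signal to revisit the schema or the test conversation before launch.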
+ +--- + +## Stage 2: TEST ✓ + +### Required items + + + + - [ ] Test happy path (successful completion of primary use case) + - [ ] Test at least 3 common conversation variants + - [ ] All smoke tests passing + + **Validation**: Run Evals via Dashboard or API, verify 100% pass rate + + + + - [ ] Test core conversation flows end-to-end + - [ ] Test tool calls execute correctly (if assistant uses tools) + - [ ] Test edge cases discovered during development + + **Validation**: Regression suite runs cleanly (all tests pass) + + + + - [ ] Assistant responds appropriately to expected user inputs + - [ ] Tone and voice match brand guidelines + - [ ] Error handling works (unclear input, unexpected requests) + + **Validation**: Manual review of test call transcripts + + + +### Recommended items + +- 🟡 Test interruption handling (user talks over assistant) +- 🟡 Test multi-language support (if applicable) +- 🟡 Run Simulations for realistic voice testing (pre-production validation) +- 🟡 Test squad handoffs (if using multi-assistant architecture) + + Add link to testing strategy guide when available (Evals vs Simulations decision framework) + +--- + +## Stage 3: EXTRACT ✓ + +### Required items + + + + - [ ] **Dashboard Native**: Scalar Structured Outputs flowing to Boards ✅ + - [ ] **Webhook-to-External**: Webhook endpoint receiving call data ✅ + - [ ] **Hybrid**: Both Boards and webhooks working ✅ + + **Validation**: See pattern-specific validation below + + + + **If Dashboard Native**: + - [ ] Structured output fields appear in Boards > Calls view + - [ ] Can filter/query by structured output values + + **If Webhook-to-External**: + - [ ] Webhook endpoint receives POST requests from Vapi + - [ ] Call data stored in your data warehouse + - [ ] Can query data in your BI tool + + **If Hybrid**: + - [ ] Operational metrics (scalars) appear in Boards + - [ ] Rich analytics (objects) received via webhooks + + **Validation**: Make test call, trace data from call → Boards/webhooks → storage + + + + - [ ] Scorecard rules defined and tested + - [ ] Webhook configured to receive Scorecard results (required — not available in Boards) + - [ ] Scorecard results validated against expected thresholds + + **Validation**: Run test calls with known outcomes, verify Scorecard scores are correct + + + +### Recommended items + +- 🟡 Document extraction architecture for your team +- 🟡 Set up data retention policies (how long to store call data) +- 🟡 Test webhook failover/retry logic (if using webhooks) + +--- + +## Stage 4: MONITOR ✓ + +### Required items + + + + - [ ] Track call volume (calls per day/week) + - [ ] Track call success rate (% of calls achieving primary goal) + - [ ] Track cost ($/day or $/call) + - [ ] Track error rate (% of calls with failures/escalations) + + **Validation**: Dashboard updates with real data after test calls + + + + - [ ] Document expected call volume range + - [ ] Document expected success rate threshold (e.g., >85% success) + - [ ] Document expected cost per call (for budget tracking) + + **Validation**: Run production for 1-2 weeks, establish baseline from real data + + + + - [ ] Assign team member(s) to check dashboards daily + - [ ] Define what metrics to watch (volume, success rate, cost) + - [ ] Define thresholds for concern (e.g., success rate drops below 80%) + + **Validation**: Team demonstrates they can access and read dashboards + + + +### Recommended items + +- 🟡 Set up automated alerts (via Insights API or external alerting system) +- 🟡 Create executive dashboard (high-level 
summary for leadership) +- 🟡 Track additional metrics (call duration, abandonment rate, CSAT) + +--- + +## Stage 5: OPTIMIZE ✓ + +### Required items + + + + - [ ] Define how team reviews monitoring data (weekly? monthly?) + - [ ] Define how improvement hypotheses are formed (data-driven) + - [ ] Define how changes are tested before deployment (use Evals/Simulations) + + **Validation**: Document the optimization workflow (how do we improve?) + + + + - [ ] Identify pattern from monitoring (e.g., high escalation rate) + - [ ] Extract detailed data (call transcripts, structured outputs) + - [ ] Form hypothesis for improvement + - [ ] Test improvement with Evals before deploying + - [ ] Deploy and verify metric improves + + **Validation**: Complete one full optimization loop (identify → improve → deploy → verify) + + + + - [ ] Add new test cases for bugs found in production + - [ ] Update tests when assistant behavior changes + - [ ] Run regression suite before every deployment + + **Validation**: Regression suite prevents known issues from reoccurring + + + +### Recommended items + +- 🟡 Track optimization impact over time (are we improving?) +- 🟡 Document improvement history (what worked, what didn't) +- 🟡 Share learnings across team (prompt patterns, common pitfalls) + +--- + +## Production readiness gates + +Use these **gates** to decide if you're ready to progress: + +### Gate 1: Ready for FIRST production deployment? + +**Must complete**: +- ✅ INSTRUMENT Stage (all required items) +- ✅ TEST Stage (all required items) + +**Can deploy**: Yes — you have basic instrumentation and testing + +**Next step**: Launch with limited traffic, add EXTRACT + MONITOR within 1-2 weeks + +--- + +### Gate 2: Ready for SCALED production deployment? + +**Must complete**: +- ✅ INSTRUMENT Stage +- ✅ TEST Stage +- ✅ EXTRACT Stage (all required items) +- ✅ MONITOR Stage (all required items) + +**Can scale**: Yes — you have visibility and can detect problems + +**Next step**: Increase traffic, begin OPTIMIZE Stage (continuous improvement) + +--- + +### Gate 3: Production-grade observability maturity? 
+ +**Must complete**: +- ✅ All stages (INSTRUMENT → TEST → EXTRACT → MONITOR → OPTIMIZE) +- ✅ At least one optimization iteration completed +- ✅ Team trained on monitoring and improvement workflow + +**Maturity level**: Production-grade — observability is systematic, not ad-hoc + +**Next step**: Refine and iterate (observability is continuous) + +--- + +## Common readiness mistakes + +### Mistake 1: Deploying without instrumentation + +**Symptom**: "We deployed but can't tell if it's working" + +**Impact**: No data to debug issues, optimize, or measure success + +**Fix**: Go back to INSTRUMENT stage, add Structured Outputs, redeploy + +--- + +### Mistake 2: No regression testing + +**Symptom**: "Every change breaks something we fixed before" + +**Impact**: Quality degrades over time, user trust erodes + +**Fix**: Build regression test suite (Evals), run before every deployment + +--- + +### Mistake 3: Over-engineering extraction before launch + +**Symptom**: "We're 3 weeks into building webhook infrastructure, haven't launched yet" + +**Impact**: Delayed launch, opportunity cost + +**Fix**: Start with Dashboard Native pattern (simple), add webhooks after validating product-market fit + +--- + +### Mistake 4: Monitoring exists but nobody checks it + +**Symptom**: "We have dashboards but didn't notice success rate dropped 20% last week" + +**Impact**: Problems detected late (via customer complaints, not monitoring) + +**Fix**: Assign monitoring owner, set check cadence, define alert thresholds + +--- + +### Mistake 5: No optimization workflow + +**Symptom**: "We have data but don't know how to improve" + +**Impact**: Assistant quality stagnates, competitors improve faster + +**Fix**: Define optimization workflow (weekly review → hypothesis → test → deploy), run first iteration + +--- + +## Deployment workflow with readiness gates + +Recommended workflow for production deployment: + +``` +Week 1-2: Build assistant + INSTRUMENT + TEST + ↓ +Week 2: Gate 1 — Ready for first deployment? + ↓ +Week 3: Deploy to limited production traffic (10-20 calls/day) + ↓ +Week 3-4: Add EXTRACT + MONITOR (while limited traffic runs) + ↓ +Week 4: Gate 2 — Ready for scaled deployment? + ↓ +Week 5+: Scale traffic (100+ calls/day), begin OPTIMIZE + ↓ +Ongoing: Continuous optimization loop +``` + +**Key principle**: Don't over-engineer early. Launch with minimum readiness (Gate 1), add monitoring as you scale. 
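
To make the gates above concrete: Gate 2 requires that someone can actually tell when success rate or cost drifts outside the documented baseline. The sketch below shows what that kind of baseline check might look like once call records are exported (for example via webhooks or a Boards export). It is illustrative only, not a specific Vapi API: the `CallRecord` shape, the `success` structured output field, and the threshold values are all assumptions you would replace with your own.

```typescript
// Illustrative threshold check for the MONITOR baseline behind Gate 2.
// The CallRecord shape is hypothetical: adapt it to whatever your
// extraction pattern actually produces.
interface CallRecord {
  success: boolean; // e.g. a scalar Structured Output field
  costUsd: number;  // per-call cost from your exported call data
}

interface BaselineReport {
  successRate: number;
  avgCostUsd: number;
  healthy: boolean;
}

function checkBaseline(
  calls: CallRecord[],
  minSuccessRate = 0.8, // "threshold for concern" from your MONITOR checklist
  maxAvgCostUsd = 0.5,  // expected cost per call for budget tracking
): BaselineReport {
  const total = Math.max(calls.length, 1);
  const successRate = calls.filter((c) => c.success).length / total;
  const avgCostUsd = calls.reduce((sum, c) => sum + c.costUsd, 0) / total;
  return {
    successRate,
    avgCostUsd,
    healthy: successRate >= minSuccessRate && avgCostUsd <= maxAvgCostUsd,
  };
}

// Example: fail loudly if last week's calls fall outside the baseline.
const lastWeek: CallRecord[] = [
  { success: true, costUsd: 0.31 },
  { success: false, costUsd: 0.47 },
  { success: true, costUsd: 0.28 },
];
const report = checkBaseline(lastWeek);
if (!report.healthy) {
  console.warn("Baseline breached:", report);
}
```

Whether this runs as a scheduled job, an alert rule in your monitoring platform, or a manual weekly review matters less than having the thresholds written down and checked.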
+ +--- + +## Next steps + + + + Return to the observability maturity model + + + + Choose your data extraction strategy + + + + Start instrumenting your assistant + + + + Build your first test suite + + From 214099eeaee45cffe4eb215055bc8408c1ee2055 Mon Sep 17 00:00:00 2001 From: Rafa Date: Mon, 16 Feb 2026 12:34:26 -0500 Subject: [PATCH 6/8] docs(feat): add monitoring and optimization workflow guides MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add two guides covering the "monitor & improve" stages (skeleton format): **Monitoring & Operating guide:** - Reframed title from "Monitoring" to "Monitoring & Operating" - Operating voice AI systems introduction (real-time performance, cost, quality) - Tools at a glance (Boards, Insights API, Analytics API, Langfuse, Webhook-to-External) - Placeholder sections for tool details, alerting strategies, best practices - Focus on operational reliability and continuous visibility **Optimization workflows guide:** - Optimization as continuous improvement loop (not a dedicated tool) - 7-step workflow (Detect → Extract → Hypothesize → Change → Test → Deploy → Verify) - Optimization mindset and why it matters - Placeholder sections for detailed steps, common scenarios, best practices - Cross-functional workflow using tools from all previous stages Both pages use skeleton format with complete intros and VAPI validation questions, awaiting tool clarification and detailed content development in iteration 2. --- fern/observability/monitoring.mdx | 149 +++++++++++++++ fern/observability/optimization-workflows.mdx | 174 ++++++++++++++++++ 2 files changed, 323 insertions(+) create mode 100644 fern/observability/monitoring.mdx create mode 100644 fern/observability/optimization-workflows.mdx diff --git a/fern/observability/monitoring.mdx b/fern/observability/monitoring.mdx new file mode 100644 index 000000000..163a7ac57 --- /dev/null +++ b/fern/observability/monitoring.mdx @@ -0,0 +1,149 @@ +--- +title: Monitoring & Operating +subtitle: Visualize trends, track operational health, and ensure production reliability +slug: observability/monitoring +--- + +## What is monitoring and operating? + +**Monitoring & Operating** means running your voice AI system in production with continuous visibility into its health and performance. This stage answers critical operational questions: + +- How many calls are happening right now? +- What's my average call cost this week? +- Is my success rate dropping? +- Are any assistants experiencing unusual error rates? +- When should I be alerted about problems? + +**Operating a voice AI system** requires more than traditional software monitoring. Voice AI systems have unique operational characteristics: + +- **Real-time performance matters** — Latency, interruption handling, and voice quality directly impact user experience +- **Cost scales with usage** — Every call has LLM, TTS, and STT costs that must be tracked +- **Quality is subjective** — Success isn't just "200 OK" - it's whether the conversation achieved its goal +- **Failures are multi-layered** — Issues can occur in the LLM, voice pipeline, tool execution, or external integrations + +**The goal**: Catch problems early (before customers complain), understand operational patterns, and maintain production reliability. + +--- + +## Monitoring & Operating tools at a glance + +| Tool | What it does | Best for | +|------|--------------|----------| +| **Boards** | Drag-and-drop visual dashboards with charts, metrics, and global filters. 
Queries scalar Structured Output fields. | Real-time operational visibility, team dashboards, custom reporting | +| **Insights API** | [TBD: Programmatic querying and alerting capabilities?] | [TBD: Automated alerts, custom monitoring logic?] | +| **Analytics API** | [TBD: Aggregated operational metrics?] | [TBD: Cost tracking, performance monitoring?] | +| **Langfuse Integration** | Real-time observability platform integration for call monitoring and tracing | End-to-end observability, LLM performance tracking, distributed tracing | +| **Webhook-to-External** | Export call data to third-party monitoring platforms (Datadog, Braintrust, Grafana, custom dashboards) | Enterprise monitoring stacks, unified observability across systems, custom alerting | + +Confirm this list of monitoring tools is complete and accurate. Need clarification on: What are the key capabilities and use cases for Insights API vs Analytics API? How do they differ? When should users choose one over the other? What monitoring capabilities does Langfuse provide beyond basic call data? Are there other built-in or recommended monitoring integrations? What's the roadmap for built-in alerting capabilities? + +--- + +## Boards + +**[Placeholder - Full detail section]** + +→ **[Build your first dashboard in Boards quickstart](/observability/boards-quickstart)** + +--- + +## Analytics API + +**[Placeholder - Full detail section]** + +What's the difference between Analytics API and Insights API? What are Analytics API's key capabilities? When should users choose Analytics API vs Insights API vs Boards? + +--- + +## Insights API + +**[Placeholder - Full detail section]** + + + **Insights API is currently undocumented**. If you need flexible querying or programmatic alerting, contact Vapi support for guidance. + + +Should Insights API be formally documented? What's the relationship between Insights API and Analytics API? Is Insights API the primary alerting mechanism, or are built-in alerts planned? + +--- + +## Langfuse Integration + +**[Placeholder - Full detail section]** + +What are Langfuse's key capabilities for Vapi users? Does it provide real-time alerting? What metrics/traces does it capture? Are there setup requirements or limitations? + +--- + +## Webhook-to-External Monitoring + +**[Placeholder - Full detail section]** + +What are recommended third-party monitoring platforms for Vapi (Datadog, Braintrust, etc.)? Are there integration guides or examples? What webhook events are most useful for monitoring? + +--- + +## Alerting Strategies + +**[Placeholder - Full detail section]** + +Are built-in alerts on the roadmap? + +--- + +## Monitoring Best Practices + +**[Placeholder - Full detail section]** + +Topics to cover: +- Define baseline metrics +- Set alert thresholds (critical, warning, informational) +- Monitor continuously, not reactively +- Create role-specific dashboards + +--- + +## What you'll learn in detailed guides + +- [Boards quickstart](/observability/boards-quickstart) — Create custom dashboards in minutes +- (Planned) Langfuse integration guide — Set up real-time observability +- (Planned) Webhook monitoring guide — Export to external platforms +- (Planned) Analytics API reference — Programmatic monitoring + +--- + +## Key takeaway + +**Monitor continuously**. Production issues caught early (via dashboards or alerts) are easier to fix than issues discovered through customer complaints. + +Operating a voice AI system requires proactive monitoring. Set up visibility on day one of production launch. 
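
As a rough illustration of the Webhook-to-External pattern referenced above (the detailed section is still a placeholder), the sketch below receives an end-of-call webhook and forwards a handful of metrics to an external platform. The payload field names (`message.type`, `message.call`, cost, ended reason, structured output values) and the `sendToMonitoring` helper are assumptions for illustration only; verify the actual event shape against the webhook/server events reference before relying on it.

```typescript
import express from "express";

const app = express();
app.use(express.json());

// Hypothetical forwarder: swap in your Datadog/Grafana/warehouse client.
async function sendToMonitoring(metric: Record<string, unknown>): Promise<void> {
  console.log("metric", metric);
}

// Endpoint configured as the assistant's server/webhook URL.
// Field names below are assumptions to verify against the webhook reference.
app.post("/vapi/webhook", async (req, res) => {
  const message = req.body?.message;
  if (message?.type === "end-of-call-report") {
    await sendToMonitoring({
      callId: message.call?.id,
      endedReason: message.endedReason,
      costUsd: message.cost,
      // Hypothetical scalar Structured Output used as a success signal:
      success: message.structuredOutputs?.callSuccessful,
    });
  }
  res.status(200).send(); // acknowledge receipt promptly
});

app.listen(3000, () => console.log("Webhook listener on :3000"));
```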
+ +--- + +## Next steps + + + + Build your first monitoring dashboard + + + + Next stage: Use monitoring data to improve + + + + Return to observability framework + + diff --git a/fern/observability/optimization-workflows.mdx b/fern/observability/optimization-workflows.mdx new file mode 100644 index 000000000..063fc624a --- /dev/null +++ b/fern/observability/optimization-workflows.mdx @@ -0,0 +1,174 @@ +--- +title: Optimization workflows +subtitle: Use observability data to continuously improve your assistant +slug: observability/optimization-workflows +--- + +## What is optimization? + +**Optimization** is the continuous improvement loop: using observability data to refine prompts, improve tool calls, and enhance conversation flows. + +Unlike the previous stages (INSTRUMENT, TEST, EXTRACT, MONITOR), **OPTIMIZE is not a dedicated tool or feature** — it's a workflow that combines tools from all previous stages to drive systematic improvement. + +**The optimization mindset**: Voice AI quality improves through iteration, not perfection. The best teams: +- Start with "good enough" (not perfect) +- Deploy to production with instrumentation and monitoring +- Use real-world data to identify improvement opportunities +- Test changes before deploying +- Track impact systematically + +**Why optimization matters**: Without a systematic optimization workflow, teams either: +- ❌ Over-engineer before launch (trying to predict every edge case) +- ❌ React to problems ad-hoc (fixing symptoms, not root causes) +- ❌ Stagnate after launch (no process for continuous improvement) + +**The goal**: Establish a repeatable workflow that turns observability data into measurable improvements. + +--- + +## Optimization workflow at a glance + +| Stage | Tools Used | What you do | +|-------|-----------|-------------| +| **1. Detect patterns** | Boards, Insights API, Analytics API | Spot trends in monitoring dashboards (success rate dropping, cost increasing, etc.) | +| **2. Extract details** | Webhooks, Structured Outputs, Transcripts | Pull call data to understand WHY the pattern exists | +| **3. Form hypothesis** | Manual analysis | Identify root cause (e.g., "prompt doesn't handle edge case X") | +| **4. Make changes** | Assistant configuration | Update prompts, tools, routing logic based on hypothesis | +| **5. Test changes** | Evals, Simulations | Validate improvement before deploying to production | +| **6. Deploy** | API, Dashboard | Push updated assistant to production | +| **7. Verify** | Boards, Insights API | Track target metric to confirm improvement | + +This is a **continuous cycle**, not a one-time activity: + +``` +MONITOR → EXTRACT → Analyze → Revise → TEST → Deploy → MONITOR (repeat) +``` + +Confirm this optimization workflow accurately reflects how Vapi customers typically iterate on their assistants. Are there tools or stages we're missing? Should we emphasize certain steps more than others? + +--- + +## The optimization loop in detail + +**[Placeholder - Full detail sections]** + +### Step 1: Detect patterns from monitoring + +Placeholder for: How to use Boards/analytics to spot trends (success rate drops, cost spikes, etc.). Include example scenario. + +--- + +### Step 2: Extract detailed data + +Placeholder for: Methods for pulling call transcripts, structured outputs, tool call logs. Show how to filter/export data for analysis. + +--- + +### Step 3: Form a hypothesis + +Placeholder for: Common hypothesis patterns (prompt issues, tool description problems, routing logic, verbosity, etc.). 
Show example hypothesis formation process. + +--- + +### Step 4: Make targeted changes + +Placeholder for: How to revise prompts, update tool descriptions, refine conversation flows. Include before/after examples. + +--- + +### Step 5: Test before deploying + +Placeholder for: Creating Evals for specific failure cases, regression testing strategies. Show example test structure. + +--- + +### Step 6: Deploy + +Placeholder for: Deployment strategies (direct deploy, staged rollout, A/B testing). Include decision framework for choosing strategy. + +--- + +### Step 7: Verify improvement + +Placeholder for: Time windows for verification (immediate, 24h, 1 week), what to track, when to roll back. + +--- + +## Common optimization scenarios + +**[Placeholder - Table of common patterns, root causes, and optimization actions]** + +What are the most common optimization scenarios Vapi customers encounter? What issues drive the most improvement iterations? Are there voice-specific optimization patterns we should highlight? + +--- + +## Optimization best practices + +**[Placeholder - Full detail sections]** + +Topics to cover: +- Start with high-impact, low-effort changes +- Track improvement over time (optimization log) +- Don't optimize prematurely (wait for data) +- Make one change at a time (clear cause-and-effect) +- Maintain regression tests + +Should we include specific guidance on optimization cadence (weekly reviews, monthly deep dives, quarterly retrospectives)? + +--- + +## What you'll learn in detailed guides + +**Optimization is cross-functional** — it references tools from all previous stages: +- [Evals quickstart](/observability/evals-quickstart) — Test improvements before deploying +- [Boards quickstart](/observability/boards-quickstart) — Track metrics over time +- [Structured outputs quickstart](/assistants/structured-outputs-quickstart) — Extract failure data for analysis + +(Planned) Optimization playbook — Common scenarios and solutions +(Planned) Advanced optimization — A/B testing, staged rollouts, impact measurement + +--- + +## Key takeaway + +**Optimize continuously**. The best teams treat observability as a loop: instrument → test → deploy → monitor → identify improvements → repeat. Data-driven iteration beats guesswork. + +Start your optimization practice on day one. Don't wait until you have problems — establish the workflow while things are working, so you're ready when issues arise. + +--- + +## Next steps + + + + Set up monitoring to detect patterns + + + + Build tests to validate improvements + + + + Validate you're ready to optimize in production + + + + Return to observability framework + + From 98eb623ce961dc46cbb93d5151d6a357e8fee635 Mon Sep 17 00:00:00 2001 From: Rafa Date: Mon, 16 Feb 2026 12:52:21 -0500 Subject: [PATCH 7/8] docs(chore): add draft status indicators to observability guides Add internal-note banners indicating completion status for VAPI reviewers: Rough Draft (3 pages - content present, needs refinement): - observability-framework.mdx - instrumentation.mdx - testing-strategies.mdx Skeleton Draft (3 pages - structure only, detailed content pending): - production-readiness.mdx (iteration 3) - monitoring.mdx (iteration 2) - optimization-workflows.mdx (iteration 2) This helps reviewers calibrate expectations for which pages are ready for content review vs. structural/architectural review only. 
--- fern/observability/instrumentation.mdx | 2 ++ fern/observability/monitoring.mdx | 2 ++ fern/observability/observability-framework.mdx | 2 ++ fern/observability/optimization-workflows.mdx | 2 ++ fern/observability/production-readiness.mdx | 4 +++- fern/observability/testing-strategies.mdx | 2 ++ 6 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fern/observability/instrumentation.mdx b/fern/observability/instrumentation.mdx index 9f7863ca2..73be9ea2f 100644 --- a/fern/observability/instrumentation.mdx +++ b/fern/observability/instrumentation.mdx @@ -4,6 +4,8 @@ subtitle: Configure your assistant to capture operational and business metrics slug: observability/instrumentation --- +This page is in Rough Draft stage + ## What is instrumentation? **Instrumentation** means configuring your assistant to capture data during and after calls. Before you can test, monitor, or optimize, you need to define **what data to capture**. diff --git a/fern/observability/monitoring.mdx b/fern/observability/monitoring.mdx index 163a7ac57..405da1e54 100644 --- a/fern/observability/monitoring.mdx +++ b/fern/observability/monitoring.mdx @@ -4,6 +4,8 @@ subtitle: Visualize trends, track operational health, and ensure production reli slug: observability/monitoring --- +This page is in Skeleton Draft stage - structure and scope for review, detailed content to be developed in iteration 2 + ## What is monitoring and operating? **Monitoring & Operating** means running your voice AI system in production with continuous visibility into its health and performance. This stage answers critical operational questions: diff --git a/fern/observability/observability-framework.mdx b/fern/observability/observability-framework.mdx index aa6a4c4d4..c2be9a41b 100644 --- a/fern/observability/observability-framework.mdx +++ b/fern/observability/observability-framework.mdx @@ -4,6 +4,8 @@ subtitle: A systematic framework for building, testing, and improving voice AI a slug: observability/framework --- +This page is in Rough Draft stage + ## What is observability for voice AI? Observability for voice AI means **instrumenting your assistants to capture data**, **testing them before production**, **extracting insights from calls**, **monitoring operational health**, and **using that data to continuously improve**. diff --git a/fern/observability/optimization-workflows.mdx b/fern/observability/optimization-workflows.mdx index 063fc624a..775a94a5d 100644 --- a/fern/observability/optimization-workflows.mdx +++ b/fern/observability/optimization-workflows.mdx @@ -4,6 +4,8 @@ subtitle: Use observability data to continuously improve your assistant slug: observability/optimization-workflows --- +This page is in Skeleton Draft stage - structure and scope for review, detailed content to be developed in iteration 2 + ## What is optimization? **Optimization** is the continuous improvement loop: using observability data to refine prompts, improve tool calls, and enhance conversation flows. diff --git a/fern/observability/production-readiness.mdx b/fern/observability/production-readiness.mdx index 659d03197..a79fbf649 100644 --- a/fern/observability/production-readiness.mdx +++ b/fern/observability/production-readiness.mdx @@ -4,6 +4,8 @@ subtitle: Validate your voice AI assistant is ready for production deployment slug: observability/production-readiness --- +This page is in Skeleton Draft stage - structure and scope for review, detailed content to be developed in iteration 3 + ## What is production readiness? 
**Production readiness** means your voice assistant has been systematically validated across instrumentation, testing, extraction, and monitoring **before** you deploy it to handle real customer calls. @@ -16,7 +18,7 @@ Deploying without production readiness means: **This checklist helps you avoid those problems** by ensuring you've instrumented, tested, and validated before launch. - Does VAPI have internal prod readiness criteria we should align with? + Does VAPI have internal prod readiness criteria we should align with? --- diff --git a/fern/observability/testing-strategies.mdx b/fern/observability/testing-strategies.mdx index 101de212f..2b9c16fcf 100644 --- a/fern/observability/testing-strategies.mdx +++ b/fern/observability/testing-strategies.mdx @@ -4,6 +4,8 @@ subtitle: Validate your assistant works correctly before deploying to production slug: observability/testing-strategies --- +This page is in Rough Draft stage + ## Voice AI Testing Challenges **Testing** means validating your assistant works correctly **before deploying to production**. Voice AI testing prevents embarrassing failures, reduces production debugging costs, and builds confidence in your assistant. From 3d40c496ae00faa7e0c6ad3de8a47ac9bf887ff4 Mon Sep 17 00:00:00 2001 From: Rafa Date: Mon, 16 Feb 2026 14:11:00 -0500 Subject: [PATCH 8/8] observability(docs): adopt phase terminology across framework Changed "stage" to "phase" throughout observability framework to better reflect the non-linear, iterative nature of the model. Phases can be revisited and worked on concurrently, unlike sequential stages. Changes: - Framework page: Updated all section headings from "Stage X:" to "Phase" format, removed numbering from navigation cards, updated prose - All 5 phase guides: Added phase context to subtitle frontmatter (e.g., "This is the INSTRUMENT phase of the observability framework") - Removed numbered stage references throughout Also includes from earlier consistency review: - Framework: Added Test Suites deprecated label, Simulations pre-release - Instrumentation: Removed Call Analysis recommendation, reordered nav cards, added back-link, added inter-stage bridge, removed decorative emoji - Testing strategies: Added prerequisite reference to instrumentation - Extraction patterns: Removed decorative emojis from comparison table --- fern/observability/extraction-patterns.mdx | 10 +++--- fern/observability/instrumentation.mdx | 32 +++++++++++------ fern/observability/monitoring.mdx | 2 +- .../observability/observability-framework.mdx | 34 +++++++++---------- fern/observability/optimization-workflows.mdx | 3 +- fern/observability/testing-strategies.mdx | 4 ++- 6 files changed, 50 insertions(+), 35 deletions(-) diff --git a/fern/observability/extraction-patterns.mdx b/fern/observability/extraction-patterns.mdx index 9df861e8e..8fcffebda 100644 --- a/fern/observability/extraction-patterns.mdx +++ b/fern/observability/extraction-patterns.mdx @@ -1,9 +1,11 @@ --- title: Choosing your extraction pattern -subtitle: Understand the three architectural patterns for getting data out of Vapi +subtitle: Understand the three architectural patterns for getting data out of Vapi. This is the **EXTRACT phase** of the [observability framework](/observability/framework). 
slug: observability/extraction-patterns --- +This page is in Rough Draft stage + ## Why extraction is an architectural choice Unlike traditional observability platforms (DataDog, New Relic) where data flows automatically from instrumentation to monitoring, **Vapi requires you to choose how data gets extracted** for analysis. @@ -31,9 +33,9 @@ Vapi offers three architectural patterns for extracting observability data from | Pattern | Description | Engineering effort | Data richness | Typical users | |---------|-------------|-------------------|---------------|---------------| -| **Dashboard Native** | Use Vapi's built-in Boards with scalar Structured Outputs for real-time dashboards | ⚡ Minimal (no infrastructure) | Basic (scalar fields only) | Solo founders, non-technical teams, startups | -| **Webhook-to-External** | Build custom post-call processing that captures data via webhooks and exports to your data warehouse | 🛠️ High (requires backend infrastructure) | Rich (full object schemas, nested data) | Engineering teams, enterprises with existing data platforms | -| **Hybrid** | Combine both approaches - use Boards for operational metrics, webhooks for deep analysis | ⚙️ Medium (partial infrastructure) | Flexible (mix of scalar and object data) | Growing teams balancing simplicity and power | +| **Dashboard Native** | Use Vapi's built-in Boards with scalar Structured Outputs for real-time dashboards | Minimal (no infrastructure) | Basic (scalar fields only) | Solo founders, non-technical teams, startups | +| **Webhook-to-External** | Build custom post-call processing that captures data via webhooks and exports to your data warehouse | High (requires backend infrastructure) | Rich (full object schemas, nested data) | Engineering teams, enterprises with existing data platforms | +| **Hybrid** | Combine both approaches - use Boards for operational metrics, webhooks for deep analysis | Medium (partial infrastructure) | Flexible (mix of scalar and object data) | Growing teams balancing simplicity and power | **How to choose**: Start with Dashboard Native (fastest setup). Migrate to Hybrid or Webhook-to-External as your analytics needs grow or when you need features like Scorecard visualization or external BI tools. diff --git a/fern/observability/instrumentation.mdx b/fern/observability/instrumentation.mdx index 73be9ea2f..65d2bda9c 100644 --- a/fern/observability/instrumentation.mdx +++ b/fern/observability/instrumentation.mdx @@ -1,6 +1,6 @@ --- title: Instrumentation -subtitle: Configure your assistant to capture operational and business metrics +subtitle: Configure your assistant to capture operational and business metrics. This is the **INSTRUMENT phase** of the [observability framework](/observability/framework). slug: observability/instrumentation --- @@ -39,6 +39,8 @@ Think of instrumentation as installing sensors in your assistant: - What metrics will help you debug failures? - What data do you need for compliance or reporting? +The schemas you define here become the assertions your Evals validate in the [TEST stage](/observability/testing-strategies). + The "Instrumentation tools at a glance" section below shows how to configure custom instrumentation. 
--- @@ -48,7 +50,7 @@ The "Instrumentation tools at a glance" section below shows how to configure cus | Tool | What it does | Configuration | | ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------- | | **Built-in Instrumentation** | Automatic capture of call metadata (duration, cost, timestamps), transcripts, messages, tool calls, operational metrics. | ✅ Automatic - no configuration needed | -| **Structured Outputs** | AI-powered data extraction using JSON Schema. Define custom schemas for customer info, call outcomes, sentiment analysis, summaries. | ⚙️ Configure schemas on assistant | +| **Structured Outputs** | AI-powered data extraction using JSON Schema. Define custom schemas for customer info, call outcomes, sentiment analysis, summaries. | Configure schemas on assistant | | **Call Analysis** | Legacy feature for generating call summaries using AnalysisPlan configuration. | ⚠️ Legacy (use Structured Outputs for new implementations) | @@ -134,7 +136,7 @@ Built-in does NOT cover: **When NOT to use**: -- You only need simple call summaries (consider Call Analysis) +- You only need simple call summaries (Structured Outputs can generate simple summaries too, but may be overkill if you don't need structured data) - Built-in operational metrics are sufficient → **[Configure Structured Outputs: Quickstart](/assistants/structured-outputs-quickstart)** @@ -275,14 +277,6 @@ Start with basic business metrics (call success, customer info), then add qualit Set up your first custom instrumentation - - Choose your data extraction strategy - - Next stage: Validate your instrumented assistant + + + Choose your data extraction strategy + + + + Return to the observability maturity model + diff --git a/fern/observability/monitoring.mdx b/fern/observability/monitoring.mdx index 405da1e54..e1ea46d43 100644 --- a/fern/observability/monitoring.mdx +++ b/fern/observability/monitoring.mdx @@ -1,6 +1,6 @@ --- title: Monitoring & Operating -subtitle: Visualize trends, track operational health, and ensure production reliability +subtitle: Visualize trends, track operational health, and ensure production reliability. This is the **MONITOR phase** of the [observability framework](/observability/framework). slug: observability/monitoring --- diff --git a/fern/observability/observability-framework.mdx b/fern/observability/observability-framework.mdx index c2be9a41b..c615a1e13 100644 --- a/fern/observability/observability-framework.mdx +++ b/fern/observability/observability-framework.mdx @@ -40,7 +40,7 @@ If you're just experimenting or building a demo, you might not need the full fra ## The observability maturity model -Vapi's observability tools support a 5-stage progression: +Vapi's observability tools support a 5-phase progression: ``` ┌──────────────────────────────────────────────────────────────────┐ @@ -59,7 +59,7 @@ Vapi's observability tools support a 5-stage progression: ### This is a maturity progression, not a linear checklist -You don't complete one stage and never return to it. Observability is **continuous**: +You don't complete one phase and never return to it. Observability is **continuous**: - **Instrument** as you build new features - **Test** after every change @@ -69,17 +69,17 @@ You don't complete one stage and never return to it. 
Observability is **continuo **For teams just starting**: Begin with INSTRUMENT + TEST (validate before production). Add EXTRACT + MONITOR as you scale. OPTIMIZE becomes natural once you have data flowing. -**For experienced teams**: You're likely already monitoring production. This framework helps systematize pre-production testing (TEST stage) and formalize continuous improvement (OPTIMIZE stage). +**For experienced teams**: You're likely already monitoring production. This framework helps systematize pre-production testing (TEST phase) and formalize continuous improvement (OPTIMIZE phase). -Is "maturity model" the right framing? Should we emphasize iteration more explicitly? How do customer segments (startups vs enterprises) typically progress through these stages? +Is "maturity model" the right framing? Should we emphasize iteration more explicitly? How do customer segments (startups vs enterprises) typically progress through these phases? --- ## How this framework maps to Vapi tools -Each stage uses specific Vapi features. Here's a quick reference: +Each phase uses specific Vapi features. Here's a quick reference: -### Stage 1: INSTRUMENT +### INSTRUMENT Phase Configure your assistant to capture operational and business metrics. @@ -89,17 +89,17 @@ Configure your assistant to capture operational and business metrics. --- -### Stage 2: TEST +### TEST Phase Validate your assistant works correctly before production deployment. -**What you'll use**: Evals, Simulations, Test Suites +**What you'll use**: Evals, Simulations (Pre-release), Test Suites (⚠️ Deprecated) → **[Deep dive: Testing strategies](/observability/testing-strategies)** --- -### Stage 3: EXTRACT +### EXTRACT Phase Choose your data extraction pattern based on technical capability and analytics needs. @@ -109,7 +109,7 @@ Choose your data extraction pattern based on technical capability and analytics --- -### Stage 4: MONITOR +### MONITOR Phase Visualize trends, track operational health, and catch problems early. @@ -119,7 +119,7 @@ Visualize trends, track operational health, and catch problems early. --- -### Stage 5: OPTIMIZE +### OPTIMIZE Phase Use observability data to continuously improve your assistant. @@ -167,7 +167,7 @@ Most teams start with Dashboard Native (simple, no engineering required), add we ## Next steps -### Learn the framework stages +### Learn the framework phases - Stage 1: Configure data capture + Configure data capture - Stage 2: Validate before production + Validate before production - Stage 3: Choose your data pipeline + Choose your data pipeline - Stage 4: Track operational health + Track operational health - Stage 5: Continuously improve + Continuously improve diff --git a/fern/observability/optimization-workflows.mdx b/fern/observability/optimization-workflows.mdx index 775a94a5d..536935ebb 100644 --- a/fern/observability/optimization-workflows.mdx +++ b/fern/observability/optimization-workflows.mdx @@ -1,11 +1,12 @@ --- title: Optimization workflows -subtitle: Use observability data to continuously improve your assistant +subtitle: Use observability data to continuously improve your assistant. This is the **OPTIMIZE phase** of the [observability framework](/observability/framework). slug: observability/optimization-workflows --- This page is in Skeleton Draft stage - structure and scope for review, detailed content to be developed in iteration 2 + ## What is optimization? 
**Optimization** is the continuous improvement loop: using observability data to refine prompts, improve tool calls, and enhance conversation flows. diff --git a/fern/observability/testing-strategies.mdx b/fern/observability/testing-strategies.mdx index 2b9c16fcf..3c475c1c7 100644 --- a/fern/observability/testing-strategies.mdx +++ b/fern/observability/testing-strategies.mdx @@ -1,6 +1,6 @@ --- title: Testing strategies -subtitle: Validate your assistant works correctly before deploying to production +subtitle: Validate your assistant works correctly before deploying to production. This is the **TEST phase** of the [observability framework](/observability/framework). slug: observability/testing-strategies --- @@ -18,6 +18,8 @@ Unlike traditional software testing (unit tests, integration tests), voice AI te - **Edge cases** — How does the system handle interruptions, unclear requests, or unexpected inputs? - **Regression** — Do changes break existing functionality? +Testing assumes you've already instrumented your assistant with Structured Outputs (see [Instrumentation](/observability/instrumentation)). + What other specific validation and/or testing uniqueness have clients reported when working with voice AI testing? ---
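
Because testing builds directly on the Structured Outputs defined during instrumentation, a regression check can be as small as replaying expected outcomes against the fields extracted from test calls. The sketch below is tool-agnostic and purely illustrative: Vapi's Evals run this kind of assertion natively via the Dashboard or API, and the `callSuccessful` and `escalated` fields are hypothetical examples.

```typescript
// Tool-agnostic regression check: compare expected outcomes against the
// Structured Output fields extracted from test calls. Vapi Evals cover this
// natively; this sketch only illustrates the underlying idea.
interface ExpectedOutcome {
  callSuccessful: boolean; // hypothetical structured output fields
  escalated: boolean;
}

interface TestCallResult {
  name: string;
  structuredOutputs: ExpectedOutcome;
}

function runRegression(
  expectations: Record<string, ExpectedOutcome>,
  results: TestCallResult[],
): boolean {
  let allPassed = true;
  for (const [name, expected] of Object.entries(expectations)) {
    const actual = results.find((r) => r.name === name)?.structuredOutputs;
    const passed =
      actual !== undefined &&
      actual.callSuccessful === expected.callSuccessful &&
      actual.escalated === expected.escalated;
    console.log(`${passed ? "PASS" : "FAIL"}: ${name}`);
    if (!passed) allPassed = false;
  }
  return allPassed; // gate deployments on this (see production readiness gates)
}

// Example usage with two canned test calls:
const ok = runRegression(
  {
    "happy path booking": { callSuccessful: true, escalated: false },
    "unclear request": { callSuccessful: false, escalated: true },
  },
  [
    { name: "happy path booking", structuredOutputs: { callSuccessful: true, escalated: false } },
    { name: "unclear request", structuredOutputs: { callSuccessful: false, escalated: true } },
  ],
);
console.log(ok ? "All regression checks passed" : "Regression failures found");
```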