diff --git a/Cargo.lock b/Cargo.lock
index 509e14e5f..873c64925 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1109,8 +1109,12 @@ dependencies = [
"admin-client",
"amp-object-store",
"anyhow",
+ "async-trait",
"clap",
+ "common",
"console",
+ "datafusion",
+ "dataset-authoring",
"datasets-common",
"datasets-derived",
"dump",
@@ -4068,6 +4072,31 @@ dependencies = [
"unicode-width",
]
+[[package]]
+name = "dataset-authoring"
+version = "0.1.0"
+dependencies = [
+ "async-trait",
+ "common",
+ "datafusion",
+ "datasets-common",
+ "datasets-derived",
+ "dirs",
+ "flate2",
+ "minijinja",
+ "regex",
+ "semver 1.0.27",
+ "serde",
+ "serde_json",
+ "serde_json_canonicalizer 0.2.0",
+ "serde_yaml",
+ "tar",
+ "tempfile",
+ "thiserror 2.0.18",
+ "tokio",
+ "walkdir",
+]
+
[[package]]
name = "datasets-common"
version = "0.1.0"
@@ -4898,6 +4927,17 @@ dependencies = [
"winapi",
]
+[[package]]
+name = "filetime"
+version = "0.2.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "libredox",
+]
+
[[package]]
name = "find-msvc-tools"
version = "0.1.9"
@@ -6764,6 +6804,15 @@ version = "0.3.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
+[[package]]
+name = "minijinja"
+version = "2.15.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b479616bb6f0779fb0f3964246beda02d4b01144e1b0d5519616e012ccc2a245"
+dependencies = [
+ "serde",
+]
+
[[package]]
name = "minimal-lexical"
version = "0.2.1"
@@ -8033,7 +8082,7 @@ dependencies = [
"reqwest 0.12.28",
"serde",
"serde_json",
- "serde_json_canonicalizer",
+ "serde_json_canonicalizer 0.3.1",
"serde_yaml",
"sha2",
"spki",
@@ -9081,6 +9130,12 @@ version = "1.0.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a50f4cf475b65d88e057964e0e9bb1f0aa9bbb2036dc65c64596b42932536984"
+[[package]]
+name = "ryu-js"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6518fc26bced4d53678a22d6e423e9d8716377def84545fe328236e3af070e7f"
+
[[package]]
name = "ryu-js"
version = "1.0.2"
@@ -9369,13 +9424,24 @@ dependencies = [
"zmij",
]
+[[package]]
+name = "serde_json_canonicalizer"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8a4c2ea923e1d3d2c03bd1299b7061e015f3bd9dab5a47b34283df9d8ab36a1"
+dependencies = [
+ "ryu-js 0.2.2",
+ "serde",
+ "serde_json",
+]
+
[[package]]
name = "serde_json_canonicalizer"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f777f77aeef456e47e75c2a4b16804b15395be5b344e2094a54965143ef1c31"
dependencies = [
- "ryu-js",
+ "ryu-js 1.0.2",
"serde",
"serde_json",
]
@@ -12161,6 +12227,17 @@ version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
+[[package]]
+name = "tar"
+version = "0.4.44"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d863878d212c87a19c1a610eb53bb01fe12951c0501cf5a0d65f724914a667a"
+dependencies = [
+ "filetime",
+ "libc",
+ "xattr",
+]
+
[[package]]
name = "tempfile"
version = "3.24.0"
@@ -12260,12 +12337,15 @@ dependencies = [
"amp-providers-registry",
"ampctl",
"arrow-flight",
+ "async-trait",
"backon",
"clap",
"color-backtrace",
"common",
"controller",
"ctor",
+ "datafusion",
+ "dataset-authoring",
"datasets-common",
"datasets-derived",
"dotenvy",
@@ -14090,6 +14170,16 @@ dependencies = [
"time",
]
+[[package]]
+name = "xattr"
+version = "1.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156"
+dependencies = [
+ "libc",
+ "rustix 1.1.3",
+]
+
[[package]]
name = "yansi"
version = "1.0.1"
diff --git a/Cargo.toml b/Cargo.toml
index 0c68b4ade..fd39271ab 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -11,6 +11,7 @@ members = [
"crates/clients/flight",
"crates/core/common",
"crates/core/data-store",
+ "crates/core/dataset-authoring",
"crates/core/dataset-store",
"crates/core/datasets-registry",
"crates/core/providers-registry",
@@ -95,6 +96,8 @@ governor = "0.10.0"
hex = "0.4.3"
indoc = "2.0.6"
itertools = "0.14.0"
+minijinja = "2.7"
+serde_json_canonicalizer = "0.2"
opentelemetry = { version = "0.31", features = ["trace"] }
opentelemetry-otlp = { version = "0.31", features = [
"grpc-tonic",
@@ -140,12 +143,16 @@ uuid = { version = "1.11.0", features = ["v7"] }
nix = { version = "0.30.1", default-features = false, features = ["signal"] }
percent-encoding = "2.3"
which = "8.0.0"
+walkdir = "2"
zstd = "0.13.3"
+tar = "0.4"
+flate2 = "1.1"
# Datafusion and Arrow crates
arrow = "57"
arrow-flight = { version = "57", features = ["flight-sql-experimental"] }
datafusion = { version = "52", features = ["serde"] }
+dirs = "6.0"
datafusion-tracing = { version = "52" }
datafusion-datasource = { version = "52" }
object_store = { version = "0.12", features = ["aws", "gcp", "azure"] }
diff --git a/SPEC.md b/SPEC.md
new file mode 100644
index 000000000..326afdb0f
--- /dev/null
+++ b/SPEC.md
@@ -0,0 +1,367 @@
+# Dataset Authoring - IPC Schema + `tables/` Refactor Plan
+
+## Summary
+
+Refactor dataset-authoring to use Arrow IPC **file format** for schemas and move build artifacts under `tables/`. Rename the authoring config field from `models` to `tables`. Keep Amp core unchanged by converting **legacy manifests → canonical package** in the dataset-authoring adapter layer when fetching from the admin API, and converting **package → legacy manifest JSON** in memory for `ampctl dataset register --package`. No backwards compatibility with old `sql/` or `*.schema.json` outputs.
+
+## Decisions
+
+- **Schema format**: Arrow IPC **file format** (`.ipc`), not JSON.
+- **Build output layout**:
+ - `tables/<table>.sql` for derived datasets only
+ - `tables/<table>.ipc` for all tables
+ - `functions/<name>.js` unchanged
+- **Authoring config**: rename `models` → `tables` (default `tables`).
+- **Manifest table shape**:
+ - Derived table: `tables..sql` + `tables..ipc` + `network`
+ - Raw table: `tables..ipc` + `network` (no `sql` field)
+- **Cache**: `~/.amp/registry//` stores canonical package format.
+- **Interop**:
+ - **Admin API fetch**: legacy manifest JSON → canonical package (adapter).
+ - **Register**: package → legacy manifest JSON in memory (adapter).
+- **No backwards compatibility** with old `sql/` + `*.schema.json` outputs.
+
+---
+
+## Status
+
+| Phase | Description | Status |
+|-------|-------------|--------|
+| 1 | Arrow IPC Module | **Complete** |
+| 2 | Rename `models` → `tables` | **Complete** |
+| 3 | Build Output Layout (`sql/` → `tables/`) | **Complete** |
+| 4 | Schema Type Refactor | **Complete** |
+| 5 | Manifest Table Shape Changes | **Complete** |
+| 6 | Adapter Layer (Legacy ↔ Package) | **Complete** |
+| 7 | Cache Updates | **Complete** |
+| 8 | Documentation & Tests | **Complete** |
+
+---
+
+## Gap Analysis
+
+Based on codebase exploration (2026-02-04, verified via code search):
+
+### Currently Implemented
+
+| Component | Location | Details |
+|-----------|----------|---------|
+| Config parsing | `config.rs:87-118` | `AmpYaml` with `models: PathBuf` field, default via `default_models_dir()` returning `"models"` |
+| Model discovery | `discovery.rs:97-162` | `discover_models()` scans `/**/*.sql`, returns `BTreeMap` |
+| Build output | `manifest.rs:406,415` | `sql/.sql` and `sql/.schema.json` |
+| Schema files | `arrow_json.rs` | JSON format using `ArrowSchema` from `datasets_common::manifest` |
+| Package assembly | `package.rs:175-184` | Includes `sql/` and `functions/` directories |
+| Cache | `cache.rs:131-133` | Stores `/manifest.json` only (no SQL/schema files) |
+| Bridge | `bridge.rs` | Converts `AuthoringManifest` → legacy runtime format |
+| Jinja | `jinja.rs` | `ref`, `source`, `var`, `env_var`, `this` template helpers |
+| SQL validation | `query.rs` | SELECT-only, incremental mode constraints |
+| Lockfile | `lockfile.rs` | `amp.lock` for reproducible dependency resolution |
+| Validation | `validation.rs:105` | `discovered_models: BTreeMap` |
+| TableDef | `manifest.rs:168-176` | Has `sql: FileRef`, `schema: FileRef`, `network: NetworkId` |
+| Playground | `playground/` | Uses `models/` dir, builds to `build/sql/` |
+
+### Not Yet Implemented
+
+| Feature | Current State | Target State |
+|---------|---------------|--------------|
+| Arrow IPC I/O | `arrow_ipc.rs` module with `write_ipc_schema()`, `read_ipc_schema()` | **Complete** |
+| Build output dir | `sql/` | `tables/` |
+| Schema format | `.schema.json` (JSON) | `.ipc` (Arrow IPC file) |
+| Config field | `models:` (default `"models"`) | `tables:` (default `"tables"`) |
+| Raw table support | `sql` field required | `sql: Option<FileRef>` |
+| Cache format | `manifest.json` only | Full package: `manifest.json` + `tables/` + `functions/` |
+| Fetch adapter | N/A | Legacy manifest → canonical package conversion |
+| Register adapter | N/A | Package → legacy manifest JSON for API |
+
+---
+
+## Tasks
+
+### Phase 1: Arrow IPC Module (Foundational)
+
+**Files**: `arrow_ipc.rs` (new), `lib.rs` (1 line)
+
+**1.1) Create `arrow_ipc.rs` module**
+- [x] Add new module `crates/core/dataset-authoring/src/arrow_ipc.rs`
+- [x] Implement `write_ipc_schema(schema: &SchemaRef, path: &Path) -> Result<()>`
+ - Uses Arrow IPC FileWriter with schema-only (no record batches)
+- [x] Implement `read_ipc_schema(path: &Path) -> Result<SchemaRef>`
+ - Uses Arrow IPC FileReader to read schema metadata
+- [x] Add comprehensive tests for round-trip serialization
+- [x] Export from `lib.rs`
+
+**Acceptance criteria**: Can write Arrow `Schema` to `.ipc` file and read it back losslessly. **VERIFIED**
+
+---
+
+### Phase 2: Rename `models` → `tables` (Config Change)
+
+**Files**: `config.rs`, `discovery.rs`, `validation.rs`, CLI commands, integration tests
+
+**2.1) Update `config.rs`**
+- [x] Rename field `models: PathBuf` to `tables: PathBuf` (line 109 in `AmpYaml`)
+- [x] Update default function `default_models_dir()` → `default_tables_dir()` returning `"tables"` (line 156-158)
+- [x] Update `AmpYamlV1` struct similarly (line 141)
+- [x] Update `validate()` to reference "tables directory" instead of "models directory" (line 208)
+- [x] Update all tests using `models` field (lines 507-852 test module)
+
+**2.2) Update `discovery.rs`**
+- [x] Rename function `discover_models()` → `discover_tables()` (line 97)
+- [x] Update variable name `models` → `tables` in function body (line 108)
+- [x] Update `DiscoveryError` variants: `DuplicateModelName` → `DuplicateTableName`, etc.
+- [x] Update `DiscoveredModel` → `DiscoveredTable` struct
+- [x] Update documentation and error messages throughout
+
+**2.3) Update call sites**
+- [x] Update `validation.rs` field `discovered_models` → `discovered_tables` (line 105 in `ValidationResult`)
+- [x] Update `validate_network_inference()` parameters (line 591)
+- [x] Update all callers of discovery functions in validation/build flows
+- [x] Update CLI commands (check.rs, build.rs) to use new field/function names
+- [x] Update integration tests (it_dataset_authoring.rs) with new paths and imports
+
+**Acceptance criteria**: `amp.yaml` accepts `tables:` field (with `tables` as default). `models:` is no longer recognized. **VERIFIED**
+
+---
+
+### Phase 3: Build Output Layout (`sql/` → `tables/`)
+
+**Files**: `manifest.rs`, `package.rs`, `bridge.rs`, `arrow_json.rs`, CLI help text
+
+**3.1) Update `manifest.rs` output paths**
+- [x] Change SQL file path from `sql/.sql` to `tables/.sql` (line 406)
+- [x] Change schema file path from `sql/.schema.json` to `tables/.ipc` (line 415)
+- [x] Update `sql_dir` parameter naming throughout to `tables_dir`
+- [x] Update `ManifestBuilder` field `sql_dir: &'a Path` → `tables_dir: &'a Path` (line 286)
+- [x] Update `ManifestBuilder::new()` parameter (line 306)
+- [x] Update all test fixtures using `sql/` paths (lines 819-934 tests)
+
+**3.2) Update `package.rs`**
+- [x] Change directory inclusion from `sql/` to `tables/` (lines 175-178)
+- [x] Update `from_directory()` to look for `tables/` instead of `sql/`
+- [x] Update all test fixtures and assertions (lines 613-651 tests)
+
+**3.3) Update `bridge.rs`**
+- [x] Update all path references from `sql/` to `tables/`
+- [x] Update test fixtures to use IPC schema files (completed in Phase 6)
+
+**3.4) Update `arrow_json.rs` → deprecate or remove**
+- [x] After IPC is working, remove JSON schema write calls from build flow
+- [x] Keep `arrow_json.rs` only if needed for legacy adapter layer in Phase 6 (confirmed needed)
+
+**Acceptance criteria**: `ampctl dataset build` produces `tables/.sql` + `tables/.ipc`, no `sql/` directory. **VERIFIED**
+
+---
+
+### Phase 4: Schema Type Refactor (Arrow-native)
+
+**Files**: `schema.rs`, `validation.rs`, `dependency_manifest.rs`, build commands
+
+**4.1) Update schema inference in `schema.rs`**
+- [x] Change return type from `TableSchema` to Arrow `SchemaRef`
+- [x] Remove intermediate `TableSchema`/`ArrowSchema` conversions
+- [x] Update `SchemaContext::infer_schema()` to return `SchemaRef` directly
+
+**4.2) Update build pipeline**
+- [x] Write inferred schemas using `arrow_ipc::write_ipc_schema()` instead of JSON
+- [x] Update validation output to use Arrow types (`ValidatedTable.schema: SchemaRef`)
+- [x] Build command now uses `SchemaRef` directly without conversion
+
+**4.3) Update `dependency_manifest.rs`**
+- [x] `DependencyTable.schema` stays as `TableSchema` for JSON serialization (needed for legacy adapter in Phase 6)
+- [x] Added `From<&SchemaRef> for ArrowSchema` to enable conversion from native to serializable format
+
+**Acceptance criteria**: No `TableSchema`/`ArrowSchema` usage in authoring pipeline (only in adapter layer). **VERIFIED**
+
+---
+
+### Phase 5: Manifest Table Shape Changes
+
+**Files**: `manifest.rs` (TableDef struct and builder)
+
+**5.1) Update `TableDef` in `manifest.rs`**
+- [x] Change `schema: FileRef` to `ipc: FileRef` (line 173)
+- [x] Make `sql: FileRef` optional: `sql: Option<FileRef>` (line 171)
+- [x] Add `#[serde(skip_serializing_if = "Option::is_none")]` to `sql` field
+- [x] Update `ManifestError::SchemaFileRef` → `ManifestError::IpcFileRef` (lines 78-87)
+
+**5.2) Update manifest builder**
+- [x] Update `build_tables()` to create `TableDef` with optional `sql` (lines 401-438)
+- [x] Handle derived tables: write both `sql: Some(...)` and `ipc: ...`
+- [x] Prepare for raw tables: `sql: None`, only `ipc` field (future support)
+
+**5.3) Update tests**
+- [x] Update `table_def_serializes_correctly` test (lines 727-753)
+- [x] Add test for optional SQL field serialization
+- [x] Add test for raw table (no SQL) serialization
+
+**Acceptance criteria**: Manifest JSON has `"ipc"` field instead of `"schema"`. SQL field can be optional.
+
+---
+
+### Phase 6: Adapter Layer - Legacy ↔ Package
+
+**Files**: `bridge.rs` (extend), `cache.rs` (adapter calls), `resolver.rs` (use adapter)
+
+**6.1) Admin API fetch adapter (legacy → package)**
+- [x] Create adapter function to convert legacy manifest JSON to canonical package format
+- [x] When fetching from admin API:
+ - Parse legacy `manifest.json`
+ - Extract inline SQL content → write to `tables/.sql`
+ - Convert inline schema JSON → write to `tables/.ipc`
+ - Copy function sources → `functions/`
+ - Write canonical `manifest.json` with file refs
+- [x] Store canonical package in cache directory (via `LegacyAdapter` writing to target dir)
+
+**6.2) Register adapter (package → legacy)**
+- [x] Create adapter function to convert package format to legacy manifest JSON
+- [x] When registering via `--package`:
+ - Read `tables/.ipc` → convert to legacy schema JSON
+ - Read `tables/.sql` content
+ - Read `functions/` sources
+ - Build legacy manifest JSON with inline content
+- [x] Upload legacy manifest to admin API (existing functionality via `LegacyBridge::to_json()`)
+
+**6.3) Constrain legacy parsing**
+- [x] Ensure all legacy JSON schema parsing is confined to adapter layer
+- [x] Update resolver to use LegacyAdapter for derived datasets (kind="manifest")
+- [x] Raw datasets (evm-rpc, firehose, etc.) parsed directly into DependencyManifest
+
+**Acceptance criteria**:
+- Fetching legacy manifests populates cache with canonical package format.
+- Registering a package produces valid legacy manifest JSON for the API.
+
+---
+
+### Phase 7: Cache Updates
+
+**Files**: `cache.rs`, `resolver.rs`, `dependency_manifest.rs`
+
+**7.1) Update `cache.rs`**
+- [x] Cache structure stores full package format (via LegacyAdapter):
+ - `/manifest.json` (DependencyManifest)
+ - `/tables/.sql`
+ - `/tables/.ipc`
+ - `/functions/.js`
+- [x] Add `CachedPackage` struct with methods:
+ - `manifest()` - returns `&DependencyManifest`
+ - `read_sql(table_name)` - reads SQL content
+ - `read_schema(table_name)` - reads Arrow SchemaRef from IPC file
+ - `read_function(filename)` - reads function source
+ - `has_sql()`, `has_ipc_schema()` - existence checks
+- [x] Add `Cache::get_package()` returning `Option<CachedPackage>`
+- [x] Keep existing `Cache::get()` for backward compatibility (returns `DependencyManifest`)
+- [x] Add error variants for IPC/SQL/function file reads
+
+**7.2) Update `resolver.rs`** (No changes needed)
+- [x] Resolver already works correctly:
+ - Uses `LegacyAdapter` to write full package to cache directory
+ - Uses `Cache::put()` to store `DependencyManifest`
+ - Uses `Cache::get()` to retrieve `DependencyManifest` for resolution
+ - Consumers can use `Cache::get_package()` when IPC access is needed
+
+**7.3) Update `dependency_manifest.rs`** (Deferred)
+- [x] `DependencyTable.schema` keeps `TableSchema` (JSON-serializable) for compatibility
+- [x] Consumers needing Arrow SchemaRef use `CachedPackage::read_schema()`
+- Note: Lazy loading from IPC considered but not needed - current approach works
+
+**Acceptance criteria**: Cache stores and retrieves full canonical packages, not just manifest JSON. **VERIFIED**
+
+---
+
+### Phase 8: Documentation & Tests
+
+**Files**: `docs/features/dataset-authoring.md`, `tests/src/tests/it_dataset_authoring.rs`, CLI help markdown files, `playground/`
+
+**8.1) Update `docs/features/dataset-authoring.md`**
+- [x] Replace all `models/` references with `tables/` (currently 11 occurrences)
+- [x] Update build output structure section (currently shows `sql/` layout)
+- [x] Update `amp.yaml` schema documentation (config field `models` → `tables`)
+- [x] Document `.ipc` schema format (replace `.schema.json` references)
+- [x] Update CLI examples
+
+**8.2) Update tests**
+- [x] Update all fixture paths from `sql/` to `tables/`
+- [x] Update all `.schema.json` references to `.ipc`
+- [x] Add IPC round-trip tests (already exist in arrow_ipc.rs)
+- [x] Add adapter layer tests for legacy conversion (already exist in bridge.rs)
+- [x] Ensure coverage of new table shape (optional sql) (already exist in manifest.rs)
+
+**8.3) Update CLI help text**
+- [x] Update `ampctl dataset` subcommand help for `tables/` directory (package.rs docstrings updated)
+
+**8.4) Update playground sample**
+- [x] Delete `playground/build/` directory (gitignored, not tracked)
+- [x] Rename `playground/models/` to `playground/tables/`
+- [x] Update `playground/amp.yaml` to use `tables:` field (relies on new default)
+- [x] Regenerate `playground/build/` with new structure (gitignored)
+
+**8.5) Update module docstrings**
+- [x] Update `lib.rs` docstring (lines 1-29) mentioning `models/` and `sql/`
+- [x] Update `manifest.rs` docstring (lines 1-41) with example JSON
+
+**Acceptance criteria**: Docs, tests, and samples are consistent with new implementation. **VERIFIED**
+
+---
+
+## File Annotations
+
+Quick reference for key files and line numbers (verified 2026-02-04):
+
+| File | Key Lines | Purpose |
+|------|-----------|---------|
+| `config.rs` | 87-118, 156-158 | `AmpYaml` struct, `default_models_dir()` |
+| `discovery.rs` | 97-162 | `discover_models()` function |
+| `manifest.rs` | 168-176, 282-292 | `TableDef` struct, `ManifestBuilder` |
+| `package.rs` | 164-187 | `from_directory()` reads `sql/` and `functions/` |
+| `cache.rs` | 131-133, 145-163, 173-200 | `manifest_path()`, `get()`, `put()` |
+| `arrow_json.rs` | 74-104 | `write_schema_file()`, `read_schema_file()` |
+| `validation.rs` | 105, 429-470, 591 | `discovered_models` field and usage |
+| `lib.rs` | 1-29 | Module docstring with workflow description |
+| `files.rs` | docstrings, tests | Path examples use `sql/` throughout |
+| `bridge.rs` | tests | Test fixtures use `sql/` paths extensively |
+| `playground/amp.yaml` | 1-8 | Sample config using `models` default |
+
+---
+
+## Implementation Order
+
+**Recommended sequence** (minimizes rework):
+
+1. **Phase 1** - Arrow IPC module (no dependencies, foundational)
+2. **Phase 5.1** - TableDef field rename (`schema` → `ipc`, optional `sql`)
+3. **Phase 3** - Build output layout change (`sql/` → `tables/`)
+4. **Phase 2** - Config rename (`models` → `tables`)
+5. **Phase 4** - Schema type refactor (use Arrow-native types)
+6. **Phase 6** - Adapter layer (legacy conversion)
+7. **Phase 7** - Cache format update
+8. **Phase 8** - Documentation and tests
+
+Each phase should be completable in one commit/PR.
+
+---
+
+## Answered Questions
+
+1. **Deprecation period for `models`?** - No backwards compatibility. No deprecation period. None of this code is released yet.
+2. **Cache migration?** - No migration. Clear old cache entries. No backwards compatibility. None of this code is released yet.
+3. **Raw table authoring?** - Raw datasets are currently defined by extractor code. Eventually this will change and raw datasets will also have their table schemas declared and discoverable in a registry. But this is out of scope for this plan.
+
+---
+
+## Blockers
+
+None identified. All dependencies (Arrow, IPC support) are already available in the workspace.
+- Arrow IPC verified in use at: `crates/services/server/src/flight.rs:595`, `crates/clients/flight/src/store/mod.rs:195`
+
+---
+
+## Next Steps
+
+All phases complete. The refactor is done:
+- Arrow IPC schema format in use
+- `tables/` directory layout
+- `tables:` config field (with `tables` default)
+- Legacy adapter layer for API interop
+- Full package caching
+
+Ready for review and merge.
diff --git a/crates/bin/ampctl/Cargo.toml b/crates/bin/ampctl/Cargo.toml
index 10c1c4668..979f593b1 100644
--- a/crates/bin/ampctl/Cargo.toml
+++ b/crates/bin/ampctl/Cargo.toml
@@ -17,8 +17,12 @@ path = "src/main.rs"
admin-client = { path = "../../clients/admin" }
amp-object-store = { path = "../../core/object-store" }
anyhow.workspace = true
+async-trait.workspace = true
clap.workspace = true
+common = { path = "../../core/common" }
console = "0.16.1"
+datafusion.workspace = true
+dataset-authoring = { path = "../../core/dataset-authoring" }
datasets-common = { path = "../../core/datasets-common" }
datasets-derived = { path = "../../core/datasets-derived" }
dump = { path = "../../core/dump" }
diff --git a/crates/bin/ampctl/src/cmd/dataset.rs b/crates/bin/ampctl/src/cmd/dataset.rs
index 320de3084..159b3c5f8 100644
--- a/crates/bin/ampctl/src/cmd/dataset.rs
+++ b/crates/bin/ampctl/src/cmd/dataset.rs
@@ -1,9 +1,13 @@
//! Dataset management commands
+mod authoring;
+pub mod build;
+pub mod check;
pub mod deploy;
pub mod inspect;
pub mod list;
pub mod manifest;
+pub mod package;
pub mod register;
pub mod restore;
pub mod versions;
@@ -11,6 +15,28 @@ pub mod versions;
/// Dataset management subcommands.
#[derive(Debug, clap::Subcommand)]
pub enum Commands {
+ /// Build a dataset from amp.yaml configuration
+ ///
+ /// Parses the amp.yaml configuration, resolves dependencies, renders
+ /// Jinja SQL templates, validates SELECT statements, infers schemas,
+ /// and writes output files to the target directory.
+ Build(build::Args),
+
+ /// Validate dataset authoring inputs without building artifacts
+ ///
+ /// Parses the amp.yaml/amp.yml configuration, resolves dependencies,
+ /// renders Jinja SQL templates, validates SELECT statements, and
+ /// infers schemas without writing output files.
+ #[command(after_help = include_str!("dataset/check__after_help.md"))]
+ Check(check::Args),
+
+ /// Package built dataset artifacts into a .tgz archive
+ ///
+ /// Creates a deterministic archive containing manifest.json, SQL files,
+ /// schemas, and function sources for distribution or deployment.
+ #[command(alias = "pkg")]
+ Package(package::Args),
+
/// Deploy a dataset to start syncing blockchain data
///
/// Deploys a dataset version by scheduling a data extraction job via the
@@ -59,6 +85,9 @@ pub enum Commands {
/// Execute the dataset command with the given subcommand.
pub async fn run(command: Commands) -> anyhow::Result<()> {
match command {
+ Commands::Build(args) => build::run(args).await?,
+ Commands::Check(args) => check::run(args).await?,
+ Commands::Package(args) => package::run(args).await?,
Commands::Deploy(args) => deploy::run(args).await?,
Commands::Register(args) => register::run(args).await?,
Commands::List(args) => list::run(args).await?,
diff --git a/crates/bin/ampctl/src/cmd/dataset/authoring.rs b/crates/bin/ampctl/src/cmd/dataset/authoring.rs
new file mode 100644
index 000000000..c68baabc7
--- /dev/null
+++ b/crates/bin/ampctl/src/cmd/dataset/authoring.rs
@@ -0,0 +1,140 @@
+//! Shared helpers for dataset authoring commands.
+
+use std::{
+ collections::BTreeMap,
+ path::{Path, PathBuf},
+ sync::Arc,
+};
+
+use async_trait::async_trait;
+use dataset_authoring::{
+ lockfile::{Lockfile, LockfileError, RootInfo},
+ resolver::{ManifestFetcher, ResolveError},
+};
+use datasets_common::{hash::Hash, reference::Reference};
+use datasets_derived::deps::{DepAlias, DepReference, HashOrVersion};
+
+use crate::args::GlobalArgs;
+
+/// Split a `NAME=VALUE` argument at its first `=` into a `(name, value)` pair.
+pub(super) fn parse_var(s: &str) -> Result<(String, String), String> {
+    match s.split_once('=') {
+        Some((name, value)) => Ok((name.to_string(), value.to_string())),
+        None => Err(format!("invalid variable format: expected NAME=VALUE, got '{s}'")),
+    }
+}
+
+pub(super) fn dependencies_have_version_refs( // Returns true when at least one dependency's revision is a `HashOrVersion::Version`.
+ dependencies: &BTreeMap,
+) -> bool {
+ dependencies
+ .values() // only the references matter here; the map keys are ignored
+ .any(|reference| matches!(reference.revision(), HashOrVersion::Version(_)))
+}
+
+pub(super) fn load_lockfile_for_locked( // Reads the lockfile and verifies it against `config` before returning it.
+ lockfile_path: &Path,
+ config: &dataset_authoring::config::AmpYaml,
+) -> Result {
+ if !lockfile_path.exists() { // a missing lockfile is reported as `LockfileRequired` instead of a read failure
+ return Err(LockfileLoadError::LockfileRequired);
+ }
+
+ let lockfile = Lockfile::read(lockfile_path).map_err(LockfileLoadError::Lockfile)?;
+ let root = RootInfo { // root identity cloned from the config's namespace, name and version
+ namespace: config.namespace.clone(),
+ name: config.name.clone(),
+ version: config.version.clone(),
+ };
+ lockfile // errors from both verification calls are wrapped in `LockfileLoadError::Lockfile`
+ .verify_root(&root)
+ .map_err(LockfileLoadError::Lockfile)?;
+ lockfile
+ .verify_dependencies(&config.dependencies)
+ .map_err(LockfileLoadError::Lockfile)?;
+
+ Ok(lockfile)
+}
+
+pub(super) fn load_lockfile_for_offline( // Reads the lockfile as-is; no root or dependency verification is performed here.
+ lockfile_path: &Path,
+) -> Result {
+ if !lockfile_path.exists() { // the error carries the path that was looked up
+ return Err(LockfileLoadError::OfflineLockfileRequired(
+ lockfile_path.to_path_buf(),
+ ));
+ }
+
+ Lockfile::read(lockfile_path).map_err(LockfileLoadError::Lockfile) // errors from `Lockfile::read` are wrapped in `LockfileLoadError::Lockfile`
+}
+
+/// Manifest fetcher that goes through the admin API client.
+pub(super) struct AdminApiFetcher {
+ client: Arc, // shared handle to the client returned by `GlobalArgs::build_client`
+}
+
+impl AdminApiFetcher {
+ pub(super) fn new(global: &GlobalArgs) -> Result { // Builds a fetcher whose client comes from `global`.
+ let client = global.build_client()?; // client construction errors propagate to the caller via `?`
+ Ok(Self {
+ client: Arc::new(client),
+ })
+ }
+}
+
+#[async_trait]
+impl ManifestFetcher for AdminApiFetcher {
+ async fn fetch_by_version(
+ &self,
+ reference: &DepReference,
+ ) -> Result {
+ // Convert DepReference to Reference for the datasets client
+ let ref_str = reference.to_string();
+ let datasets_ref: Reference = ref_str.parse().map_err(|e| ResolveError::FetchError {
+ reference: ref_str.clone(),
+ message: format!("invalid reference format: {e}"),
+ })?;
+
+ // Use the datasets client to fetch manifest by reference
+ let result = self
+ .client
+ .datasets()
+ .get_manifest(&datasets_ref)
+ .await
+ .map_err(|e| ResolveError::FetchError {
+ reference: ref_str.clone(),
+ message: e.to_string(),
+ })?;
+
+ result.ok_or(ResolveError::NotFound { reference: ref_str })
+ }
+
+ async fn fetch_by_hash(&self, hash: &Hash) -> Result