diff --git a/manifest.json b/manifest.json index 74657f0..3bccc50 100644 --- a/manifest.json +++ b/manifest.json @@ -1,12 +1,25 @@ { "version": "2", - "updated_at": "2026-03-31T18:36:08Z", + "updated_at": "2026-03-31T11:40:34Z", "skills": { + "databricks": { + "version": "0.1.0", + "description": "Core Databricks skill for CLI, auth, and data exploration", + "experimental": false, + "updated_at": "2026-03-31T11:37:22Z", + "files": [ + "SKILL.md", + "data-exploration.md", + "databricks-cli-auth.md", + "databricks-cli-install.md", + "declarative-automation-bundles.md" + ] + }, "databricks-apps": { "version": "0.1.1", "description": "Databricks Apps development and deployment", "experimental": false, - "updated_at": "2026-03-31T18:35:18Z", + "updated_at": "2026-03-31T11:37:22Z", "files": [ "SKILL.md", "references/appkit/appkit-sdk.md", @@ -14,30 +27,19 @@ "references/appkit/lakebase.md", "references/appkit/overview.md", "references/appkit/sql-queries.md", + "references/appkit/proto-contracts.md", + "references/appkit/proto-first.md", "references/appkit/trpc.md", "references/other-frameworks.md", "references/platform-guide.md", "references/testing.md" ] }, - "databricks-core": { - "version": "0.1.0", - "description": "Core Databricks skill for CLI, auth, and data exploration", - "experimental": false, - "updated_at": "2026-03-31T18:34:21Z", - "files": [ - "SKILL.md", - "data-exploration.md", - "databricks-cli-auth.md", - "databricks-cli-install.md", - "declarative-automation-bundles.md" - ] - }, "databricks-jobs": { "version": "0.1.0", "description": "Databricks Jobs orchestration and scheduling", "experimental": false, - "updated_at": "2026-03-31T18:35:12Z", + "updated_at": "2026-03-31T11:37:22Z", "files": [ "SKILL.md" ] @@ -46,7 +48,7 @@ "version": "0.1.0", "description": "Databricks Lakebase database development", "experimental": false, - "updated_at": "2026-03-31T18:34:30Z", + "updated_at": "2026-03-31T11:37:22Z", "files": [ "SKILL.md" ] @@ -55,7 +57,7 @@ 
"version": "0.1.0", "description": "Databricks Pipelines (DLT) for ETL and streaming", "experimental": false, - "updated_at": "2026-03-31T18:35:15Z", + "updated_at": "2026-03-31T11:37:22Z", "files": [ "SKILL.md", "references/auto-cdc-python.md", diff --git a/scripts/generate_manifest.py b/scripts/generate_manifest.py index 8fafdd4..5495899 100644 --- a/scripts/generate_manifest.py +++ b/scripts/generate_manifest.py @@ -73,6 +73,10 @@ def get_skill_updated_at(skill_path: Path) -> str: "description": "Databricks Pipelines (DLT) for ETL and streaming", "experimental": False, }, + "databricks-proto-first": { + "description": "Proto-first schema design for Databricks apps", + "experimental": False, + }, } diff --git a/skills/databricks-apps/SKILL.md b/skills/databricks-apps/SKILL.md index b56a9c6..780c97d 100644 --- a/skills/databricks-apps/SKILL.md +++ b/skills/databricks-apps/SKILL.md @@ -23,6 +23,7 @@ Build apps that deploy to Databricks Apps platform. | Using `useAnalyticsQuery` | [AppKit SDK](references/appkit/appkit-sdk.md) | | Adding API endpoints | [tRPC Guide](references/appkit/trpc.md) | | Using Lakebase (OLTP database) | [Lakebase Guide](references/appkit/lakebase.md) | +| Typed data contracts (proto-first design) | [Proto-First Guide](references/appkit/proto-first.md) and [Plugin Contracts](references/appkit/proto-contracts.md) | | Platform rules (permissions, deployment, limits) | [Platform Guide](references/platform-guide.md) — READ for ALL apps including AppKit | | Non-AppKit app (Streamlit, FastAPI, Flask, Gradio, Next.js, etc.) | [Other Frameworks](references/other-frameworks.md) | diff --git a/skills/databricks-apps/references/appkit/proto-contracts.md b/skills/databricks-apps/references/appkit/proto-contracts.md new file mode 100644 index 0000000..35f42e6 --- /dev/null +++ b/skills/databricks-apps/references/appkit/proto-contracts.md @@ -0,0 +1,201 @@ +# Plugin Contract Reference + +Concrete proto↔plugin mappings for the three core AppKit plugins. 
+ +## Files Plugin Contract + +**Plugin manifest**: `files/manifest.json` +**Resource**: UC Volume with `WRITE_VOLUME` permission +**Env**: `DATABRICKS_VOLUME_FILES` for volume path + +### Boundary: What the files plugin owns + +The files plugin is the ONLY module that touches UC Volumes. Other modules +interact with files through typed proto messages, never raw paths. + +``` +┌─────────────┐ UploadRequest ┌──────────────┐ +│ api module │ ──────────────────→ │ files plugin │ +│ │ ←────────────────── │ │ +│ │ StoredArtifact │ UC Volumes │ +└─────────────┘ └──────────────┘ +``` + +### Proto → Plugin Method Mapping + +| Proto Message | Plugin Method | Direction | +|---------------|---------------|-----------| +| `UploadRequest` | `files.upload(path, content, opts)` | IN | +| `StoredArtifact` | Return type of upload/getInfo | OUT | +| `VolumeLayout` | `files.config.volumePath` + conventions | CONFIG | + +### Volume Path Convention (from VolumeLayout proto) + +``` +/Volumes/{catalog}/{schema}/{volume}/ +├── uploads/ # User uploads (UploadRequest.destination_path) +├── results/ # Computed outputs (StoredArtifact) +│ └── {run_id}/ +│ ├── output.proto.bin # Binary proto serialization +│ └── output.json # JSON for debugging +└── artifacts/ # Build artifacts, archives + └── {app_name}/ + └── {version}/ +``` + +### Config ↔ Proto Mapping + +| manifest.json field | Proto field | Notes | +|---------------------|-------------|-------| +| `config.timeout` (30000) | Not in proto | Plugin-internal config | +| `config.maxUploadSize` (5GB) | `UploadRequest.content` max size | Validation constraint | +| `resources.path` env | `VolumeLayout.root` | Runtime injection | + +--- + +## Lakebase Plugin Contract + +**Plugin manifest**: `lakebase/manifest.json` +**Resource**: Postgres with `CAN_CONNECT_AND_CREATE` permission +**Env**: `PGHOST`, `PGDATABASE`, `PGPORT`, `PGSSLMODE`, `LAKEBASE_ENDPOINT` + +### Boundary: What the lakebase plugin owns + +Lakebase owns ALL structured data. 
Every table's schema is derived from a proto +message in `database.proto`. No ad-hoc `CREATE TABLE` statements. + +``` +┌─────────────┐ RunRecord ┌──────────────┐ +│ compute mod │ ──────────────────→ │ lakebase │ +│ │ │ plugin │ +│ │ MetricRecord │ │ +│ │ ──────────────────→ │ Postgres │ +└─────────────┘ └──────┬───────┘ + │ +┌─────────────┐ SQL query │ +│ analytics │ ←──────────────────────────┘ +│ module │ RunRecord[] +└─────────────┘ +``` + +### Proto → Table Mapping + +| Proto Message | Table Name | Primary Key | Notes | +|---------------|-----------|-------------|-------| +| `RunRecord` | `runs` | `(run_id, app_name)` | One row per run | +| `MetricRecord` | `metrics` | auto-increment | FK to runs.run_id | +| `ConfigRecord` | `configs` | `config_id` | Versioned configs | + +### Proto → DDL Type Mapping + +| Proto Type | SQL Type | Column Default | +|-----------|----------|----------------| +| `string` | `TEXT` | `''` | +| `bool` | `BOOLEAN` | `false` | +| `int32` | `INTEGER` | `0` | +| `int64` | `BIGINT` | `0` | +| `double` | `DOUBLE PRECISION` | `0.0` | +| `bytes` | `BYTEA` | `NULL` | +| `Timestamp` | `TIMESTAMPTZ` | `NOW()` | +| `repeated T` | `JSONB` | `'[]'::jsonb` | +| `map` | `JSONB` | `'{}'::jsonb` | +| nested message | `JSONB` | `NULL` | +| `enum` | `TEXT` | First value name | + +### Migration Convention + +``` +migrations/ +├── 001_create_runs.sql +├── 002_create_metrics.sql +├── 003_create_configs.sql +└── 004_add_metrics_index.sql +``` + +Each migration is idempotent (`CREATE TABLE IF NOT EXISTS`, `CREATE INDEX IF NOT EXISTS`). 
+ +### Config ↔ Proto Mapping + +| manifest.json field | Proto usage | Notes | +|---------------------|-------------|-------| +| `resources.branch` | Not in proto | Infrastructure config | +| `resources.database` | Not in proto | Infrastructure config | +| `resources.host` (`PGHOST`) | Connection string | Runtime injection | +| `resources.databaseName` (`PGDATABASE`) | Database selection | Runtime injection | + +--- + +## Jobs / Compute Contract + +**No plugin manifest** — Jobs are invoked via `@databricks/sdk-experimental` +**Resource**: Databricks Jobs API +**Auth**: Workspace token or OAuth + +### Boundary: What the jobs module owns + +The jobs module owns compute execution. It receives typed task inputs, runs them +on Databricks clusters, and produces typed task outputs. + +``` +┌─────────────┐ JobConfig ┌──────────────┐ +│ api module │ ──────────────────→ │ jobs module │ +│ │ │ │ +│ │ JobTaskInput │ Databricks │ +│ │ ──────────────────→ │ Jobs API │ +│ │ │ │ +│ │ JobTaskOutput │ Clusters │ +│ │ ←────────────────── │ │ +└─────────────┘ └──────────────┘ +``` + +### Proto → Jobs SDK Mapping + +| Proto Message | SDK Method | Direction | +|---------------|-----------|-----------| +| `JobConfig` | `jobs.create(config)` | IN — defines the job | +| `TaskConfig` | Task within a job | IN — defines task deps | +| `JobTaskInput` | Task params (base64 proto) | IN — task receives | +| `JobTaskOutput` | Task output (written to Volume) | OUT — task produces | + +### Task Parameter Convention + +Job tasks receive their typed input via: +1. **Small payloads (<256KB)**: Base64-encoded proto in task params +2. 
**Large payloads**: Proto binary written to UC Volume, path passed as param + +```typescript +// Producer (api module) +const input: JobTaskInput = { taskId, taskType, runId, inputPayload }; +const encoded = Buffer.from(JobTaskInput.encode(input).finish()).toString('base64'); +// Pass as notebook parameter: { "input": encoded } + +// Consumer (job task code) +const decoded = JobTaskInput.decode(Buffer.from(params.input, 'base64')); +``` + +### Task Output Convention + +Job tasks write their typed output to: +``` +/Volumes/{catalog}/{schema}/{volume}/results/{run_id}/{task_id}.output.bin +``` + +The output is a serialized `JobTaskOutput` proto. The orchestrator reads it +back with the generated decoder. + +### Jobs API Patterns + +```typescript +// Create a multi-task job from JobConfig proto +const jobConfig: JobConfig = { + jobName: `${appName}-${runId}`, + clusterSpec: '{"num_workers": 1}', + maxRetries: 2, + timeoutSeconds: 3600, + tasks: [ + { taskKey: 'generate', taskType: 'generate', dependsOn: [] }, + { taskKey: 'evaluate', taskType: 'evaluate', dependsOn: ['generate'] }, + { taskKey: 'aggregate', taskType: 'aggregate', dependsOn: ['evaluate'] }, + ], +}; +``` diff --git a/skills/databricks-apps/references/appkit/proto-first.md b/skills/databricks-apps/references/appkit/proto-first.md new file mode 100644 index 0000000..3f8e9d8 --- /dev/null +++ b/skills/databricks-apps/references/appkit/proto-first.md @@ -0,0 +1,306 @@ +# Proto-First App Design + +Schema-first approach for AppKit apps using protobuf data contracts. Define contracts BEFORE implementation — derive TypeScript types, Lakebase DDL, and Volume paths from `.proto` files. + +**When to use:** New apps with multiple plugins (files + lakebase + jobs), or adding typed boundaries to existing apps. Skip for quick prototypes. + +**Requires:** `buf` CLI for proto linting and code generation. + +**Rule: No implementation before contracts. 
No contracts without consumers.** + +Define protobuf data contracts FIRST, then derive everything else (TypeScript types, Lakebase DDL, Volume paths, API shapes) from those contracts. + +## When to Use + +| Scenario | Use this skill | +|----------|---------------| +| Creating a new Databricks app | YES — define contracts before `databricks apps init` | +| Adding a new data boundary to an existing app | YES — add proto before implementation | +| Quick prototype / hackathon | NO — skip contracts, move fast | +| Modifying existing typed code | NO — contracts already exist | + +## Core Principle + +``` +User intent → Module map → Proto contracts → Generated types → Implementation + ↓ ↓ + Lakebase DDL TypeScript interfaces + ↓ ↓ + Migrations Plugin code +``` + +The `.proto` file is the single source of truth. If it's not in a proto, it doesn't cross a module boundary. + +## Phase 1: Decompose into Modules + +Every Databricks app decomposes into a combination of these plugin modules: + +| Module | Plugin | Data Boundary | Owns | +|--------|--------|---------------|------| +| **Storage** | files | UC Volumes | Blobs, uploads, artifacts, archives | +| **Database** | lakebase | Postgres tables | Structured records, queries, migrations | +| **Compute** | jobs | Databricks Jobs API | Job runs, task results, cluster configs | +| **Analytics** | analytics | SQL Warehouse | Read-only queries, dashboards | +| **Serving** | server | HTTP/tRPC routes | API endpoints, SSE streams | + +### Decomposition Rules + +1. **Each module owns its data** — files plugin never writes to lakebase, lakebase never writes to volumes. +2. **Cross-module communication is typed** — a proto message, never a raw JSON blob. +3. **Every proto message has exactly one producer module.** +4. **Multiple modules can consume** — but the producer defines the schema. +5. **No god messages** — if a message has >12 fields, split it. 
+ +### Output: Module Map + +Before proceeding, produce a module map for the user to confirm: + +``` +App: +Modules: + storage: files plugin → uploads/, results/, artifacts/ + db: lakebase plugin → runs, metrics, configs tables + compute: jobs → generation tasks, eval tasks + api: server plugin → POST /run, GET /status, SSE /stream +``` + +## Phase 2: Define Proto Contracts + +### Directory Structure + +``` +proto/ +├── buf.yaml +├── buf.gen.yaml +└── / + └── v1/ + ├── common.proto # Shared enums, IDs + ├── storage.proto # Files plugin boundary + ├── database.proto # Lakebase plugin boundary + ├── compute.proto # Jobs boundary + └── api.proto # Server/API boundary +``` + +### Proto Style Rules + +- **Package**: `.v1` (versioned from day one) +- **One file per module boundary**, not per message +- **Every field has a consumer** — if no code reads it, delete it +- **snake_case** for all field names +- **proto3** syntax only + +### Files Plugin Boundary (`storage.proto`) + +The files plugin operates on UC Volumes. Type every file path and payload: + +```protobuf +syntax = "proto3"; +package .v1; + +import "google/protobuf/timestamp.proto"; + +// StoredArtifact — produced by files plugin after upload. +message StoredArtifact { + string volume_path = 1; + string content_type = 2; + int64 size_bytes = 3; + google.protobuf.Timestamp created_at = 4; + string checksum_sha256 = 5; +} + +// UploadRequest — sent to files plugin by api module. +message UploadRequest { + string destination_path = 1; + string content_type = 2; + bytes content = 3; + map metadata = 4; +} + +// VolumeLayout — design-time contract for volume directory structure. +message VolumeLayout { + string root = 1; // /Volumes/catalog/schema/app_name + string uploads_dir = 2; // uploads/ + string results_dir = 3; // results/ + string artifacts_dir = 4; // artifacts/ +} +``` + +### Lakebase Plugin Boundary (`database.proto`) + +Every Lakebase table has a corresponding proto message. 
The message IS the schema: + +```protobuf +syntax = "proto3"; +package .v1; + +import "google/protobuf/timestamp.proto"; + +// RunRecord — one row in the `runs` table. +// Producer: compute module. Consumers: api, analytics. +message RunRecord { + string run_id = 1; + string app_name = 2; + RunStatus status = 3; + google.protobuf.Timestamp started_at = 4; + google.protobuf.Timestamp completed_at = 5; + string error_message = 6; + string config_json = 7; +} + +// MetricRecord — one row in the `metrics` table. +// Producer: compute module. Consumers: analytics, api. +message MetricRecord { + string run_id = 1; + string metric_name = 2; + double value = 3; + google.protobuf.Timestamp recorded_at = 4; + map dimensions = 5; +} +``` + +### Jobs Boundary (`compute.proto`) + +Type job task inputs and outputs: + +```protobuf +syntax = "proto3"; +package .v1; + +// JobTaskInput — typed payload sent to a Databricks job task. +// Producer: api module. Consumer: job task code. +message JobTaskInput { + string task_id = 1; + string task_type = 2; + string run_id = 3; + bytes input_payload = 4; + map env = 5; +} + +// JobTaskOutput — typed result from a completed job task. +// Producer: job task code. Consumer: api module. +message JobTaskOutput { + string task_id = 1; + string run_id = 2; + bool success = 3; + string error = 4; + bytes output_payload = 5; + int64 duration_ms = 6; + map metrics = 7; +} +``` + +## Phase 3: Generate Types and DDL + +### 3a. Buf configuration + +```yaml +# buf.yaml +version: v2 +lint: + use: + - STANDARD +breaking: + use: + - FILE +``` + +```yaml +# buf.gen.yaml +version: v2 +plugins: + - remote: buf.build/connectrpc/es + out: proto/gen + opt: target=ts +``` + +### 3b. Generate TypeScript types + +```bash +buf lint proto/ +buf generate proto/ +``` + +### 3c. Generate Lakebase DDL + +For each message in `database.proto`, generate a numbered migration file. 
+ +**Proto→SQL type mapping:** + +| Proto Type | SQL Type | Default | +|-----------|----------|---------| +| `string` | `TEXT` | `''` | +| `bool` | `BOOLEAN` | `false` | +| `int32` | `INTEGER` | `0` | +| `int64` | `BIGINT` | `0` | +| `double` | `DOUBLE PRECISION` | `0.0` | +| `bytes` | `BYTEA` | `NULL` | +| `Timestamp` | `TIMESTAMPTZ` | `NOW()` | +| `repeated T` | `JSONB` | `'[]'::jsonb` | +| `map` | `JSONB` | `'{}'::jsonb` | +| nested message | `JSONB` | `NULL` | +| `enum` | `TEXT` | first value name | + +Example migration: + +```sql +-- migrations/001_create_runs.sql +CREATE TABLE IF NOT EXISTS runs ( + run_id TEXT NOT NULL, + app_name TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'RUN_STATUS_PENDING', + started_at TIMESTAMPTZ, + completed_at TIMESTAMPTZ, + error_message TEXT, + config_json JSONB, + PRIMARY KEY (run_id, app_name) +); +``` + +### 3d. Validate + +```bash +npx tsc --noEmit # all generated types compile +buf lint proto/ # proto style checks +``` + +## Phase 4: Implement Against Contracts + +NOW implementation begins. Each module uses ONLY its generated types: + +```typescript +import type { StoredArtifact, UploadRequest } from '../proto/gen//v1/storage'; +import type { RunRecord, MetricRecord } from '../proto/gen//v1/database'; +import type { JobTaskInput, JobTaskOutput } from '../proto/gen//v1/compute'; +``` + +No `any`, no `unknown`, no `JSON.parse()` at module boundaries. 
+ +## Validation Checklist + +Before writing implementation code: + +- [ ] Module map exists with clear data boundaries +- [ ] Proto files exist for every cross-boundary data structure +- [ ] `buf lint proto/` passes +- [ ] `buf generate proto/` produces TypeScript types +- [ ] Lakebase DDL derived from `database.proto` messages +- [ ] No proto message exceeds 12 fields +- [ ] Every field has at least one identified consumer +- [ ] Every message has exactly one producer module +- [ ] Volume layout documented (not freeform paths) +- [ ] Job inputs/outputs typed (no raw JSON params) + +## Common Traps + +| Trap | Why it fails | Fix | +|------|-------------|-----| +| "I'll add the proto later" | Boundaries calcify around untyped shapes | Proto first or not at all | +| `any` at a module boundary | Type errors surface at runtime, not compile time | Use generated types | +| `JSON.parse()` crossing a boundary | No schema validation | Deserialize with proto decoder | +| Giant 30-field message | Impossible to review, version, or extend | Split by concern, max 12 fields | +| Storing raw JSON in Lakebase | Loses queryability and type safety | Map to `repeated`, `map`, or nested message fields | +| Shared mutable state between modules | Race conditions, unclear ownership | Communicate through typed messages | + +## References + +- [Plugin Contract Details](proto-contracts.md) — proto↔plugin type mappings for files, lakebase, jobs