Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions paradedb/sample-movie-search/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Dependencies
node_modules/

# Build output
dist/
cdk.out/
*.js
*.d.ts

# Keep the Jest config tracked despite the *.js rule above
!jest.config.js

# IDE
.idea/
.vscode/
*.swp
*.swo

# OS
.DS_Store
Thumbs.db

# Logs
*.log
npm-debug.log*

# Local env
.env
.env.local

# Data
data/
107 changes: 107 additions & 0 deletions paradedb/sample-movie-search/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# Every target in this file is a command rather than a file to build, so all
# of them are declared phony.
.PHONY: install deploy init seed destroy clean help web-ui test-search get-api-url download-data

# DATASET_URL: archive fetched by `download-data`.
# DATA_DIR: local directory the archive is downloaded and unpacked into.
DATASET_URL := https://docs.aws.amazon.com/opensearch-service/latest/developerguide/samples/sample-movies.zip
DATA_DIR := data

# Print the available targets and the quick-start command sequence.
# This is the first rule in the file, so a bare `make` runs it.
# Every target declared in .PHONY is listed here, including the two helper
# targets (test-search, get-api-url).
help:
	@echo "ParadeDB Movie Search Sample App"
	@echo ""
	@echo "Usage:"
	@echo "  make install        - Install all dependencies"
	@echo "  make download-data  - Download AWS sample movies dataset"
	@echo "  make deploy         - Deploy CDK stack to LocalStack"
	@echo "  make init           - Initialize database schema and BM25 index"
	@echo "  make seed           - Load movie data from S3 into ParadeDB"
	@echo "  make web-ui         - Run the Web UI on localhost port 3000"
	@echo "  make test-search    - Run a sample query against the search endpoint"
	@echo "  make get-api-url    - Print the deployed API endpoint URL"
	@echo "  make destroy        - Tear down the stack"
	@echo "  make clean          - Remove build artifacts"
	@echo ""
	@echo "Quick start:"
	@echo "  make install && make download-data && make deploy && make init && make seed && make web-ui"

# Install npm dependencies twice: once in the project root (the CDK app) and
# once inside lambda/. The `cd` and the install share one recipe line because
# each recipe line runs in its own shell.
install:
@echo "Installing CDK dependencies..."
npm install
@echo "Installing Lambda dependencies..."
cd lambda && npm install
@echo "Done!"

# Download the sample movies archive into $(DATA_DIR), unpack it, and drop the
# lines that start with `{ "index"` so that movies.bulk keeps only the
# remaining lines of the bulk file. The intermediate zip, the original bulk
# file and the __MACOSX directory are removed afterwards, and the line count
# of movies.bulk is reported as the number of movies.
#
# curl flags: -f turns an HTTP error status into a non-zero exit so an error
# page is never saved as the zip; -S keeps the error message visible even
# though -s hides the progress meter; -L follows redirects.
download-data:
	@echo "Downloading AWS sample movies dataset..."
	@mkdir -p $(DATA_DIR)
	@curl -fsSL $(DATASET_URL) -o $(DATA_DIR)/sample-movies.zip
	@echo "Extracting dataset..."
	@unzip -o $(DATA_DIR)/sample-movies.zip -d $(DATA_DIR)/
	@echo "Pre-processing bulk file (removing index instructions)..."
	@grep -v '^{ "index"' $(DATA_DIR)/sample-movies.bulk > $(DATA_DIR)/movies.bulk
	@rm -rf $(DATA_DIR)/sample-movies.zip $(DATA_DIR)/sample-movies.bulk $(DATA_DIR)/__MACOSX
	@echo "Dataset ready: $(DATA_DIR)/movies.bulk"
	@wc -l $(DATA_DIR)/movies.bulk | awk '{print "Total movies: " $$1}'

# Bootstrap the CDK environment and deploy the stack through cdklocal.
# `--require-approval never` is passed so the deploy does not stop for an
# interactive approval prompt.
deploy:
@echo "Deploying MovieSearchStack to LocalStack..."
cdklocal bootstrap
cdklocal deploy --require-approval never
@echo ""
@echo "Deployment complete!"

# Read the ApiEndpoint output of MovieSearchStack and POST to its admin/init
# route, pretty-printing the JSON response with jq.
#
# The endpoint value is concatenated directly with "admin/init", so it is
# expected to end in a slash.
#
# The response is captured before it is handed to jq: in a plain
# `curl | jq` pipeline the recipe's status is jq's, so a failed request would
# still reach the success message. curl's -f makes an HTTP error status count
# as a failure, and -S keeps curl's error message visible despite -s.
init:
	@echo "Initializing database schema and BM25 index..."
	@API_URL=$$(awslocal cloudformation describe-stacks \
		--stack-name MovieSearchStack \
		--query 'Stacks[0].Outputs[?OutputKey==`ApiEndpoint`].OutputValue' \
		--output text 2>/dev/null); \
	if [ -z "$$API_URL" ]; then \
		echo "Error: Stack not deployed. Run 'make deploy' first."; \
		exit 1; \
	fi; \
	RESPONSE=$$(curl -fsS -X POST "$${API_URL}admin/init") || { \
		echo "Error: request to $${API_URL}admin/init failed."; \
		exit 1; \
	}; \
	printf '%s\n' "$$RESPONSE" | jq .
	@echo "Database initialized!"

# Read the ApiEndpoint output of MovieSearchStack and POST to its admin/seed
# route, pretty-printing the JSON response with jq.
#
# The endpoint value is concatenated directly with "admin/seed", so it is
# expected to end in a slash.
#
# The response is captured before it is handed to jq: in a plain
# `curl | jq` pipeline the recipe's status is jq's, so a failed request would
# still reach the success message. curl's -f makes an HTTP error status count
# as a failure, and -S keeps curl's error message visible despite -s.
seed:
	@echo "Seeding movie data from S3..."
	@API_URL=$$(awslocal cloudformation describe-stacks \
		--stack-name MovieSearchStack \
		--query 'Stacks[0].Outputs[?OutputKey==`ApiEndpoint`].OutputValue' \
		--output text 2>/dev/null); \
	if [ -z "$$API_URL" ]; then \
		echo "Error: Stack not deployed. Run 'make deploy' first."; \
		exit 1; \
	fi; \
	RESPONSE=$$(curl -fsS -X POST "$${API_URL}admin/seed") || { \
		echo "Error: request to $${API_URL}admin/seed failed."; \
		exit 1; \
	}; \
	printf '%s\n' "$$RESPONSE" | jq .
	@echo "Data seeded!"

# Smoke-test the search route: read the ApiEndpoint output of MovieSearchStack
# and GET search?q=redemption, pretty-printing the JSON response with jq.
#
# The response is captured before it is handed to jq so that a failed request
# (connection error, or an HTTP error status via curl's -f) makes the target
# fail instead of being hidden behind jq's exit status.
test-search:
	@echo "Testing search endpoint..."
	@API_URL=$$(awslocal cloudformation describe-stacks \
		--stack-name MovieSearchStack \
		--query 'Stacks[0].Outputs[?OutputKey==`ApiEndpoint`].OutputValue' \
		--output text 2>/dev/null); \
	if [ -z "$$API_URL" ]; then \
		echo "Error: Stack not deployed. Run 'make deploy' first."; \
		exit 1; \
	fi; \
	echo "Searching for 'redemption'..."; \
	RESPONSE=$$(curl -fsS "$${API_URL}search?q=redemption") || { \
		echo "Error: request to $${API_URL}search failed."; \
		exit 1; \
	}; \
	printf '%s\n' "$$RESPONSE" | jq .

# Tear down the deployed stack through cdklocal. `--force` is passed so the
# destroy runs without asking for confirmation.
destroy:
@echo "Destroying MovieSearchStack..."
cdklocal destroy --force
@echo "Stack destroyed!"

# Remove installed dependencies, CDK/build output, and the preprocessed
# dataset. The dataset path is derived from $(DATA_DIR) so it stays in sync
# with wherever `download-data` wrote movies.bulk.
clean:
	rm -rf node_modules lambda/node_modules cdk.out dist $(DATA_DIR)/movies.bulk
	@echo "Cleaned!"

# Print the ApiEndpoint output value of MovieSearchStack as plain text.
# Unlike the lookups in init/seed/test-search, stderr is not redirected here,
# so any error from awslocal is shown as-is.
get-api-url:
@awslocal cloudformation describe-stacks \
--stack-name MovieSearchStack \
--query 'Stacks[0].Outputs[?OutputKey==`ApiEndpoint`].OutputValue' \
--output text

# Serve the static UI in ./web on port 3000 using the `serve` npm package,
# installing it globally first when it is not already on PATH.
# `command -v` is the POSIX way to test for an executable; `which` is not
# guaranteed to exist on every system.
web-ui:
	@echo "Starting Movie Search Web UI..."
	@echo "API endpoint: http://movie-search-api.execute-api.localhost.localstack.cloud:4566/dev"
	@echo ""
	@command -v serve > /dev/null 2>&1 || (echo "Installing serve..." && npm i -g serve)
	serve -s ./web -l 3000
209 changes: 209 additions & 0 deletions paradedb/sample-movie-search/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
# ParadeDB Movie Search Sample App

A CDK application demonstrating ParadeDB's full-text search capabilities with LocalStack.

## Overview

This sample app deploys a serverless movie search application using:

- **AWS Lambda** - Handles search and data operations
- **Amazon API Gateway** - REST API endpoints
- **Amazon S3** - Stores movie dataset
- **ParadeDB** - Full-text search engine (runs as LocalStack extension)

### Dataset

Uses the official [AWS OpenSearch sample movies dataset](https://docs.aws.amazon.com/opensearch-service/latest/developerguide/samples/sample-movies.zip) containing **5,000 movies** with metadata including:

- Title, year, genres, rating
- Directors and actors
- Plot descriptions
- Movie poster images
- Runtime duration

### Features Demonstrated

| Feature | Description |
|---------|-------------|
| **BM25 Ranking** | Industry-standard relevance scoring |
| **Fuzzy Matching** | Handles typos (e.g., "Godfater" finds "Godfather") |
| **Highlighting** | Returns matched text with highlighted terms |
| **Movie Posters** | Rich UI with movie poster images |

### API Endpoints

| Method | Endpoint | Description |
|--------|----------|-------------|
| GET | `/search?q=<query>` | Search movies with BM25 ranking |
| GET | `/movies/{id}` | Get movie details by ID |
| POST | `/admin/init` | Initialize database schema |
| POST | `/admin/seed` | Load movie data from S3 |

## Prerequisites

- [LocalStack](https://localstack.cloud/) installed and running
- [Node.js](https://nodejs.org/) 18+ installed
- [AWS CDK Local](https://github.com/localstack/aws-cdk-local) (`npm install -g aws-cdk-local`)
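- [AWS CLI](https://aws.amazon.com/cli/) configured, plus the [`awslocal`](https://github.com/localstack/awscli-local) wrapper (the Makefile calls `awslocal` to look up the API endpoint)
- `curl`, `unzip`, and `jq` available on your `PATH` (used by the `make download-data`, `make init`, and `make seed` targets)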
- ParadeDB extension installed in LocalStack

## Setup

### 1. Start LocalStack with ParadeDB Extension

```bash
# Install the ParadeDB extension
localstack extensions install localstack-extension-paradedb

# Start LocalStack
localstack start
```

### 2. Install Dependencies and Download Dataset

```bash
cd paradedb/sample-movie-search
make install
make download-data
```

The `download-data` target downloads the AWS sample movies dataset (~5000 movies) and preprocesses it for ParadeDB ingestion.

### 3. Deploy the Stack

```bash
make deploy
```

Or manually:

```bash
cdklocal bootstrap
cdklocal deploy
```

After deployment, you'll see output similar to:

```
Outputs:
MovieSearchStack.ApiEndpoint = https://movie-search-api.execute-api.localhost.localstack.cloud:4566/dev/
MovieSearchStack.DataBucketName = movie-search-data
MovieSearchStack.InitEndpoint = https://movie-search-api.execute-api.localhost.localstack.cloud:4566/dev/admin/init
MovieSearchStack.MovieSearchApiEndpointB25066EC = https://movie-search-api.execute-api.localhost.localstack.cloud:4566/dev/
MovieSearchStack.MoviesEndpoint = https://movie-search-api.execute-api.localhost.localstack.cloud:4566/dev/movies/{id}
MovieSearchStack.SearchEndpoint = https://movie-search-api.execute-api.localhost.localstack.cloud:4566/dev/search
MovieSearchStack.SeedEndpoint = https://movie-search-api.execute-api.localhost.localstack.cloud:4566/dev/admin/seed
```

### 4. Initialize Database

Create the movies table and BM25 search index:

```bash
make init
```

### 5. Seed Data

Load movie data from S3 into ParadeDB:

```bash
make seed
```

## Usage

### Search Movies

```bash
# Basic search
curl "https://movie-search-api.execute-api.localhost.localstack.cloud:4566/dev/search?q=redemption"

# With pagination
curl "https://movie-search-api.execute-api.localhost.localstack.cloud:4566/dev/search?q=dark%20knight&limit=5&offset=0"

# Fuzzy search (handles typos)
curl "https://movie-search-api.execute-api.localhost.localstack.cloud:4566/dev/search?q=godfater"
```

### Get Movie Details

```bash
curl "https://movie-search-api.execute-api.localhost.localstack.cloud:4566/dev/movies/tt0111161"
```

### Example Response

```json
{
"success": true,
"data": {
"id": "tt0111161",
"title": "The Shawshank Redemption",
"year": 1994,
"genres": [
"Crime",
"Drama"
],
"rating": 9.3,
"directors": [
"Frank Darabont"
],
"actors": [
"Tim Robbins",
"Morgan Freeman",
"Bob Gunton"
],
"plot": "Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.",
"image_url": "https://m.media-amazon.com/images/M/MV5BODU4MjU4NjIwNl5BMl5BanBnXkFtZTgwMDU2MjEyMDE@._V1_SX400_.jpg",
"release_date": "1994-09-10T00:00:00.000Z",
"rank": 80,
"running_time_secs": 8520
}
}
```

## Web UI

A web UI with movie posters is included in the `web/` directory.

### Quick Start

```bash
make web-ui
```

This starts a local web server at http://localhost:3000. The UI automatically connects to the API Gateway at `http://movie-search-api.execute-api.localhost.localstack.cloud:4566/dev`.

<img width="2880" height="1402" alt="image" src="https://gist.github.com/user-attachments/assets/63986bfe-709b-4bde-bac8-4df2b15bd41a" />

## How It Works

1. **Dataset Preparation**: Download and preprocess the AWS OpenSearch sample movies dataset

2. **Deployment**: CDK creates Lambda functions, API Gateway, and S3 bucket with movie data (bulk format)

3. **Initialization**: The init Lambda creates the movies table and ParadeDB BM25 index:
```sql
CREATE INDEX movies_search_idx ON movies
USING bm25 (id, title, plot)
WITH (key_field = 'id');
```

4. **Data Loading**: The seed Lambda reads `movies.bulk` from S3 (newline-delimited JSON) and inserts 5000 movies into ParadeDB

5. **Search**: Queries use ParadeDB's BM25 search with fuzzy matching:
```sql
SELECT id, title, year, genres, rating, directors, actors, image_url, running_time_secs,
pdb.snippet(plot, start_tag => '<mark>', end_tag => '</mark>') as highlight,
pdb.score(id) as score
FROM movies
WHERE title ||| $1::pdb.fuzzy(1) OR plot ||| $1::pdb.fuzzy(1)
ORDER BY score DESC
```

## References

- [ParadeDB Documentation](https://docs.paradedb.com/)
- [LocalStack Extensions](https://docs.localstack.cloud/aws/tooling/extensions/)
- [AWS CDK Local](https://github.com/localstack/aws-cdk-local)
7 changes: 7 additions & 0 deletions paradedb/sample-movie-search/bin/app.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/usr/bin/env node
import "source-map-support/register";
import * as cdk from "aws-cdk-lib";
import { MovieSearchStack } from "../lib/movie-search-stack";

// CDK entry point: build the app and attach the movie search stack to it
// under a fixed construct id, passing an empty props object.
const STACK_ID = "MovieSearchStack";

const app = new cdk.App();
new MovieSearchStack(app, STACK_ID, {});
21 changes: 21 additions & 0 deletions paradedb/sample-movie-search/cdk.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"app": "npx ts-node --prefer-ts-exts bin/app.ts",
"watch": {
"include": ["**"],
"exclude": [
"README.md",
"cdk*.json",
"**/*.d.ts",
"**/*.js",
"tsconfig.json",
"package*.json",
"node_modules",
"lambda/node_modules"
]
},
"context": {
"@aws-cdk/aws-lambda:recognizeLayerVersion": true,
"@aws-cdk/core:checkSecretUsage": true,
"@aws-cdk/core:target-partitions": ["aws", "aws-cn"]
}
}
Loading