From 8f9a63aca047d096493215b9d67c7469cadd1441 Mon Sep 17 00:00:00 2001
From: sonia
Date: Sun, 15 Mar 2026 15:10:52 -0400
Subject: [PATCH] Add bootstrap schema and setup script for single-command onboarding

Consolidate 12 incremental SQL migrations into supabase/schema.sql so
new deployments can bootstrap with a single paste into the Supabase SQL
Editor. Add scripts/setup.sh to automate dependency install, .env
creation, and prerequisite validation. Update README and ARCHITECTURE.md
to reflect the streamlined setup flow.

Co-Authored-By: Claude Opus 4.6
---
 ARCHITECTURE.md     |   4 +-
 README.md           |  53 +++----
 scripts/setup.sh    | 128 ++++++++++++++++
 supabase/schema.sql | 353 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 504 insertions(+), 34 deletions(-)
 create mode 100755 scripts/setup.sh
 create mode 100644 supabase/schema.sql

diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
index fac83b1..caae5fd 100644
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -270,6 +270,8 @@ The 12 migrations tell the story of iterative feature development, not a pre-pla
 
 **What this progression demonstrates:** The schema evolved with the product. Search started as text-only (002), added semantic vectors (003), then face vectors were accelerated with HNSW (006). Data integrity was tightened retroactively (005, 008) as pipeline re-runs exposed edge cases. The sentinel pattern (009, 012) was iterated and refined during testing. Each migration solves a specific problem encountered during development — not a theoretical schema design exercise.
 
+**Developer experience:** For new deployments, `supabase/schema.sql` consolidates all 12 migrations into a single file — one paste into the Supabase SQL Editor creates the full schema. The individual migrations remain for incremental updates on existing databases. A setup script (`scripts/setup.sh`) automates dependency installation, `.env` creation, and prerequisite validation.
+
 ## 5. Pipeline Design
 
 ### 5.1 Origin: Python to TypeScript
@@ -506,7 +508,7 @@ The goal is not multi-tenant SaaS but a **portable, customizable tool** that any
 
 ### 9.2 Minimal Changes for Portability
 
-1. **Setup wizard** — a first-run experience that collects Drive folder ID, API keys, and event branding. Currently these are environment variables; a wizard would write them to Vercel env vars or a config file.
+1. **Setup wizard** — a first-run experience that collects Drive folder ID, API keys, and event branding. The current `scripts/setup.sh` handles local setup; the next step is a web-based wizard that writes directly to Vercel env vars or a config file.
 2. **Auth improvement** — replace the prototype cookie with Supabase Auth or a simple hashed-password check. The organizer sets a password during setup.
 3. **Dynamic theming** — extract the current hardcoded color scheme (matrix green/magenta) into CSS custom properties or a theme config. The organizer picks colors during setup.
 4. **Face matching as optional** — already partially implemented (face-embed phase skips if `FACE_API_URL` is unset). The UI should gracefully hide face matching features when the service isn't configured.
diff --git a/README.md b/README.md
index 9e2a42d..404946b 100644
--- a/README.md
+++ b/README.md
@@ -97,47 +97,30 @@ Next.js 15 App (Vercel)
 
 ## Setup
 
-### Prerequisites
-
-- Node.js 18+
-- Google Cloud project with **Drive API** enabled + API key
-- Google Drive folder with event photos (shared as "Anyone with the link can view")
-- Supabase project with pgvector extension enabled
-- Gemini API key (from [aistudio.google.com](https://aistudio.google.com))
-- (Optional) InsightFace microservice deployed for face matching
-
-### 1. Clone and install
+### Quick Start
 
 ```bash
 git clone
 cd eventlens
-npm install
+./scripts/setup.sh
 ```
 
-### 2. Set up Supabase
+The setup script checks prerequisites, installs dependencies, creates your `.env` file, validates required keys, and prints next steps.
 
-Create a Supabase project, then run the 12 migrations in order via the SQL Editor:
+### Database
 
-```
-supabase/migrations/001_match_faces.sql
-supabase/migrations/002_search_photos.sql
-supabase/migrations/003_description_embeddings.sql
-supabase/migrations/004_match_sessions.sql
-supabase/migrations/005_add_face_embedding_unique_constraint.sql
-supabase/migrations/006_add_face_embedding_hnsw_index.sql
-supabase/migrations/007_match_session_analytics.sql
-supabase/migrations/008_drive_file_id_canonical.sql
-supabase/migrations/009_sentinel_face_embedding_guard.sql
-supabase/migrations/010_phash_dedup.sql
-supabase/migrations/011_auto_tags.sql
-supabase/migrations/012_allow_null_face_embedding.sql
+Create a [Supabase](https://supabase.com) project with the pgvector extension enabled, then run the bootstrap schema:
+
+```sql
+-- Paste into Supabase SQL Editor → Run
+-- File: supabase/schema.sql
 ```
 
-These create the `photos` and `face_embeddings` tables, pgvector indexes (HNSW for face embeddings), full-text search with tsvector, RPC functions for similarity search, and match session analytics.
+This single file creates all tables, indexes, and RPC functions. It's the consolidated equivalent of the 12 incremental migrations in `supabase/migrations/` — use those if you need to apply changes to an existing deployment.
 
-### 3. Deploy the face-api service (optional)
+### Face Matching (Optional)
 
-The InsightFace microservice lives in `services/face-api/`. It's a Flask app that accepts base64 images and returns face embeddings + bounding boxes. Deploy to Railway, Render, or Fly.io:
+The InsightFace microservice lives in `services/face-api/`. Deploy to Railway, Render, or Fly.io:
 
 ```bash
 cd services/face-api
@@ -145,12 +128,12 @@ docker build -t face-api .
 # Deploy to your platform of choice
 ```
 
-The service uses InsightFace's `buffalo_l` model (512-dim embeddings). It needs ~2GB RAM and takes 30-60s to cold start while downloading the model.
+Uses InsightFace's `buffalo_l` model (512-dim embeddings). Needs ~2GB RAM, 30-60s cold start for model download.
 
-### 4. Configure environment
+### Environment
 
 ```bash
-cp .env.example .env.local
+cp .env.example .env
 ```
 
 Fill in the values:
@@ -317,7 +300,11 @@ services/face-api/ # InsightFace microservice (Flask + Doc
 │   ├── Dockerfile
 │   └── requirements.txt
 │
-supabase/migrations/           # 12 SQL migrations
+scripts/setup.sh               # Guided local setup
+│
+supabase/
+│   ├── schema.sql             # Bootstrap: full schema in one file
+│   └── migrations/            # 12 incremental migrations
 ```
 
 ---
diff --git a/scripts/setup.sh b/scripts/setup.sh
new file mode 100755
index 0000000..88ac31b
--- /dev/null
+++ b/scripts/setup.sh
@@ -0,0 +1,128 @@
+#!/usr/bin/env bash
+# EventLens — Local Development Setup
+# Usage: ./scripts/setup.sh
+#
+# This script:
+#   1. Checks prerequisites (Node.js, npm)
+#   2. Installs dependencies
+#   3. Creates .env from .env.example (if needed)
+#   4. Validates required environment variables
+#   5. Prints next steps (Supabase schema, optional face-api)
+#
+# @TheTechMargin 2026
+
+set -euo pipefail
+
+# ── Colors ──────────────────────────────────────────────────────
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+CYAN='\033[0;36m'
+NC='\033[0m' # No Color
+
+info()    { echo -e "${CYAN}ℹ${NC} $1"; }
+success() { echo -e "${GREEN}✓${NC} $1"; }
+warn()    { echo -e "${YELLOW}⚠${NC} $1"; }
+fail()    { echo -e "${RED}✗${NC} $1"; }
+
+echo ""
+echo -e "${CYAN}╔══════════════════════════════════════╗${NC}"
+echo -e "${CYAN}║       EventLens — Setup Script       ║${NC}"
+echo -e "${CYAN}╚══════════════════════════════════════╝${NC}"
+echo ""
+
+# ── Step 1: Prerequisites ──────────────────────────────────────
+info "Checking prerequisites..."
+
+if ! command -v node &> /dev/null; then
+  fail "Node.js is not installed. Install it from https://nodejs.org (v18+)"
+  exit 1
+fi
+
+NODE_VERSION=$(node -v | sed 's/v//' | cut -d. -f1)
+if [ "$NODE_VERSION" -lt 18 ]; then
+  fail "Node.js v18+ required (found $(node -v))"
+  exit 1
+fi
+success "Node.js $(node -v)"
+
+if ! command -v npm &> /dev/null; then
+  fail "npm is not installed"
+  exit 1
+fi
+success "npm $(npm -v)"
+
+# ── Step 2: Install dependencies ───────────────────────────────
+info "Installing dependencies..."
+npm install --silent
+success "Dependencies installed"
+
+# ── Step 3: Environment file ───────────────────────────────────
+if [ ! -f .env ]; then
+  if [ -f .env.example ]; then
+    cp .env.example .env
+    warn "Created .env from .env.example — fill in your keys before running"
+  else
+    fail ".env.example not found"
+    exit 1
+  fi
+else
+  success ".env already exists"
+fi
+
+# ── Step 4: Validate required env vars ─────────────────────────
+info "Checking environment variables..."
+
+MISSING=()
+
+check_var() {  # appends $1 to MISSING when .env has no non-empty "$1=value" line
+  local val    # '|| true' below: a grep miss must not abort the script under set -e + pipefail
+  val=$(grep "^$1=" .env 2>/dev/null | cut -d= -f2- || true)
+  if [ -z "$val" ]; then
+    MISSING+=("$1")
+  fi
+}
+
+check_var "GOOGLE_API_KEY"
+check_var "GOOGLE_DRIVE_FOLDER_ID"
+check_var "GEMINI_API_KEY"
+check_var "NEXT_PUBLIC_SUPABASE_URL"
+check_var "SUPABASE_SERVICE_ROLE_KEY"
+check_var "APP_PASSWORD"
+check_var "ADMIN_API_SECRET"
+
+if [ ${#MISSING[@]} -gt 0 ]; then
+  warn "Missing environment variables in .env:"
+  for var in "${MISSING[@]}"; do
+    echo -e "  ${YELLOW}→${NC} $var"
+  done
+  echo ""
+else
+  success "All required environment variables set"
+fi
+
+# ── Step 5: Next steps ─────────────────────────────────────────
+echo ""
+echo -e "${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
+echo -e "${CYAN}  Next Steps${NC}"
+echo -e "${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
+echo ""
+echo "  1. Fill in any missing .env values"
+echo ""
+echo "  2. Set up your Supabase database:"
+echo "     → Go to your Supabase project SQL Editor"
+echo "     → Paste and run: supabase/schema.sql"
+echo "     (This creates all tables, indexes, and functions)"
+echo ""
+echo "  3. Start the dev server:"
+echo "     npm run dev"
+echo ""
+echo "  4. Open the admin panel to run the pipeline:"
+echo "     http://localhost:3000/admin"
+echo ""
+echo -e "  ${YELLOW}Optional:${NC} Face matching"
+echo "     → Deploy services/face-api/ to Railway/Render"
+echo "     → Set FACE_API_URL and FACE_API_SECRET in .env"
+echo ""
+echo -e "${GREEN}Setup complete!${NC}"
+echo ""
diff --git a/supabase/schema.sql b/supabase/schema.sql
new file mode 100644
index 0000000..b383ee3
--- /dev/null
+++ b/supabase/schema.sql
@@ -0,0 +1,353 @@
+-- EventLens: Complete Database Schema
+-- Run this file once to bootstrap a fresh Supabase project.
+-- For incremental updates, use the numbered migrations in ./migrations/
+--
+-- Prerequisites:
+--   1. Create a Supabase project at https://supabase.com
+--   2. Enable the pgvector extension (Database → Extensions → vector)
+--   3. Run this file in the SQL Editor
+--
+-- @TheTechMargin 2026
+
+-- ============================================================
+-- Extensions
+-- ============================================================
+create extension if not exists vector;   -- pgvector for embeddings
+create extension if not exists pg_trgm;  -- trigram fuzzy text matching
+
+-- ============================================================
+-- Core Tables
+-- ============================================================
+
+-- Photos: one row per Google Drive image
+create table if not exists photos (
+  id                  uuid default gen_random_uuid() primary key,
+  drive_file_id       text not null unique,      -- stable Google Drive ID
+  filename            text,
+  drive_url           text,
+  folder              text,
+  visible_text        text,                      -- OCR / text in image
+  people_descriptions text,                      -- Gemini-generated people descriptions
+  scene_description   text,                      -- Gemini-generated scene description
+  face_count          int,
+  status              text default 'pending',    -- pending | completed | error
+  error_message       text,
+  processed_at        timestamptz,
+  created_at          timestamptz default now(),
+
+  -- Semantic search: 768-dim Gemini text-embedding-004
+  description_embedding vector(768),
+
+  -- Perceptual hash for near-duplicate detection
+  phash bigint,
+
+  -- Soft-delete for duplicate hiding
+  hidden boolean default false,
+
+  -- Auto-generated thematic tag
+  auto_tag text,
+
+  -- Full-text search (generated column)
+  search_vector tsvector generated always as (
+    to_tsvector('english',
+      coalesce(visible_text, '') || ' ' ||
+      coalesce(people_descriptions, '') || ' ' ||
+      coalesce(scene_description, '') || ' ' ||
+      coalesce(filename, '') || ' ' ||
+      coalesce(folder, '')
+    )
+  ) stored
+);
+
+-- Face embeddings: one row per detected face (or sentinel for faceless photos)
+create table if not exists face_embeddings (
+  id            uuid default gen_random_uuid() primary key,
+  drive_file_id text not null references photos(drive_file_id) on delete cascade,
+  filename      text,
+  folder        text,
+  face_index    int,              -- -1 = sentinel (no faces detected)
+  embedding     vector(512),      -- NULL for sentinel rows
+  bbox_x1       float8,
+  bbox_y1       float8,
+  bbox_x2       float8,
+  bbox_y2       float8,
+  created_at    timestamptz default now(),
+
+  constraint face_embeddings_drive_file_id_face_index_key
+    unique (drive_file_id, face_index)
+);
+
+-- Match sessions: analytics for every face-search query (no PII stored)
+create table if not exists match_sessions (
+  id                uuid default gen_random_uuid() primary key,
+  created_at        timestamptz default now() not null,
+  tier              text not null,             -- vector | text | visual | both
+  match_count       int not null default 0,
+  top_confidence    int,
+  query_embedding   vector(512),               -- face embedding (not the selfie)
+  matched_photo_ids text[] default '{}'
+);
+
+-- ============================================================
+-- Indexes
+-- ============================================================
+
+-- Photos
+create index if not exists idx_photos_search_vector
+  on photos using gin(search_vector);
+create index if not exists idx_photos_visible_text_trgm
+  on photos using gin(visible_text gin_trgm_ops);
+create index if not exists idx_photos_people_desc_trgm
+  on photos using gin(people_descriptions gin_trgm_ops);
+create index if not exists idx_photos_scene_desc_trgm
+  on photos using gin(scene_description gin_trgm_ops);
+create index if not exists idx_photos_description_embedding
+  on photos using hnsw (description_embedding vector_cosine_ops)
+  with (m = 16, ef_construction = 64);
+create index if not exists idx_photos_phash
+  on photos (phash) where phash is not null;
+create index if not exists idx_photos_hidden
+  on photos (hidden) where hidden = true;
+create index if not exists idx_photos_auto_tag
+  on photos (auto_tag) where auto_tag is not null;
+
+-- Face embeddings (partial index — excludes NULL sentinel embeddings)
+create index if not exists idx_face_embeddings_hnsw
+  on face_embeddings using hnsw (embedding vector_cosine_ops)
+  with (m = 16, ef_construction = 64)
+  where embedding is not null;
+
+-- Match sessions
+create index if not exists idx_match_sessions_created
+  on match_sessions (created_at desc);
+create index if not exists idx_match_sessions_photo_ids
+  on match_sessions using gin (matched_photo_ids);
+
+-- ============================================================
+-- RPC Functions
+-- ============================================================
+
+-- Face matching: cosine similarity over 512-dim InsightFace embeddings
+create or replace function match_faces(
+  query_embedding vector(512),
+  match_threshold float default 0.6,
+  match_count int default 20
+)
+returns table (
+  drive_file_id text,
+  filename text,
+  folder text,
+  face_index int,
+  similarity float,
+  bbox_x1 float8,
+  bbox_y1 float8,
+  bbox_x2 float8,
+  bbox_y2 float8
+)
+language plpgsql as $$
+begin
+  return query
+  select
+    fe.drive_file_id, fe.filename, fe.folder, fe.face_index,
+    1 - (fe.embedding <=> query_embedding) as similarity,
+    fe.bbox_x1, fe.bbox_y1, fe.bbox_x2, fe.bbox_y2
+  from face_embeddings fe
+  where fe.embedding is not null
+    and 1 - (fe.embedding <=> query_embedding) > match_threshold
+  order by fe.embedding <=> query_embedding
+  limit match_count;
+end;
+$$;
+
+-- Hybrid text search: full-text (ts_rank) + trigram fuzzy matching
+create or replace function search_photos(
+  query_text text,
+  result_limit int default 50
+)
+returns table (
+  id uuid, drive_file_id text, filename text, drive_url text, folder text,
+  visible_text text, people_descriptions text, scene_description text,
+  face_count int, processed_at timestamptz, created_at timestamptz, rank float
+)
+language plpgsql as $$
+declare
+  tsquery_val tsquery;
+begin
+  tsquery_val := plainto_tsquery('english', query_text);
+  return query
+  select
+    p.id, p.drive_file_id, p.filename, p.drive_url, p.folder,
+    p.visible_text, p.people_descriptions, p.scene_description,
+    p.face_count, p.processed_at, p.created_at,
+    (
+      coalesce(ts_rank_cd(p.search_vector, tsquery_val, 32), 0) * 10 +
+      coalesce(greatest(
+        similarity(p.visible_text, query_text),
+        similarity(p.people_descriptions, query_text),
+        similarity(p.scene_description, query_text),
+        similarity(p.filename, query_text)
+      ), 0) * 5  -- greatest() is NULL when all four fields are NULL; a NULL rank would sort first under DESC
+    )::float as rank
+  from photos p
+  where p.status = 'completed'
+    and (p.hidden is not true)
+    and (
+      p.search_vector @@ tsquery_val
+      or similarity(p.visible_text, query_text) > 0.1
+      or similarity(p.people_descriptions, query_text) > 0.1
+      or similarity(p.scene_description, query_text) > 0.1
+      or similarity(p.filename, query_text) > 0.15
+      or p.visible_text ilike '%' || query_text || '%'
+      or p.people_descriptions ilike '%' || query_text || '%'
+      or p.scene_description ilike '%' || query_text || '%'
+    )
+  order by rank desc
+  limit result_limit;
+end;
+$$;
+
+-- Semantic search: cosine similarity over 768-dim Gemini embeddings
+create or replace function search_photos_semantic(
+  query_embedding vector(768),
+  match_threshold float default 0.5,
+  match_count int default 30
+)
+returns table (
+  id uuid, drive_file_id text, filename text, drive_url text, folder text,
+  visible_text text, people_descriptions text, scene_description text,
+  face_count int, similarity float
+)
+language plpgsql as $$
+begin
+  return query
+  select
+    p.id, p.drive_file_id, p.filename, p.drive_url, p.folder,
+    p.visible_text, p.people_descriptions, p.scene_description,
+    p.face_count,
+    1 - (p.description_embedding <=> query_embedding) as similarity
+  from photos p
+  where p.description_embedding is not null
+    and p.status = 'completed'
+    and (p.hidden is not true)
+    and 1 - (p.description_embedding <=> query_embedding) > match_threshold
+  order by p.description_embedding <=> query_embedding
+  limit match_count;
+end;
+$$;
+
+-- Analytics: recent match activity
+create or replace function get_recent_match_activity(
+  hours_back int default 24,
+  max_results int default 20
+)
+returns table (
+  id uuid, created_at timestamptz, tier text, match_count int, top_confidence int
+)
+language sql stable as $$
+  select ms.id, ms.created_at, ms.tier, ms.match_count, ms.top_confidence
+  from match_sessions ms
+  where ms.created_at > now() - make_interval(hours => hours_back)
+  order by ms.created_at desc
+  limit max_results;
+$$;
+
+-- Analytics: most-matched photos
+create or replace function get_hot_photo_ids(
+  top_n int default 10,
+  hours_back int default 168
+)
+returns table (photo_id text, match_hit_count bigint)
+language sql stable as $$
+  select pid as photo_id, count(*) as match_hit_count
+  from match_sessions ms, unnest(ms.matched_photo_ids) as pid
+  where ms.created_at > now() - make_interval(hours => hours_back)
+    and ms.match_count > 0
+  group by pid
+  order by match_hit_count desc
+  limit top_n;
+$$;
+
+-- Analytics: number of face-search sessions with a stored query embedding (sessions are not de-duplicated per person)
+create or replace function get_unique_operatives_count()
+returns bigint
+language sql stable as $$
+  select count(*) from match_sessions where query_embedding is not null;
+$$;
+
+-- Analytics: find similar past sessions by face embedding
+create or replace function find_similar_sessions(
+  probe_embedding vector(512),
+  threshold float default 0.7,
+  max_results int default 5
+)
+returns table (
+  id uuid, created_at timestamptz, tier text, match_count int,
+  top_confidence int, matched_photo_ids text[], similarity float
+)
+language sql stable as $$
+  select
+    ms.id, ms.created_at, ms.tier, ms.match_count, ms.top_confidence,
+    ms.matched_photo_ids,
+    1 - (ms.query_embedding <=> probe_embedding) as similarity
+  from match_sessions ms
+  where ms.query_embedding is not null
+    and 1 - (ms.query_embedding <=> probe_embedding) > threshold
+    and ms.match_count > 0
+  order by ms.query_embedding <=> probe_embedding
+  limit max_results;
+$$;
+
+-- Analytics: co-occurrence recommendations
+create or replace function get_cooccurrence_recommendations(
+  user_photo_ids text[],
+  exclude_photo_ids text[] default '{}',
+  max_results int default 8
+)
+returns table (photo_id text, cooccurrence_count bigint)
+language sql stable as $$
+  select pid as photo_id, count(*) as cooccurrence_count
+  from match_sessions ms, unnest(ms.matched_photo_ids) as pid
+  where ms.matched_photo_ids && user_photo_ids
+    and pid != all(user_photo_ids)
+    and pid != all(exclude_photo_ids)
+    and ms.match_count > 0
+  group by pid
+  order by cooccurrence_count desc
+  limit max_results;
+$$;
+
+-- Dedup: find near-duplicate clusters by perceptual hash Hamming distance
+create or replace function find_duplicate_clusters(
+  hamming_threshold int default 10
+)
+returns table (
+  group_id bigint, photo_id uuid, drive_file_id text, filename text,
+  folder text, phash bigint, hamming_distance int, hidden boolean
+)
+language sql as $$
+  with pairs as (
+    select
+      a.id as id_a, b.id as id_b,
+      a.phash as phash_a, b.phash as phash_b,
+      a.drive_file_id as drive_file_id_a, a.filename as filename_a,
+      a.folder as folder_a, a.hidden as hidden_a,
+      b.drive_file_id as drive_file_id_b, b.filename as filename_b,
+      b.folder as folder_b, b.hidden as hidden_b,
+      bit_count((a.phash # b.phash)::bit(64))::int as dist
+    from photos a
+    join photos b on a.id < b.id
+    where a.phash is not null and b.phash is not null
+      and a.status = 'completed' and b.status = 'completed'
+      and bit_count((a.phash # b.phash)::bit(64))::int <= hamming_threshold
+  )
+  select dense_rank() over (order by least(id_a, id_b)) as group_id,
+    id_a as photo_id, drive_file_id_a as drive_file_id, filename_a as filename,
+    folder_a as folder, phash_a as phash, dist as hamming_distance, hidden_a as hidden
+  from pairs
+  union all
+  select dense_rank() over (order by least(id_a, id_b)) as group_id,
+    id_b as photo_id, drive_file_id_b as drive_file_id, filename_b as filename,
+    folder_b as folder, phash_b as phash, dist as hamming_distance, hidden_b as hidden
+  from pairs
+  order by group_id, hamming_distance;
+$$;