From 8f9a63aca047d096493215b9d67c7469cadd1441 Mon Sep 17 00:00:00 2001
From: sonia <sonia@thetechmargin.com>
Date: Sun, 15 Mar 2026 15:10:52 -0400
Subject: [PATCH] Add bootstrap schema and setup script for single-command
 onboarding

Consolidate 12 incremental SQL migrations into supabase/schema.sql
so new deployments can bootstrap with a single paste into the Supabase
SQL Editor. Add scripts/setup.sh to automate dependency install, .env
creation, and prerequisite validation. Update README and ARCHITECTURE.md
to reflect the streamlined setup flow.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 ARCHITECTURE.md     |   4 +-
 README.md           |  53 +++----
 scripts/setup.sh    | 128 ++++++++++++++++
 supabase/schema.sql | 353 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 504 insertions(+), 34 deletions(-)
 create mode 100755 scripts/setup.sh
 create mode 100644 supabase/schema.sql
diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
index fac83b1..caae5fd 100644
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -270,6 +270,8 @@ The 12 migrations tell the story of iterative feature development, not a pre-pla
 
 **What this progression demonstrates:** The schema evolved with the product. Search started as text-only (002), added semantic vectors (003), then face vectors were accelerated with HNSW (006). Data integrity was tightened retroactively (005, 008) as pipeline re-runs exposed edge cases. The sentinel pattern (009, 012) was iterated and refined during testing. Each migration solves a specific problem encountered during development — not a theoretical schema design exercise.
 
+**Developer experience:** For new deployments, `supabase/schema.sql` consolidates all 12 migrations into a single file — one paste into the Supabase SQL Editor creates the full schema. The individual migrations remain for incremental updates on existing databases. A setup script (`scripts/setup.sh`) automates dependency installation, `.env` creation, and prerequisite validation.
+
 ## 5. Pipeline Design
 
 ### 5.1 Origin: Python to TypeScript
@@ -506,7 +508,7 @@ The goal is not multi-tenant SaaS but a **portable, customizable tool** that any
 
 ### 9.2 Minimal Changes for Portability
 
-1. **Setup wizard** — a first-run experience that collects Drive folder ID, API keys, and event branding. Currently these are environment variables; a wizard would write them to Vercel env vars or a config file.
+1. **Setup wizard** — a first-run experience that collects Drive folder ID, API keys, and event branding. The current `scripts/setup.sh` handles local setup; the next step is a web-based wizard that writes directly to Vercel env vars or a config file.
 2. **Auth improvement** — replace the prototype cookie with Supabase Auth or a simple hashed-password check. The organizer sets a password during setup.
 3. **Dynamic theming** — extract the current hardcoded color scheme (matrix green/magenta) into CSS custom properties or a theme config. The organizer picks colors during setup.
 4. **Face matching as optional** — already partially implemented (face-embed phase skips if `FACE_API_URL` is unset). The UI should gracefully hide face matching features when the service isn't configured.
diff --git a/README.md b/README.md
index 9e2a42d..404946b 100644
--- a/README.md
+++ b/README.md
@@ -97,47 +97,30 @@ Next.js 15 App (Vercel)
 
 ## Setup
 
-### Prerequisites
-
-- Node.js 18+
-- Google Cloud project with **Drive API** enabled + API key
-- Google Drive folder with event photos (shared as "Anyone with the link can view")
-- Supabase project with pgvector extension enabled
-- Gemini API key (from [aistudio.google.com](https://aistudio.google.com))
-- (Optional) InsightFace microservice deployed for face matching
-
-### 1. Clone and install
+### Quick Start
 
 ```bash
 git clone <repo-url>
 cd eventlens
-npm install
+./scripts/setup.sh
 ```
 
-### 2. Set up Supabase
+The setup script checks prerequisites, installs dependencies, creates your `.env` file, validates required keys, and prints next steps.
 
-Create a Supabase project, then run the 12 migrations in order via the SQL Editor:
+### Database
 
-```
-supabase/migrations/001_match_faces.sql
-supabase/migrations/002_search_photos.sql
-supabase/migrations/003_description_embeddings.sql
-supabase/migrations/004_match_sessions.sql
-supabase/migrations/005_add_face_embedding_unique_constraint.sql
-supabase/migrations/006_add_face_embedding_hnsw_index.sql
-supabase/migrations/007_match_session_analytics.sql
-supabase/migrations/008_drive_file_id_canonical.sql
-supabase/migrations/009_sentinel_face_embedding_guard.sql
-supabase/migrations/010_phash_dedup.sql
-supabase/migrations/011_auto_tags.sql
-supabase/migrations/012_allow_null_face_embedding.sql
+Create a [Supabase](https://supabase.com) project with the pgvector extension enabled, then run the bootstrap schema:
+
+```sql
+-- Paste into Supabase SQL Editor → Run
+-- File: supabase/schema.sql
 ```
 
-These create the `photos` and `face_embeddings` tables, pgvector indexes (HNSW for face embeddings), full-text search with tsvector, RPC functions for similarity search, and match session analytics.
+This single file creates all tables, indexes, and RPC functions. It's the consolidated equivalent of the 12 incremental migrations in `supabase/migrations/` — use those if you need to apply changes to an existing deployment.
 
-### 3. Deploy the face-api service (optional)
+### Face Matching (Optional)
 
-The InsightFace microservice lives in `services/face-api/`. It's a Flask app that accepts base64 images and returns face embeddings + bounding boxes. Deploy to Railway, Render, or Fly.io:
+The InsightFace microservice lives in `services/face-api/`. Deploy to Railway, Render, or Fly.io:
 
 ```bash
 cd services/face-api
@@ -145,12 +128,12 @@ docker build -t face-api .
 # Deploy to your platform of choice
 ```
 
-The service uses InsightFace's `buffalo_l` model (512-dim embeddings). It needs ~2GB RAM and takes 30-60s to cold start while downloading the model.
+Uses InsightFace's `buffalo_l` model (512-dim embeddings). Needs ~2GB RAM, 30-60s cold start for model download.
 
-### 4. Configure environment
+### Environment
 
 ```bash
-cp .env.example .env.local
+cp .env.example .env
 ```
 
 Fill in the values:
@@ -317,7 +300,11 @@ services/face-api/                       # InsightFace microservice (Flask + Doc
 │   ├── Dockerfile
 │   └── requirements.txt
 │
-supabase/migrations/                     # 12 SQL migrations
+scripts/setup.sh                         # Guided local setup
+│
+supabase/
+│   ├── schema.sql                       # Bootstrap: full schema in one file
+│   └── migrations/                      # 12 incremental migrations
 ```
 
 ---
diff --git a/scripts/setup.sh b/scripts/setup.sh
new file mode 100755
index 0000000..88ac31b
--- /dev/null
+++ b/scripts/setup.sh
@@ -0,0 +1,128 @@
+#!/usr/bin/env bash
+# EventLens — Local Development Setup
+# Usage: ./scripts/setup.sh
+#
+# This script:
+#   1. Checks prerequisites (Node.js, npm)
+#   2. Installs dependencies
+#   3. Creates .env from .env.example (if needed)
+#   4. Validates required environment variables
+#   5. Prints next steps (Supabase schema, optional face-api)
+#
+# @TheTechMargin 2026
+
+set -euo pipefail
+cd "$(dirname "${BASH_SOURCE[0]}")/.."   # run from the repo root so .env, .env.example and npm resolve
+# ── Colors ──────────────────────────────────────────────────────
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+CYAN='\033[0;36m'
+NC='\033[0m' # No Color
+# Log helpers: print a coloured status marker followed by the message passed as $1.
+info()    { echo -e "${CYAN}ℹ${NC}  $1"; }
+success() { echo -e "${GREEN}✓${NC}  $1"; }
+warn()    { echo -e "${YELLOW}⚠${NC}  $1"; }
+fail()    { echo -e "${RED}✗${NC}  $1"; }
+
+echo ""
+echo -e "${CYAN}╔══════════════════════════════════════╗${NC}"
+echo -e "${CYAN}║       EventLens — Setup Script       ║${NC}"
+echo -e "${CYAN}╚══════════════════════════════════════╝${NC}"
+echo ""
+
+# ── Step 1: Prerequisites ──────────────────────────────────────
+info "Checking prerequisites..."
+
+if ! command -v node &> /dev/null; then
+  fail "Node.js is not installed. Install it from https://nodejs.org (v18+)"
+  exit 1
+fi
+# `node -v` already prints a leading "v" (e.g. v18.17.0); strip it and keep only the major version.
+NODE_VERSION=$(node -v | sed 's/v//' | cut -d. -f1)
+if [ "$NODE_VERSION" -lt 18 ]; then
+  fail "Node.js v18+ required (found $(node -v))"
+  exit 1
+fi
+success "Node.js $(node -v)"
+
+if ! command -v npm &> /dev/null; then
+  fail "npm is not installed"
+  exit 1
+fi
+success "npm $(npm -v)"
+
+# ── Step 2: Install dependencies ───────────────────────────────
+info "Installing dependencies..."
+npm install --silent
+success "Dependencies installed"
+
+# ── Step 3: Environment file ───────────────────────────────────
+# An existing .env is never overwritten; otherwise it is seeded from the
+# template, and the script aborts when the template itself is missing.
+if [[ -f .env ]]; then
+  success ".env already exists"
+elif [[ -f .env.example ]]; then
+  cp .env.example .env
+  warn "Created .env from .env.example — fill in your keys before running"
+else
+  fail ".env.example not found"
+  exit 1
+fi
+
+# ── Step 4: Validate required env vars ─────────────────────────
+info "Checking environment variables..."
+
+MISSING=()
+
+check_var() {
+  local val
+  val=$(grep "^$1=" .env 2>/dev/null | cut -d= -f2-)
+  if [ -z "$val" ]; then
+    MISSING+=("$1")
+  fi
+}
+
+check_var "GOOGLE_API_KEY"
+check_var "GOOGLE_DRIVE_FOLDER_ID"
+check_var "GEMINI_API_KEY"
+check_var "NEXT_PUBLIC_SUPABASE_URL"
+check_var "SUPABASE_SERVICE_ROLE_KEY"
+check_var "APP_PASSWORD"
+check_var "ADMIN_API_SECRET"
+
+if [ ${#MISSING[@]} -gt 0 ]; then
+  warn "Missing environment variables in .env:"
+  for var in "${MISSING[@]}"; do
+    echo -e "     ${YELLOW}→${NC} $var"
+  done
+  echo ""
+else
+  success "All required environment variables set"
+fi
+
+# ── Step 5: Next steps ─────────────────────────────────────────
+echo ""
+echo -e "${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
+echo -e "${CYAN}  Next Steps${NC}"
+echo -e "${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
+echo ""
+echo "  1. Fill in any missing .env values"
+echo ""
+echo "  2. Set up your Supabase database:"
+echo "     → Go to your Supabase project SQL Editor"
+echo "     → Paste and run: supabase/schema.sql"
+echo "     (This creates all tables, indexes, and functions)"
+echo ""
+echo "  3. Start the dev server:"
+echo "     npm run dev"
+echo ""
+echo "  4. Open the admin panel to run the pipeline:"
+echo "     http://localhost:3000/admin"
+echo ""
+echo -e "  ${YELLOW}Optional:${NC} Face matching"
+echo "     → Deploy services/face-api/ to Railway/Render"
+echo "     → Set FACE_API_URL and FACE_API_SECRET in .env"
+echo ""
+echo -e "${GREEN}Setup complete!${NC}"
+echo ""
diff --git a/supabase/schema.sql b/supabase/schema.sql
new file mode 100644
index 0000000..b383ee3
--- /dev/null
+++ b/supabase/schema.sql
@@ -0,0 +1,353 @@
+-- EventLens: Complete Database Schema
+-- Run this file once to bootstrap a fresh Supabase project.
+-- For incremental updates, use the numbered migrations in ./migrations/
+--
+-- Prerequisites:
+--   1. Create a Supabase project at https://supabase.com
+--   2. Enable the pgvector extension (Database → Extensions → vector)
+--   3. Run this file in the SQL Editor
+--
+-- @TheTechMargin 2026
+
+-- ============================================================
+-- Extensions
+-- ============================================================
+create extension if not exists vector;    -- pgvector: vector(n) columns, <=> distance operator, HNSW indexes
+create extension if not exists pg_trgm;   -- trigram matching: similarity() and gin_trgm_ops used further down
+
+-- ============================================================
+-- Core Tables
+-- ============================================================
+
+-- Photos: one row per Google Drive image
+create table if not exists photos (
+  id                    uuid default gen_random_uuid() primary key,
+  drive_file_id         text not null unique,       -- stable Google Drive ID; face_embeddings.drive_file_id references it
+  filename              text,
+  drive_url             text,
+  folder                text,
+  visible_text          text,                        -- OCR / text in image
+  people_descriptions   text,                        -- Gemini-generated people descriptions
+  scene_description     text,                        -- Gemini-generated scene description
+  face_count            int,
+  status                text default 'pending',      -- pending | completed | error; search_photos* return only 'completed'
+  error_message         text,
+  processed_at          timestamptz,
+  created_at            timestamptz default now(),
+
+  -- Semantic search: 768-dim Gemini text-embedding-004
+  description_embedding vector(768),
+
+  -- Perceptual hash for near-duplicate detection (compared bit-by-bit in find_duplicate_clusters)
+  phash                 bigint,
+
+  -- Soft-delete for duplicate hiding; search_photos and search_photos_semantic skip rows where this is true
+  hidden                boolean default false,
+
+  -- Auto-generated thematic tag
+  auto_tag              text,
+
+  -- Full-text search (stored generated column, recomputed from the five text columns listed inside)
+  search_vector         tsvector generated always as (
+    to_tsvector('english',
+      coalesce(visible_text, '') || ' ' ||
+      coalesce(people_descriptions, '') || ' ' ||
+      coalesce(scene_description, '') || ' ' ||
+      coalesce(filename, '') || ' ' ||
+      coalesce(folder, '')
+    )
+  ) stored
+);
+
+-- Face embeddings: one row per detected face (or sentinel for faceless photos); rows are deleted with their photo
+create table if not exists face_embeddings (
+  id            uuid default gen_random_uuid() primary key,
+  drive_file_id text not null references photos(drive_file_id) on delete cascade,
+  filename      text,
+  folder        text,
+  face_index    int,                      -- -1 = sentinel (no faces detected)
+  embedding     vector(512),              -- NULL for sentinel rows; match_faces ignores NULL embeddings
+  bbox_x1       float8,
+  bbox_y1       float8,
+  bbox_x2       float8,
+  bbox_y2       float8,
+  created_at    timestamptz default now(),
+  -- NOTE(review): face_index is nullable and NULLs never collide in a unique constraint — confirm writers always set it.
+  constraint face_embeddings_drive_file_id_face_index_key
+    unique (drive_file_id, face_index)
+);
+
+-- Match sessions: analytics for every face-search query (no selfie stored; NOTE(review): query_embedding is a face vector — confirm it fits the PII policy)
+create table if not exists match_sessions (
+  id                uuid default gen_random_uuid() primary key,
+  created_at        timestamptz default now() not null,
+  tier              text not null,                -- vector | text | visual | both
+  match_count       int not null default 0,       -- hot-photo, similar-session and co-occurrence RPCs skip sessions where this is 0
+  top_confidence    int,                          -- NOTE(review): scale is not defined in this file — confirm against the writer
+  query_embedding   vector(512),                  -- face embedding (not the selfie)
+  matched_photo_ids text[] default '{}'           -- unnested per id by the analytics RPCs; presumably photos.drive_file_id values — TODO confirm
+);
+
+-- ============================================================
+-- Indexes
+-- ============================================================
+
+-- Photos: GIN for full-text and trigram search, HNSW for semantic search, partial indexes for sparse columns
+create index if not exists idx_photos_search_vector
+  on photos using gin(search_vector);
+create index if not exists idx_photos_visible_text_trgm
+  on photos using gin(visible_text gin_trgm_ops);
+create index if not exists idx_photos_people_desc_trgm
+  on photos using gin(people_descriptions gin_trgm_ops);
+create index if not exists idx_photos_scene_desc_trgm
+  on photos using gin(scene_description gin_trgm_ops);
+create index if not exists idx_photos_description_embedding
+  on photos using hnsw (description_embedding vector_cosine_ops)
+  with (m = 16, ef_construction = 64);
+create index if not exists idx_photos_phash
+  on photos (phash) where phash is not null;
+create index if not exists idx_photos_hidden
+  on photos (hidden) where hidden = true;  -- NOTE(review): covers only hidden rows; the "hidden is not true" search filters cannot use it
+create index if not exists idx_photos_auto_tag
+  on photos (auto_tag) where auto_tag is not null;
+
+-- Face embeddings (partial index — excludes NULL sentinel embeddings, matching the filter in match_faces)
+create index if not exists idx_face_embeddings_hnsw
+  on face_embeddings using hnsw (embedding vector_cosine_ops)
+  with (m = 16, ef_construction = 64)
+  where embedding is not null;
+
+-- Match sessions: created_at for the time-window filters, GIN on the id array for the && overlap test
+create index if not exists idx_match_sessions_created
+  on match_sessions (created_at desc);
+create index if not exists idx_match_sessions_photo_ids
+  on match_sessions using gin (matched_photo_ids);
+
+-- ============================================================
+-- RPC Functions
+-- ============================================================
+
+-- match_faces: stored faces closest to a 512-dim probe embedding, by cosine similarity.
+--   match_threshold  keep rows whose similarity (1 - cosine distance) exceeds this value
+--   match_count      maximum number of rows returned, closest first
+-- Rows whose embedding is NULL are never returned.
+create or replace function match_faces(
+  query_embedding vector(512),
+  match_threshold float default 0.6,
+  match_count int default 20
+)
+returns table (
+  drive_file_id text, filename text, folder text, face_index int,
+  similarity float,
+  bbox_x1 float8, bbox_y1 float8, bbox_x2 float8, bbox_y2 float8
+)
+language plpgsql as $$
+begin
+  return query
+    select
+      f.drive_file_id,
+      f.filename,
+      f.folder,
+      f.face_index,
+      1 - (f.embedding <=> query_embedding) as similarity,
+      f.bbox_x1, f.bbox_y1, f.bbox_x2, f.bbox_y2
+    from face_embeddings as f
+    where f.embedding is not null
+      and 1 - (f.embedding <=> query_embedding) > match_threshold
+    order by f.embedding <=> query_embedding
+    limit match_count;
+end;
+$$;
+
+-- Hybrid text search: full-text (ts_rank_cd) + trigram similarity + substring ILIKE.
+-- rank = 10 * full-text rank + 5 * best trigram similarity; each term is coalesced to 0 so a
+-- photo whose text columns are all NULL ranks 0 instead of NULL (NULL sorts first under DESC).
+-- NOTE(review): query_text is not escaped for ILIKE, so % and _ in it act as wildcards.
+create or replace function search_photos(query_text text, result_limit int default 50)
+returns table (
+  id uuid, drive_file_id text, filename text, drive_url text, folder text,
+  visible_text text, people_descriptions text, scene_description text,
+  face_count int, processed_at timestamptz, created_at timestamptz, rank float
+)
+language plpgsql as $$
+declare
+  tsquery_val tsquery;
+begin
+  tsquery_val := plainto_tsquery('english', query_text);
+  return query
+    select
+      p.id, p.drive_file_id, p.filename, p.drive_url, p.folder,
+      p.visible_text, p.people_descriptions, p.scene_description,
+      p.face_count, p.processed_at, p.created_at,
+      (
+        coalesce(ts_rank_cd(p.search_vector, tsquery_val, 32), 0) * 10 +
+        coalesce(greatest(
+          similarity(p.visible_text, query_text),
+          similarity(p.people_descriptions, query_text),
+          similarity(p.scene_description, query_text),
+          similarity(p.filename, query_text)
+        ), 0) * 5
+      )::float as rank
+    from photos p
+    where p.status = 'completed'
+      and (p.hidden is not true)
+      and (
+        p.search_vector @@ tsquery_val
+        or similarity(p.visible_text, query_text) > 0.1
+        or similarity(p.people_descriptions, query_text) > 0.1
+        or similarity(p.scene_description, query_text) > 0.1
+        or similarity(p.filename, query_text) > 0.15
+        or p.visible_text ilike '%' || query_text || '%'
+        or p.people_descriptions ilike '%' || query_text || '%'
+        or p.scene_description ilike '%' || query_text || '%'
+      )
+    order by rank desc
+    limit result_limit;
+end;
+$$;
+
+-- search_photos_semantic: rank photos by cosine similarity between description_embedding and
+-- the 768-dim query_embedding. Returns completed, non-hidden photos that have an embedding and
+-- score above match_threshold, closest first, capped at match_count rows.
+create or replace function search_photos_semantic(
+  query_embedding vector(768), match_threshold float default 0.5, match_count int default 30
+)
+returns table (
+  id uuid, drive_file_id text, filename text, drive_url text, folder text,
+  visible_text text, people_descriptions text, scene_description text,
+  face_count int, similarity float
+)
+language plpgsql as $$
+begin
+  return query
+    select
+      ph.id, ph.drive_file_id, ph.filename, ph.drive_url, ph.folder,
+      ph.visible_text, ph.people_descriptions, ph.scene_description,
+      ph.face_count,
+      1 - (ph.description_embedding <=> query_embedding) as similarity
+    from photos as ph
+    where ph.status = 'completed'
+      and ph.hidden is not true
+      and ph.description_embedding is not null
+      and 1 - (ph.description_embedding <=> query_embedding) > match_threshold
+    order by ph.description_embedding <=> query_embedding
+    limit match_count;
+end;
+$$;
+
+-- get_recent_match_activity: match_sessions rows created within the past hours_back hours,
+-- newest first, at most max_results rows.
+create or replace function get_recent_match_activity(hours_back int default 24, max_results int default 20)
+returns table (
+  id uuid, created_at timestamptz, tier text, match_count int, top_confidence int
+)
+language sql stable as $$
+  select
+    s.id, s.created_at, s.tier,
+    s.match_count, s.top_confidence
+  from match_sessions as s
+  where s.created_at > now() - make_interval(hours => hours_back)
+  order by s.created_at desc
+  limit max_results;
+$$;
+
+-- Analytics: most-matched photos. Counts how often each id appears in matched_photo_ids
+-- across sessions from the last hours_back hours that had at least one match, and returns
+-- the top_n ids. Ties are broken by id so repeated calls return a stable order.
+create or replace function get_hot_photo_ids(top_n int default 10, hours_back int default 168)
+returns table (photo_id text, match_hit_count bigint)
+language sql stable as $$
+  select pid as photo_id, count(*) as match_hit_count
+  from match_sessions ms
+  cross join lateral unnest(ms.matched_photo_ids) as pid
+  where ms.created_at > now() - make_interval(hours => hours_back)
+    and ms.match_count > 0
+  group by pid
+  order by match_hit_count desc, pid
+  limit top_n;
+$$;
+
+-- Analytics: number of match_sessions rows that stored a face query embedding.
+-- Rows are not deduplicated by embedding, so this is a session count, not a count of distinct people.
+create or replace function get_unique_operatives_count() returns bigint
+language sql stable as $$
+  select count(ms.query_embedding) from match_sessions as ms;
+$$;
+
+-- find_similar_sessions: past sessions whose stored query embedding is close to
+-- probe_embedding. Keeps sessions with at least one match and a similarity
+-- (1 - cosine distance) above threshold; closest first, up to max_results rows.
+create or replace function find_similar_sessions(
+  probe_embedding vector(512), threshold float default 0.7, max_results int default 5
+)
+returns table (
+  id uuid, created_at timestamptz, tier text, match_count int,
+  top_confidence int, matched_photo_ids text[], similarity float
+)
+language sql stable as $$
+  select
+    s.id, s.created_at, s.tier, s.match_count,
+    s.top_confidence, s.matched_photo_ids,
+    1 - (s.query_embedding <=> probe_embedding) as similarity
+  from match_sessions as s
+  where s.match_count > 0
+    and s.query_embedding is not null
+    and 1 - (s.query_embedding <=> probe_embedding) > threshold
+  order by s.query_embedding <=> probe_embedding
+  limit max_results;
+$$;
+
+-- Analytics: co-occurrence recommendations. In sessions that matched any of user_photo_ids, counts the other
+-- matched ids, skipping user_photo_ids and exclude_photo_ids (NULL exclude list = empty). Ties break by id.
+create or replace function get_cooccurrence_recommendations(
+  user_photo_ids text[], exclude_photo_ids text[] default '{}', max_results int default 8
+)
+returns table (photo_id text, cooccurrence_count bigint)
+language sql stable as $$
+  select pid as photo_id, count(*) as cooccurrence_count
+  from match_sessions ms
+  cross join lateral unnest(ms.matched_photo_ids) as pid
+  where ms.matched_photo_ids && user_photo_ids
+    and pid <> all(user_photo_ids)
+    and pid <> all(coalesce(exclude_photo_ids, '{}'))
+    and ms.match_count > 0
+  group by pid
+  order by cooccurrence_count desc, pid
+  limit max_results;
+$$;
+
+-- find_duplicate_clusters: near-duplicate detection over perceptual hashes.
+-- Pairs every two completed photos whose phash values, compared as 64-bit strings,
+-- differ in at most hamming_threshold bits, then emits one row per pair member
+-- (two rows per pair), ordered by group and distance.
+create or replace function find_duplicate_clusters(hamming_threshold int default 10)
+returns table (
+  group_id bigint, photo_id uuid, drive_file_id text, filename text,
+  folder text, phash bigint, hamming_distance int, hidden boolean
+)
+language sql as $$
+  with close_pairs as (
+    select
+      lhs.id as id_a, lhs.drive_file_id as drive_file_id_a, lhs.filename as filename_a,
+      lhs.folder as folder_a, lhs.phash as phash_a, lhs.hidden as hidden_a,
+      rhs.id as id_b, rhs.drive_file_id as drive_file_id_b, rhs.filename as filename_b,
+      rhs.folder as folder_b, rhs.phash as phash_b, rhs.hidden as hidden_b,
+      bit_count((lhs.phash # rhs.phash)::bit(64))::int as bits_apart
+    from photos as lhs
+    inner join photos as rhs on lhs.id < rhs.id  -- each unordered pair appears once
+    where lhs.status = 'completed' and rhs.status = 'completed'
+      and lhs.phash is not null and rhs.phash is not null
+      and bit_count((lhs.phash # rhs.phash)::bit(64))::int <= hamming_threshold
+  )
+  -- group_id ranks pairs by their smaller photo id, so pairs sharing that photo share a group.
+  select dense_rank() over (order by least(id_a, id_b)) as group_id,
+    id_a as photo_id, drive_file_id_a as drive_file_id, filename_a as filename,
+    folder_a as folder, phash_a as phash, bits_apart as hamming_distance, hidden_a as hidden
+  from close_pairs
+  union all
+  select dense_rank() over (order by least(id_a, id_b)) as group_id,
+    id_b as photo_id, drive_file_id_b as drive_file_id, filename_b as filename,
+    folder_b as folder, phash_b as phash, bits_apart as hamming_distance, hidden_b as hidden
+  from close_pairs
+  order by group_id, hamming_distance;
+$$;