Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions container/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ ARG CLAUDE_CODE_VERSION=2.1.128
ARG AGENT_BROWSER_VERSION=latest
ARG VERCEL_VERSION=52.2.1
ARG BUN_VERSION=1.3.12
ARG GMAIL_MCP_VERSION=1.1.11
ARG MCP_REMOTE_VERSION=0.1.29

# ---- System dependencies -----------------------------------------------------
# tini: correct PID 1 / signal forwarding so outbound.db writes finalize on
Expand All @@ -49,6 +51,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
ca-certificates \
curl \
git \
poppler-utils \
tini \
unzip \
&& if [ "$INSTALL_CJK_FONTS" = "true" ]; then \
Expand Down Expand Up @@ -110,6 +113,18 @@ RUN --mount=type=cache,target=/root/.cache/pnpm \
RUN --mount=type=cache,target=/root/.cache/pnpm \
pnpm install -g "@anthropic-ai/claude-code@${CLAUDE_CODE_VERSION}"

RUN --mount=type=cache,target=/root/.cache/pnpm \
pnpm install -g \
"@gongrzhe/server-gmail-autoauth-mcp@${GMAIL_MCP_VERSION}" \
"[email protected]"

RUN --mount=type=cache,target=/root/.cache/pnpm \
pnpm install -g "mcp-remote@${MCP_REMOTE_VERSION}"

# ---- pdf-reader CLI (fork addition) -------------------------------------------
COPY skills/pdf-reader/pdf-reader /usr/local/bin/pdf-reader
RUN chmod +x /usr/local/bin/pdf-reader

# ---- ncl CLI wrapper ----------------------------------------------------------
# Actual script lives in the mounted source at /app/src/cli/ncl.ts.
RUN printf '#!/bin/sh\nexec bun /app/src/cli/ncl.ts "$@"\n' > /usr/local/bin/ncl && \
Expand Down
94 changes: 94 additions & 0 deletions container/skills/pdf-reader/SKILL.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
---
name: pdf-reader
description: Read and extract text from PDF files — documents, reports, contracts, spreadsheets. Use whenever you need to read PDF content, not just when explicitly asked. Handles local files, URLs, and WhatsApp attachments.
allowed-tools: Bash(pdf-reader:*)
---

# PDF Reader

## Quick start

```bash
pdf-reader extract report.pdf # Extract all text
pdf-reader extract report.pdf --layout # Preserve tables/columns
pdf-reader fetch https://example.com/doc.pdf # Download and extract
pdf-reader info report.pdf # Show metadata + size
pdf-reader list # List all PDFs in the current directory tree
```

## Commands

### extract — Extract text from PDF

```bash
pdf-reader extract <file> # Full text to stdout
pdf-reader extract <file> --layout # Preserve layout (tables, columns)
pdf-reader extract <file> --pages 1-5 # Pages 1 through 5
pdf-reader extract <file> --pages 3-3 # Single page (page 3)
pdf-reader extract <file> --layout --pages 2-10 # Layout + page range
```

Options:
- `--layout` — Maintains spatial positioning. Essential for tables, spreadsheets, multi-column docs.
- `--pages N-M` — Extract only pages N through M (1-based, inclusive).

### fetch — Download and extract PDF from URL

```bash
pdf-reader fetch <url> # Download, verify, extract with layout
pdf-reader fetch <url> report.pdf # Also save a local copy
```

Downloads the PDF, verifies it has a valid `%PDF` header, then extracts text with layout preservation. Temporary files are cleaned up automatically.

### info — PDF metadata and file size

```bash
pdf-reader info <file>
```

Shows title, author, page count, page size, PDF version, and file size on disk.

### list — Find all PDFs in directory tree

```bash
pdf-reader list
```

Recursively lists all `.pdf` files under the current working directory, showing each file's page count and size on disk.

## WhatsApp PDF attachments

When a user sends a PDF on WhatsApp, it is automatically saved to the `attachments/` directory. The message will include a path hint like:

> [PDF attached: attachments/document.pdf]

To read the attached PDF:

```bash
pdf-reader extract attachments/document.pdf --layout
```

## Example workflows

### Read a contract and summarize key terms

```bash
pdf-reader info attachments/contract.pdf
pdf-reader extract attachments/contract.pdf --layout
```

### Extract specific pages from a long report

```bash
pdf-reader info report.pdf # Check total pages
pdf-reader extract report.pdf --pages 1-3 # Executive summary
pdf-reader extract report.pdf --pages 15-20 # Financial tables
```

### Fetch and analyze a public document

```bash
pdf-reader fetch https://example.com/annual-report.pdf report.pdf
pdf-reader info report.pdf
```
203 changes: 203 additions & 0 deletions container/skills/pdf-reader/pdf-reader
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
#!/bin/bash
# Strict mode: abort on a failing command (-e), on use of an unset variable
# (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# pdf-reader — CLI wrapper around poppler-utils (pdftotext, pdfinfo)
# Provides extract, fetch, info, list commands for PDF processing.

# Version string reported by `pdf-reader version`. Marked read-only so nothing
# later in the script can overwrite it by accident.
readonly VERSION="1.0.0"

# Print the help text (synopsis, command list, extract options) to stdout.
# Takes no arguments and writes nothing to stderr.
usage() {
  # One array element per output line; empty elements produce blank lines.
  local -a help_lines=(
    'pdf-reader — Extract text and metadata from PDF files'
    ''
    'Usage:'
    'pdf-reader extract <file> [--layout] [--pages N-M]'
    'pdf-reader fetch <url> [filename]'
    'pdf-reader info <file>'
    'pdf-reader list'
    'pdf-reader help'
    ''
    'Commands:'
    'extract Extract text from a PDF file to stdout'
    'fetch Download a PDF from a URL and extract text'
    'info Show PDF metadata and file size'
    'list List all PDFs in current directory tree'
    'help Show this help message'
    ''
    'Extract options:'
    '--layout Preserve original layout (tables, columns)'
    '--pages Page range to extract (e.g. 1-5, 3-3 for single page)'
  )
  printf '%s\n' "${help_lines[@]}"
}

#######################################
# Extract text from a PDF file to stdout using pdftotext.
# Arguments:
#   <file>          Path to the PDF (required, exactly one).
#   --layout        Pass -layout to pdftotext.
#   --pages RANGE   Page range: N-M, N (single page), N- (from page N on)
#                   or -M (up to page M). Only digits and one dash allowed.
# Outputs:
#   Extracted text on stdout; error messages on stderr.
# Exits:
#   1 on a missing/unknown/extra argument, a malformed page range, or a
#   file that does not exist.
#######################################
cmd_extract() {
  local file=""
  local layout=false
  local first_page=""
  local last_page=""
  local range=""
  # Digits, optionally followed by one dash and more digits. Either side may
  # be empty; the "at least one digit" check below rejects "" and "-".
  local -r page_range_re='^[0-9]*(-[0-9]*)?$'

  # Parse arguments
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --layout)
        layout=true
        shift
        ;;
      --pages)
        if [[ -z "${2:-}" ]]; then
          echo "Error: --pages requires a range argument (e.g. 1-5)" >&2
          exit 1
        fi
        range="$2"
        # Validate before splitting: without this, input such as "1-2-3",
        # "abc" or a following flag ("--layout") would be cut at a dash and
        # handed to pdftotext as bogus -f/-l values.
        if [[ ! "$range" =~ $page_range_re || "$range" != *[0-9]* ]]; then
          echo "Error: Invalid page range: $range (expected N-M, e.g. 1-5)" >&2
          exit 1
        fi
        # With no dash both expansions return the whole value, so "3" means
        # first page 3 and last page 3.
        first_page="${range%-*}"
        last_page="${range#*-}"
        shift 2
        ;;
      -*)
        echo "Error: Unknown option: $1" >&2
        exit 1
        ;;
      *)
        if [[ -z "$file" ]]; then
          file="$1"
        else
          echo "Error: Unexpected argument: $1" >&2
          exit 1
        fi
        shift
        ;;
    esac
  done

  if [[ -z "$file" ]]; then
    echo "Error: No file specified" >&2
    echo "Usage: pdf-reader extract <file> [--layout] [--pages N-M]" >&2
    exit 1
  fi

  if [[ ! -f "$file" ]]; then
    echo "Error: File not found: $file" >&2
    exit 1
  fi

  # Build pdftotext arguments
  local args=()
  if [[ "$layout" == true ]]; then
    args+=(-layout)
  fi
  if [[ -n "$first_page" ]]; then
    args+=(-f "$first_page")
  fi
  if [[ -n "$last_page" ]]; then
    args+=(-l "$last_page")
  fi

  # ${args[@]+...} expands to nothing when the array is empty, which keeps
  # `set -u` from tripping on older bash versions. "-" sends text to stdout.
  pdftotext ${args[@]+"${args[@]}"} "$file" -
}

#######################################
# Download a PDF from a URL and print its text (layout mode) to stdout.
# Arguments:
#   $1 - URL to download (required).
#   $2 - optional path; when given, a copy of the PDF is saved there.
# Outputs:
#   Extracted text on stdout; progress and error messages on stderr.
# Side effects:
#   Installs an EXIT trap that deletes the temporary download.
# Exits:
#   1 when no URL is given, the download fails (including HTTP errors),
#   or the downloaded data does not start with "%PDF".
#######################################
cmd_fetch() {
  local url="${1:-}"
  local filename="${2:-}"

  if [[ -z "$url" ]]; then
    echo "Error: No URL specified" >&2
    echo "Usage: pdf-reader fetch <url> [filename]" >&2
    exit 1
  fi

  # Create temporary file
  local tmpfile
  tmpfile="$(mktemp /tmp/pdf-reader-XXXXXX.pdf)"

  # Bake the path into the trap command now. The EXIT trap normally runs after
  # this function has returned, when the local $tmpfile no longer exists; a
  # trap that expands $tmpfile lazily would then hit an unbound variable under
  # `set -u` and leave the file behind. %q keeps the embedded path shell-safe.
  local cleanup_cmd
  printf -v cleanup_cmd 'rm -f -- %q' "$tmpfile"
  # shellcheck disable=SC2064 # early expansion is intentional, see above
  trap "$cleanup_cmd" EXIT

  # Download. -f makes HTTP errors (404, 500, ...) fail instead of saving the
  # error page; -S still prints curl's own error message despite -s.
  echo "Downloading: $url" >&2
  if ! curl -fsSL -o "$tmpfile" "$url"; then
    echo "Error: Failed to download: $url" >&2
    exit 1
  fi

  # Verify PDF header
  local header
  header="$(head -c 4 "$tmpfile")"
  if [[ "$header" != "%PDF" ]]; then
    echo "Error: Downloaded file is not a valid PDF (header: $header)" >&2
    exit 1
  fi

  # Save with name if requested
  if [[ -n "$filename" ]]; then
    cp -- "$tmpfile" "$filename"
    echo "Saved to: $filename" >&2
  fi

  # Extract with layout
  pdftotext -layout "$tmpfile" -
}

#######################################
# Show pdfinfo's metadata for a PDF, then a blank line and its size on disk.
# Arguments:
#   $1 - path to the PDF file (required).
# Outputs:
#   pdfinfo output and a "File size:" line on stdout; errors on stderr.
# Exits:
#   1 when no path is given or the path is not an existing regular file.
#######################################
cmd_info() {
  local pdf_path="${1:-}"

  # Guard: a path must be supplied.
  [[ -n "$pdf_path" ]] || {
    echo "Error: No file specified" >&2
    echo "Usage: pdf-reader info <file>" >&2
    exit 1
  }

  # Guard: the path must point at a regular file.
  [[ -f "$pdf_path" ]] || {
    echo "Error: File not found: $pdf_path" >&2
    exit 1
  }

  pdfinfo "$pdf_path"
  # du -h prints "<size><TAB><path>"; cut keeps only the size column.
  printf '\nFile size: %s\n' "$(du -h "$pdf_path" | cut -f1)"
}

#######################################
# List every *.pdf file below the current directory with its page count and
# size on disk, one file per line.
# Arguments:
#   none
# Outputs:
#   One formatted line per PDF on stdout; a notice on stderr when none exist.
#   The page count is shown as "?" when pdfinfo reports no "Pages:" line.
# Side effects:
#   Enables the nullglob and globstar shell options for the rest of the run.
#######################################
cmd_list() {
  local found=false
  # Declared local so the loop does not leak globals into the caller.
  local pdf target page_line pages size

  # Use globbing to find PDFs (globstar makes **/ match recursively)
  shopt -s nullglob globstar

  # Use associative array to deduplicate (*.pdf overlaps with **/*.pdf)
  local -A seen=()
  for pdf in *.pdf **/*.pdf; do
    # Skip directories (or other non-files) that merely end in ".pdf".
    [[ -f "$pdf" ]] || continue

    # Look the key up with a plain expansion rather than `[[ -v seen[...] ]]`:
    # on older bash, -v evaluates the subscript a second time, so a file name
    # containing "$(...)" could run commands.
    [[ -n "${seen[$pdf]:-}" ]] && continue
    seen["$pdf"]=1
    found=true

    # A name starting with "-" would be parsed as an option by du/pdfinfo;
    # prefixing "./" keeps it a path while the displayed name stays unchanged.
    target="$pdf"
    if [[ "$pdf" == -* ]]; then
      target="./$pdf"
    fi

    pages="?"
    size="$(du -h "$target" | cut -f1)"

    # Try to get page count from pdfinfo
    if page_line="$(pdfinfo "$target" 2>/dev/null | grep '^Pages:')"; then
      pages="$(awk '{print $2}' <<<"$page_line")"
    fi

    printf "%-60s %5s pages %8s\n" "$pdf" "$pages" "$size"
  done

  if [[ "$found" == false ]]; then
    echo "No PDF files found in current directory tree." >&2
  fi
}

# Main dispatch
# Route the first command-line argument to its subcommand handler and pass the
# remaining arguments through. With no arguments the help text is shown; an
# unrecognised command prints an error to stderr and exits with status 1.
main() {
  local subcommand="${1:-help}"
  # Drop the subcommand name, if one was given, so "$@" holds only its arguments.
  if (( $# > 0 )); then
    shift
  fi

  case "$subcommand" in
    help|--help|-h)
      usage
      ;;
    version|--version|-v)
      echo "pdf-reader $VERSION"
      ;;
    extract)
      cmd_extract "$@"
      ;;
    fetch)
      cmd_fetch "$@"
      ;;
    info)
      cmd_info "$@"
      ;;
    list)
      cmd_list
      ;;
    *)
      echo "Error: Unknown command: $subcommand" >&2
      echo "Run 'pdf-reader help' for usage." >&2
      exit 1
      ;;
  esac
}

main "$@"
7 changes: 6 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,15 @@
"@clack/core": "^1.2.0",
"@clack/prompts": "^1.2.0",
"@onecli-sh/sdk": "^0.5.0",
"@types/qrcode": "1.5.6",
"@whiskeysockets/baileys": "7.0.0-rc.9",
"better-sqlite3": "11.10.0",
"chat": "^4.24.0",
"cron-parser": "5.5.0",
"kleur": "^4.1.5"
"kleur": "^4.1.5",
"openai": "4.104.0",
"pino": "9.6.0",
"qrcode": "1.5.4"
},
"devDependencies": {
"@eslint/js": "^9.35.0",
Expand Down
Loading
Loading