ExaDev · Mearman · Jul 30, 2025
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -78,8 +78,10 @@
     "@semantic-release/changelog": "^6.0.3",
     "@semantic-release/exec": "^7.1.0",
     "@semantic-release/git": "^10.0.1",
+    "@types/jsdom": "^21.1.7",
     "@types/mdast": "^4.0.4",
     "@types/node": "^24.0.1",
+    "@types/turndown": "^5.0.5",
     "@types/unist": "^3.0.0",
     "@typescript-eslint/eslint-plugin": "^8.34.0",
     "@typescript-eslint/parser": "^8.34.0",
@@ -108,11 +110,15 @@
   },
   "dependencies": {
     "@modelcontextprotocol/sdk": "^1.12.3",
+    "@mozilla/readability": "^0.6.0",
     "commander": "^14.0.0",
     "glob": "^11.0.3",
+    "jsdom": "^26.1.0",
+    "node-html-parser": "^7.0.1",
     "remark": "^15.0.0",
     "remark-parse": "^11.0.0",
     "remark-stringify": "^11.0.0",
+    "turndown": "^7.2.0",
     "unified": "^11.0.0",
     "unist-util-visit": "^5.0.0"
   },

diff --git a/src/cli.test.ts b/src/cli.test.ts
@@ -146,8 +146,8 @@ describe('CLI Entry Point', () => {
     it('should set action handlers for commands', async () => {
       await import('./cli.js');
 
-      // Should call action 9 times (once for each command: convert, move, split, join, merge, index, barrel, toc, validate)
-      expect(mockAction).toHaveBeenCalledTimes(9);
+      // Should call action 10 times (once for each command: clip, convert, move, split, join, merge, index, barrel, toc, validate)
+      expect(mockAction).toHaveBeenCalledTimes(10);
     });
 
     it('should add help text for convert command', async () => {

diff --git a/src/cli.ts b/src/cli.ts
@@ -1,6 +1,7 @@
 #!/usr/bin/env node
 
 import { Command } from 'commander';
+import { clipCommand } from './commands/clip.js';
 import { convertCommand } from './commands/convert.js';
 import { indexCommand } from './commands/index.js';
 import { joinCommand } from './commands/join.js';
@@ -17,6 +18,84 @@ program
   .description('CLI for markdown file operations with intelligent link refactoring')
   .version('0.1.0');
 
+// `clip` subcommand: registers the web-clipper arguments/options and hands the
+// parsed values to clipCommand.
+//
+// The numeric options use an explicit base-10 parser rather than passing
+// `parseInt` directly: commander invokes an option parser as (value, previous),
+// so a bare `parseInt` would receive the option's default value as its radix.
+program
+  .command('clip')
+  .description('Convert web pages to markdown (web clipper)')
+  .argument('<urls...>', 'URLs to clip or paths to files containing URLs (use --batch)')
+  .option('-o, --output <file>', 'Output file name (single URL only)')
+  .option('--output-dir <dir>', 'Output directory for clipped files')
+  .option('--batch', 'Process multiple URLs from input files')
+  .option(
+    '--strategy <strategy>',
+    'Extraction strategy: auto|readability|manual|full|structured',
+    'auto'
+  )
+  .option(
+    '--image-strategy <strategy>',
+    'Image handling: skip|link-only|download|base64',
+    'link-only'
+  )
+  .option('--image-dir <dir>', 'Directory for downloaded images', './images')
+  .option('--selectors <selectors>', 'CSS selectors for manual extraction (comma-separated)')
+  .option('--no-frontmatter', 'Skip frontmatter generation')
+  .option(
+    '--timeout <ms>',
+    'Request timeout in milliseconds',
+    (value) => Number.parseInt(value, 10),
+    30000
+  )
+  .option('--user-agent <agent>', 'Custom User-Agent string')
+  .option('--headers <headers>', 'Custom HTTP headers (JSON format)')
+  .option('--cookies <file>', 'Path to cookies file')
+  .option('--no-follow-redirects', 'Don\'t follow HTTP redirects')
+  .option(
+    '--max-redirects <count>',
+    'Maximum redirects to follow',
+    (value) => Number.parseInt(value, 10),
+    5
+  )
+  .option('-d, --dry-run', 'Show what would be clipped without creating files')
+  .option('-v, --verbose', 'Show detailed output with processing information')
+  .option('--json', 'Output results in JSON format')
+  .addHelpText(
+    'after',
+    `
+Examples:
+  $ markmv clip https://example.com/article
+  $ markmv clip https://example.com/article -o article.md
+  $ markmv clip urls.txt --batch --output-dir ./clipped
+  $ markmv clip https://docs.site.com --strategy manual --selectors "article,.content"
+  $ markmv clip https://blog.com/post --strategy readability --image-strategy download
+  $ markmv clip https://example.com --dry-run --verbose
+
+Extraction Strategies:
+  auto         Automatically choose best strategy based on content
+  readability  Mozilla Readability algorithm (best for articles/blogs)
+  manual       Extract using custom CSS selectors
+  full         Extract entire page content
+  structured   Use Schema.org and semantic markup
+
+Image Strategies:
+  skip         Don't process images at all
+  link-only    Keep images as external links (fastest)
+  download     Download images locally and update paths
+  base64       Embed small images as base64 (increases file size)
+
+Advanced Features:
+  --headers '{"Authorization": "Bearer token"}'    Custom headers for auth
+  --cookies cookies.txt                            Use cookies for protected content
+  --selectors "article,.post-content,main"        Custom content selectors
+  --timeout 60000                                  Extended timeout for slow sites
+  --user-agent "Custom Bot 1.0"                   Custom user agent string`
+  )
+  .action(clipCommand);
+
 program
   .command('convert')
   .description('Convert markdown link formats and path resolution')