diff --git a/.gitignore b/.gitignore index 74ca64e..1e6bb15 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,44 @@ +# Dependency directories node_modules/ -docs/ -npm-debug.log + +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* + +# Optional npm cache directory +.npm + +# Optional eslint cache +.eslintcache + +# Optional REPL history +.node_repl_history + +# Output of 'npm pack' +*.tgz + +# Yarn Integrity file +.yarn-integrity + +# dotenv environment variables file +.env +.env.test + +# parcel-bundler cache (https://parceljs.org/) +.cache + +# TypeScript build output +dist/ + +# IDE specific files +.vscode/ +.idea/ +*.swp +*.swo + +# OS generated files +.DS_Store +Thumbs.db \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index 2fb5535..9e9a1f1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,4 +1,16 @@ language: node_js node_js: -# - "0.10" - - 8 + - "14" + - "16" + - "18" + - "20" + +install: + - npm ci + +script: + - npm run lint + - npm test + +notifications: + email: false \ No newline at end of file diff --git a/Readme.md b/Readme.md index e75eac7..98ff3ea 100644 --- a/Readme.md +++ b/Readme.md @@ -1,87 +1,37 @@ -# Osmosis +# Node Osmosis Web Scraper -HTML/XML parser and web scraper for NodeJS. +## Project Overview +A robust and flexible web scraping library for Node.js. 
-[![NPM](https://nodei.co/npm/osmosis.png)](https://www.npmjs.com/package/osmosis) +## Prerequisites +- Node.js (>= 14.0.0) +- npm -[![Build Status](https://travis-ci.org/rchipka/node-osmosis.svg)](https://travis-ci.org/rchipka/node-osmosis) - -![Downloads](https://img.shields.io/npm/dm/osmosis.svg) - -## Features - -- Uses native libxml C bindings -- Clean promise-like interface -- Supports CSS 3.0 and XPath 1.0 selector hybrids -- [Sizzle selectors](https://github.com/jquery/sizzle/wiki#other-selectors-and-conventions), - [Slick selectors](http://mootools.net/core/docs/1.6.0/Slick/Slick), and - [more](https://github.com/rchipka/node-osmosis/blob/master/docs/Selectors.md) -- No large dependencies like jQuery, cheerio, or jsdom -- Compose deep and complex data structures - -- HTML parser features - - Fast parsing - - Very fast searching - - Small memory footprint - -- HTML DOM features - - Load and search ajax content - - DOM interaction and events - - Execute embedded and remote scripts - - Execute code in the DOM - -- HTTP request features - - Logs urls, redirects, and errors - - Cookie jar and custom cookies/headers/user agent - - Login/form submission, session cookies, and basic auth - - Single proxy or multiple proxies and handles proxy failure - - Retries and redirect limits - -## Example - -```javascript -var osmosis = require('osmosis'); +## Installation +```bash +npm install +``` -osmosis -.get('www.craigslist.org/about/sites') -.find('h1 + div a') -.set('location') -.follow('@href') -.find('header + div + div li > a') -.set('category') -.follow('@href') -.paginate('.totallink + a.button.next:first') -.find('p > a') -.follow('@href') -.set({ - 'title': 'section > h2', - 'description': '#postingbody', - 'subcategory': 'div.breadbox > span[4]', - 'date': 'time@datetime', - 'latitude': '#map@data-latitude', - 'longitude': '#map@data-longitude', - 'images': ['img@src'] -}) -.data(function(listing) { - // do something with listing data -}) -.log(console.log) 
-.error(console.log) -.debug(console.log) +## Running Tests +```bash +npm test ``` -## Documentation +## Development +- `npm run lint`: Run ESLint +- `npm run lint:fix`: Automatically fix linting issues -For documentation and examples check out [https://rchipka.github.io/node-osmosis/global.html](https://rchipka.github.io/node-osmosis/global.html) +## Configuration +The project uses ESLint for code quality and style checking. ## Dependencies - -- [libxmljs-dom](https://github.com/rchipka/node-libxmljs-dom) - DOM wrapper for [libxmljs](https://github.com/libxmljs/libxmljs) C bindings -- [needle](https://github.com/tomas/needle) - Lightweight HTTP wrapper - -## Donate - -Please consider a donation if you depend on web scraping and Osmosis makes your job a bit easier. -Your contribution allows me to spend more time making this the best web scraper for Node. - -[![Donate](https://www.paypalobjects.com/en_US/i/btn/btn_donate_LG.gif)](https://www.paypal.com/cgi-bin/webscr?item_name=node-osmosis&cmd=_donations&business=NAXMWBMWKUWUU) +- libxmljs-dom: XML parsing +- needle: HTTP requests +- nodeunit: Testing framework + +## Contributing +1. Fork the repository +2. Create a feature branch +3. Commit your changes +4. Push to the branch +5. 
Create a Pull Request \ No newline at end of file diff --git a/package.json b/package.json index 5aa2568..6fe49fc 100644 --- a/package.json +++ b/package.json @@ -1,14 +1,14 @@ { - "name": "osmosis", - "version": "1.1.10", + "name": "node-osmosis", + "version": "1.2.0", "description": "Web scraper for NodeJS", "keywords": [ "web", - "scraper", - "crawler", - "html", - "xml", - "dom", + "scraper", + "crawler", + "html", + "xml", + "dom", "parser" ], "repository": { @@ -19,24 +19,39 @@ "name": "rchipka", "email": "chipka01@email.franklin.edu" }, + "main": "index.js", + "engines": { + "node": ">=14.0.0" + }, "dependencies": { "libxmljs-dom": "~0.0.17", - "needle": "^1.6.0" + "needle": "^3.2.0" }, "devDependencies": { - "jscs": ">=3.0.2", - "nodeunit": "0.11.3" + "@types/node": "^18.15.11", + "eslint": "^8.38.0", + "eslint-config-standard": "^17.0.0", + "eslint-plugin-import": "^2.27.5", + "eslint-plugin-n": "^15.7.0", + "eslint-plugin-promise": "^6.1.1", + "nodeunit": "^0.11.3", + "typescript": "^5.0.4" }, "scripts": { - "test": "node ./node_modules/.bin/nodeunit test" + "test": "node --experimental-vm-modules ./node_modules/.bin/nodeunit test", + "lint": "eslint .", + "lint:fix": "eslint . 
--fix", + "prepare": "npm run lint" }, "license": "MIT", - "main": "index", - "engines": { - "node": ">= 0.8.0" - }, - "readmeFilename": "Readme.md", "bugs": { "url": "https://github.com/rchipka/node-osmosis/issues" + }, + "eslintConfig": { + "extends": "standard", + "rules": { + "no-unused-vars": "warn", + "no-console": "off" + } } -} +} \ No newline at end of file diff --git a/test/sample.js b/test/sample.js new file mode 100644 index 0000000..651b45b --- /dev/null +++ b/test/sample.js @@ -0,0 +1,4 @@ +exports.testSample = function (test) { + test.ok(true, 'This test should pass') + test.done() +} diff --git a/tsconfig.json b/tsconfig.json new file mode 100644 index 0000000..06591f2 --- /dev/null +++ b/tsconfig.json @@ -0,0 +1,14 @@ +{ + "compilerOptions": { + "target": "es2020", + "module": "commonjs", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "forceConsistentCasingInFileNames": true, + "outDir": "./dist", + "rootDir": "./lib" + }, + "include": ["lib/**/*"], + "exclude": ["node_modules", "**/*.spec.ts"] +} \ No newline at end of file