diff --git a/jest.config.js b/jest.config.js
new file mode 100644
index 0000000..a9fe8ae
--- /dev/null
+++ b/jest.config.js
@@ -0,0 +1,7 @@
+module.exports = {
+  testMatch: ['**/__tests__/**/*.test.js'],
+  verbose: true,
+  collectCoverage: true,
+  coverageDirectory: 'coverage',
+  coverageReporters: ['text', 'lcov']
+};
\ No newline at end of file
diff --git a/lib/Request.js b/lib/Request.js
index 65f4129..d061376 100644
--- a/lib/Request.js
+++ b/lib/Request.js
@@ -1,180 +1,273 @@
 'use strict';
 
-var needle = require('needle'),
-    URL = require('url'),
-    libxml = require('libxmljs-dom');
+const needle = require('needle');
+const URL = require('url');
+const libxml = require('libxmljs-dom');
 
-/**
- * Make an HTTP request.
- *
- * @private
- */
-
-function Request(method, url, params, opts, tries, callback) {
-    var location = url;
-
-    return needle.request(method,
-                          url.href,
-                          params,
-                          opts,
-                          function (err, res, data) {
-
-        if (!(url.params instanceof Object) || url.params === null) {
-            url.params = url.query;
-        }
-
-        if (err !== null) {
-            callback(err.message);
-            return;
-        }
-
-        if (opts.ignore_http_errors !== true &&
-            res !== undefined &&
-            res.statusCode >= 400 &&
-            res.statusCode <= 500
-            ) {
-            // HTTP error
-            callback(res.statusCode + ' ' + res.statusMessage);
-            return;
-        }
-
-        if (method !== 'head' && (!data || data.length === 0)) {
-            callback('Data is empty');
-            return;
-        }
-
-        function next(document) {
-            if (opts.parse === false) {
-                callback(null, res, document);
-                return;
-            }
-
-            document = libxml.parseHtml(document,
-                        { baseUrl: location.href, huge: true });
-
-            if (document === null) {
-                callback('Couldn\'t parse response');
-                return;
-            }
-
-            if (document.errors[0] !== undefined &&
-                document.errors[0].code === 4) {
-                callback('Document is empty');
-                return;
-            }
-
-            if (document.root() === null) {
-                callback('Document has no root');
-                return;
-            }
-
-            location.headers = res.req._headers;
-            location.proxy = opts.proxy;
-            location.user_agent = opts.user_agent;
-
-            document.location = location;
-            document.request = location;
-
-            setResponseMeta(document, res, data.length);
-            setCookies(document, res.cookies);
-            setCookies(document, opts.cookies);
-
-            if (opts.keep_data === true) {
-                document.response.data = data;
-            }
-
-            callback(null, res, document);
-        }
-
-        if (
-            opts.process_response !== undefined &&
-            typeof opts.process_response === 'function'
-        ) {
-            if (opts.process_response.length > 2) {
-                opts.process_response(data, res, next, callback);
-                return;
-            }
-
-            next(opts.process_response(data, res));
-        } else {
-            next(data);
-        }
-
-    })
-    .on('redirect', function (href) {
-        extend(location, URL.parse(URL.resolve(location.href, href)));
-    });
-}
+/**
+ * Base error for every failure reported by Request.
+ *
+ * `type` is a machine-readable category ('NETWORK_ERROR', 'HTTP_ERROR' or
+ * 'PARSE_ERROR'); `details` carries error-specific context.
+ */
+class NetworkRequestError extends Error {
+    constructor(message, type, details = {}) {
+        super(message);
+        this.name = 'NetworkRequestError';
+        this.type = type;
+        this.details = details;
+    }
+}
+
+/**
+ * Reported when the server answers with an HTTP error status.
+ */
+class HttpResponseError extends NetworkRequestError {
+    constructor(statusCode, statusMessage, details = {}) {
+        super(`HTTP Request Failed: ${statusCode} ${statusMessage}`, 'HTTP_ERROR', {
+            // Spread first so caller details cannot mask the real status.
+            ...details,
+            statusCode,
+            statusMessage
+        });
+        this.name = 'HttpResponseError';
+    }
+}
+
+/**
+ * Reported when the response body is empty or cannot be parsed.
+ */
+class ParseError extends NetworkRequestError {
+    constructor(message, details = {}) {
+        super(message, 'PARSE_ERROR', details);
+        this.name = 'ParseError';
+    }
+}
+
+/**
+ * Default retry strategy: retry transient network failures only.
+ *
+ * @private
+ */
+function isRetryableError(err) {
+    return Boolean(err) && (
+        err.code === 'ECONNRESET' ||
+        err.code === 'ETIMEDOUT' ||
+        err.code === 'ENOTFOUND'
+    );
+}
+
+/**
+ * Make an HTTP request, retrying transient network failures.
+ *
+ * @private
+ * @param {string} method - HTTP method
+ * @param {Object} url - Parsed URL; `url.href` is requested
+ * @param {Object} params - Request parameters
+ * @param {Object} opts - Request options; `opts.retryOptions` may override
+ *     `maxRetries`, `retryDelay` (ms) and `retryStrategy(err)`
+ * @param {number} [tries=3] - Default for `retryOptions.maxRetries`
+ * @param {Function} callback - `callback(err)` or `callback(null, res, document)`
+ * @returns {Object} Needle request object of the first attempt
+ */
+function Request(method, url, params, opts, tries = 3, callback) {
+    const retryOptions = {
+        maxRetries: tries,
+        retryDelay: 1000,
+        retryStrategy: isRetryableError,
+        ...opts.retryOptions
+    };
+    const location = url;
+    let currentTry = 0;
+
+    function makeRequest() {
+        currentTry++;
+
+        return needle.request(method, url.href, params, opts, function (err, res, data) {
+            if (!(url.params instanceof Object) || url.params === null) {
+                url.params = url.query;
+            }
+
+            if (err !== null) {
+                if (currentTry <= retryOptions.maxRetries &&
+                    retryOptions.retryStrategy(err)) {
+                    setTimeout(makeRequest, retryOptions.retryDelay);
+                    return;
+                }
+
+                callback(new NetworkRequestError(err.message, 'NETWORK_ERROR', {
+                    code: err.code,
+                    originalError: err,
+                    url: url.href,
+                    attempt: currentTry
+                }));
+                return;
+            }
+
+            if (opts.ignore_http_errors !== true &&
+                res !== undefined &&
+                res.statusCode >= 400 &&
+                res.statusCode <= 500
+            ) {
+                callback(new HttpResponseError(
+                    res.statusCode,
+                    res.statusMessage,
+                    { url: url.href }
+                ));
+                return;
+            }
+
+            if (method !== 'head' && (!data || data.length === 0)) {
+                callback(new ParseError('Data is empty', { url: url.href }));
+                return;
+            }
+
+            function next(document) {
+                if (opts.parse === false) {
+                    callback(null, res, document);
+                    return;
+                }
+
+                try {
+                    document = libxml.parseHtml(document, {
+                        baseUrl: location.href,
+                        huge: true
+                    });
+
+                    if (document === null) {
+                        throw new ParseError('Couldn\'t parse response', { url: url.href });
+                    }
+
+                    if (document.errors[0] !== undefined && document.errors[0].code === 4) {
+                        throw new ParseError('Document is empty', { url: url.href });
+                    }
+
+                    if (document.root() === null) {
+                        throw new ParseError('Document has no root', { url: url.href });
+                    }
+
+                    // `_headers` is deprecated in newer Node versions;
+                    // prefer the public accessor when it exists.
+                    location.headers = typeof res.req.getHeaders === 'function' ?
+                        res.req.getHeaders() : res.req._headers;
+                    location.proxy = opts.proxy;
+                    location.user_agent = opts.user_agent;
+
+                    document.location = location;
+                    document.request = location;
+
+                    setResponseMeta(document, res, data.length);
+                    setCookies(document, res.cookies);
+                    setCookies(document, opts.cookies);
+
+                    if (opts.keep_data === true) {
+                        document.response.data = data;
+                    }
+                } catch (parseError) {
+                    callback(parseError);
+                    return;
+                }
+
+                // Deliberately outside the try block: an exception thrown by
+                // the callback must not trigger a second callback call.
+                callback(null, res, document);
+            }
+
+            if (typeof opts.process_response === 'function') {
+                if (opts.process_response.length > 2) {
+                    opts.process_response(data, res, next, callback);
+                    return;
+                }
+
+                next(opts.process_response(data, res));
+            } else {
+                next(data);
+            }
+        })
+        .on('redirect', function (href) {
+            extend(location, URL.parse(URL.resolve(location.href, href)));
+        });
+    }
+
+    return makeRequest();
+}
 
 function setResponseMeta(document, res, size) {
     var response = {
         type: getResponseType(res.headers['content-type']),
         statusCode: res.statusCode,
         statusMessage: res.statusMessage,
         headers: res.headers,
         size: {
             body: size
         }
     };
 
     if (res.socket !== undefined) {
         response.size.total = res.socket.bytesRead;
         response.size.headers = res.socket.bytesRead - size;
     }
 
     document.response = response;
 }
 
 function getResponseType(contentType) {
     if (contentType === undefined) {
         return null;
     }
 
     if (contentType.indexOf('xml') !== -1) {
         return 'xml';
     }
 
     if (contentType.indexOf('html') !== -1) {
         return 'html';
     }
 
     return contentType;
 }
 
 
 function setCookies(document, cookies) {
     var key, keys, length;
 
     if (cookies === undefined) {
         return;
     }
 
     keys = Object.keys(cookies);
     length = keys.length;
 
     if (length === 0) {
         return;
     }
 
     if (document.cookies === undefined) {
         document.cookies = {};
     }
 
     while (length--) {
         key = keys[length];
         document.cookies[key] = cookies[key];
     }
 }
 
 function extend(object, donor) {
     var key, keys = Object.keys(donor), i = keys.length;
 
     while (i--) {
         key = keys[i];
         object[key] = donor[key];
     }
 
     return object;
 }
 
+// Keep the function itself as the module export so existing
+// `require('./Request')(...)` callers keep working; the named
+// properties additionally allow destructured imports.
 module.exports = Request;
+module.exports.Request = Request;
+module.exports.NetworkRequestError = NetworkRequestError;
+module.exports.HttpResponseError = HttpResponseError;
+module.exports.ParseError = ParseError;
diff --git a/package.json b/package.json
index 5aa2568..d2a2f5e 100644
--- a/package.json
+++ b/package.json
@@ -24,16 +24,19 @@
     "needle": "^1.6.0"
   },
   "devDependencies": {
+    "jest": "^29.7.0",
     "jscs": ">=3.0.2",
     "nodeunit": "0.11.3"
   },
   "scripts": {
-    "test": "node ./node_modules/.bin/nodeunit test"
+    "test": "jest",
+    "test:coverage": "jest --coverage",
+    "test:legacy": "node ./node_modules/.bin/nodeunit test"
   },
   "license": "MIT",
   "main": "index",
   "engines": {
-    "node": ">= 0.8.0"
+    "node": ">= 8.3.0"
   },
   "readmeFilename": "Readme.md",
   "bugs": {
diff --git a/test/__tests__/request-error-handling.test.js b/test/__tests__/request-error-handling.test.js
new file mode 100644
index 0000000..7695c3c
--- /dev/null
+++ b/test/__tests__/request-error-handling.test.js
@@ -0,0 +1,54 @@
+const {
+  Request,
+  NetworkRequestError,
+  HttpResponseError,
+  ParseError
+} = require('../../lib/Request');
+
+describe('Network Request Error Handling', () => {
+  it('should export custom error classes', () => {
+    expect(NetworkRequestError).toBeDefined();
+    expect(HttpResponseError).toBeDefined();
+    expect(ParseError).toBeDefined();
+  });
+
+  it('should keep the module itself callable for existing callers', () => {
+    const legacyExport = require('../../lib/Request');
+
+    expect(typeof legacyExport).toBe('function');
+    expect(legacyExport).toBe(Request);
+  });
+
+  it('HttpResponseError should contain correct properties', () => {
+    const error = new HttpResponseError(404, 'Not Found', { url: 'https://example.com' });
+
+    expect(error.name).toBe('HttpResponseError');
+    expect(error.type).toBe('HTTP_ERROR');
+    expect(error.message).toBe('HTTP Request Failed: 404 Not Found');
+    expect(error.details.statusCode).toBe(404);
+    expect(error.details.statusMessage).toBe('Not Found');
+    expect(error.details.url).toBe('https://example.com');
+  });
+
+  it('NetworkRequestError should contain correct properties', () => {
+    const error = new NetworkRequestError(
+      'Connection failed',
+      'NETWORK_ERROR',
+      { code: 'ECONNREFUSED' }
+    );
+
+    expect(error.name).toBe('NetworkRequestError');
+    expect(error.message).toBe('Connection failed');
+    expect(error.type).toBe('NETWORK_ERROR');
+    expect(error.details.code).toBe('ECONNREFUSED');
+  });
+
+  it('ParseError should contain correct properties', () => {
+    const error = new ParseError('Document parsing failed', { url: 'https://example.com' });
+
+    expect(error.name).toBe('ParseError');
+    expect(error.type).toBe('PARSE_ERROR');
+    expect(error.message).toBe('Document parsing failed');
+    expect(error.details.url).toBe('https://example.com');
+  });
+});