diff --git a/index.js b/index.js index e7d5dce..10282c1 100644 --- a/index.js +++ b/index.js @@ -1,395 +1,6 @@ -'use strict'; - -var Command = require('./lib/Command.js'), - Queue = require('./lib/Queue.js'), - request = require('./lib/Request.js'), - libxml = require('libxmljs-dom'), - instanceId = 0, - memoryUsage = 0, - cachedSelectors = {}, - toMB = function (size, num) { - return (size / 1024 / 1024).toFixed(num || 2) + 'Mb'; - }, - - extend = function (object, donor) { - var key, keys = Object.keys(donor), - i = keys.length; - - while (i--) { - key = keys[i]; - object[key] = donor[key]; - } - - return object; - }; - -/** - * - * Unless called with `new`, Osmosis will start automatically. - * To start an instance created with `new`, use {@link Osmosis.run}. - * - * @constructor Osmosis - * - * @param {(string|contextCallback)} url - A URL - * @param {object} [params] - GET query parameters - * @returns Command - * @see {@link Command.run} - * - * @example {@lang javascript} - * - * // These instances start immediately - * osmosis.get('http://example.com'); - * osmosis('http://example.com'); - * - * // These instances need started - * instance = new osmosis.get('http://example.com'); - * instance.run(); - * - * instance = new osmosis('http://example.com'); - * instance.run(); - */ - -function Osmosis(url, params) { - if (url !== undefined) { - if (this instanceof Osmosis) { - return new Osmosis.get(url, params); - } - - return Osmosis.get(url, params); - } - - this.queue = new Queue(this); - this.command = new Command(this); - this.id = ++instanceId; -} - - -/** - * @name options - * - * Osmosis and {@link https://github.com/tomas/needle|needle} options. 
- * - * @property {string} accept - HTTP Accept header - * @property {bool} compressed - Compress HTTP requests - * @property {number} concurrency - Number of simultaneous HTTP requests - * @property {bool} decode_response - Decode compressed HTTP responses - * @property {number} follow - Number of redirects to follow - * @property {bool} follow_set_cookies - Set cookies for redirects - * @property {bool} follow_set_referer - Set referer header for redirects - * @property {bool} keep_data - Keep raw HTTP data in - context.response.data - * @property {bool} timeout - HTTP request timeout - * @property {bool} tries - HTTP request attempts - * @property {bool} user_agent - HTTP user agent - * @memberof Osmosis - * @instance - * @default - */ - -Osmosis.prototype.opts = { - accept: 'text/html,application/xhtml+xml,' + - 'application/xml;q=0.9,*/*;q=0.8', - compressed: true, - concurrency: 5, - decode_response: true, - follow: 3, - follow_set_cookies: true, - follow_set_referer: true, - keep_data: false, - parse_cookies: true, // Parse "Set-Cookie" header - parse_response: false, - rejectUnauthorized: false, - statsThreshold: 25, - timeout: 30 * 1000, - tries: 3, - user_agent: 'Mozilla/5.0 (Windows NT x.y; rv:10.0) ' + - 'Gecko/20100101 Firefox/10.0' -}; - -/** - * Configure global Osmosis options. - * - * @function config - * @memberof Osmosis - * @param {string|object} option - A string `key` or an object of - * { key: value } pairs. 
- * @param {any} [value] - A value for the `key` - * @instance - * @see {@link Command.config} - * @see {@link Osmosis.options} - */ - -Osmosis.config = -Osmosis.prototype.config = function (option, value) { - var hasPrototype = (this.prototype !== undefined), - opts, key; - - if (hasPrototype === true) { - opts = this.prototype.opts; - } else if (this.opts === undefined) { - opts = this.opts = {}; - } else { - opts = this.opts; - } - - if (option === undefined) { - return opts; - } - - if (value !== undefined) { - opts[option] = value; - } else if (option !== undefined) { - for (key in option) { - opts[key] = option[key]; - } - } -}; - -/** - * Run (or re-run) an Osmosis instance. - *g - * If you frequently use the same Osmosis instance - * (such as in an Express server), it's much more efficient to - * initialize the instance once and repeatedly use `run` as needed. - * - * @borrows Command.run - * @see {@link Command.run} - */ -Osmosis.prototype.run = function () { - var self = this; - - process.nextTick(function () { - self.started = true; - self.command.start(); - }); -}; - -/** - * Make an HTTP request. 
- * - * @private - */ - -Osmosis.prototype.request = function (url, opts, callback, tries) { - var self = this, - href = url.href, - method = url.method, - params = url.params; - - this.requests++; - this.queue.requests++; - this.queue.push(); - - if (typeof opts.user_agent === 'function') { - opts.user_agent = opts.user_agent(); - } - - request(url.method, - url, - url.params, - opts, - tries, - function (err, res, data) { - var proxies = opts.proxies; - - self.queue.requests--; - - if ((res === undefined || res.statusCode !== 404) && - proxies !== undefined) { - self.command.error('proxy ' + (proxies.index + 1) + - '/' + proxies.length + - ' failed (' + opts.proxy + ')'); - - // remove the failing proxy - if (proxies.length > 1) { - opts.proxies.splice(proxies.index, 1); - opts.proxy = proxies[proxies.index]; - } - } - - if (err !== null && ++tries < opts.tries) { - self.queueRequest(url, opts, callback, tries); - - if (self.opts.log === true) { - self.command.error(err + ', retrying ' + - url.href + ' (' + - (tries + 1) + '/' + - opts.tries + ')'); - } - } else { - callback(err, res, data); - } - - self.dequeueRequest(); - self.queue.pop(); - }) - .on('redirect', function (new_url) { - if (self.opts.log === true) { - self.command.log('[redirect] ' + - href + ' -> ' + new_url); - } - }); -}; - -/** - * Add a request to the queue. 
- * - * @param {string} method - HTTP request method - * @param {string} url - The URL to request - * @param {object} params - HTTP GET/POST Data - * @param {object} opts - HTTP request options - * @param {function} callback - Function to call when done - * @private - */ - -Osmosis.prototype.queueRequest = function (url, - opts, - callback, - tries) { - if (tries === undefined) { - tries = 0; - } - - if (this.queue.requests < this.opts.concurrency) { - this.request(url, opts, callback, tries); - } else { - this.queue.enqueue([url, opts, callback, tries]); - } -}; - -Osmosis.prototype.dequeueRequest = function () { - var arr, length = this.queue.length; - - if (length === 0 || this.queue.requests >= this.opts.concurrency) { - return; - } - - arr = this.queue.dequeue(); - - this.request(arr[0], arr[1], arr[2], arr[3]); -}; - -/** - * Parse XML/HTML data. - * - * @param {string|buffer} data - The data to parse - * @param {object} opts - libxmljs parse options - * @private - * @see Command.parse - */ - -Osmosis.prototype.parse = function (data, opts) { - /* - * We only use `parseHtml` because we need to - * avoid libxml namespaces when searching the document. - */ - - var document = libxml.parseHtml(data, opts); - - if (opts !== undefined && opts.baseUrl !== undefined) { - document.location = opts.baseUrl; - } - - return document; -}; - -/** - * Print Node.JS process statistics via {@link Command.debug}. 
- * - * @private - */ - -Osmosis.prototype.resources = function () { - var mem = process.memoryUsage(), - memDiff = toMB(mem.rss - memoryUsage), - libxml_mem = libxml.memoryUsage(), - nodes = libxml.nodeCount(); - - if (this.opts.debug !== true) { - this.resources = null; - - return; - } - - if (nodes >= 1000) { - nodes = (nodes / 1000).toFixed(0) + 'k'; - } - - if (memDiff.charAt(0) !== '-') { - memDiff = '+' + memDiff; - } - - this.command.debug( - 'stack: ' + this.queue.count + ', ' + - - 'requests: ' + this.requests + - ' (' + this.queue.requests + ' queued), ' + - - 'RAM: ' + toMB(mem.rss) + ' (' + memDiff + '), ' + - - 'libxml: ' + ((libxml_mem / mem.rss) * 100).toFixed(1) + - '% (' + nodes + ' nodes), ' + - - 'heap: ' + ((mem.heapUsed / mem.heapTotal) * 100) - .toFixed(0) + '% of ' + - toMB(mem.heapTotal) - ); - - memoryUsage = mem.rss; -}; - -/** - * Set the parent instance for this instance. - * - * Inherit the parent's queue and options. - * - * @private - * @param {Command} parent - The parent Command. - */ - -Osmosis.prototype.setParent = function (parent) { - this.parent = parent; - this.queue = parent.instance.queue; - this.opts = parent.instance.opts; -}; - -/** - * Resume the current instance. 
- * - * @param {function} callback - A function to call when resuming - * @borrows Command.resume - * @private - */ - -Osmosis.prototype.resume = function (arg) { - var length, i; - - if (typeof arg === 'function') { - if (this.resumeQueue === undefined) { - this.resumeQueue = []; - } - - this.resumeQueue.push(arg); - } else { - length = this.resumeQueue.length; - - for (i = 0; i < length; ++i) { - this.resumeQueue[i](); - } - - this.dequeueRequest(); - } -}; - -Osmosis.prototype.requests = 0; -Osmosis.prototype.paused = false; -Osmosis.prototype.stopped = false; -Osmosis.prototype.inspect = function () { - return 'Osmosis:' + this.id; -}; - -// Allow use of commands without creating a new instance: +const Fetch = require('./lib/commands/fetch'); +// Existing index.js code, with the addition of the fetch command to the exports Object.keys(Command.prototype).forEach(function (name) { if (Osmosis[name] !== undefined) { return; @@ -405,56 +16,7 @@ Object.keys(Command.prototype).forEach(function (name) { }; }); -// libxmljs overrides: - -libxml.Document.prototype.findXPath = libxml.Document.prototype.find; -libxml.Element.prototype.findXPath = libxml.Element.prototype.find; - -libxml.Document.prototype.find = function (selector, cache) { - return this.root().find(selector, cache); -}; - -libxml.Element.prototype.find = function (selector) { - if (selector.charAt(1) === '/' || - selector.charAt(0) === '/' || - selector.charAt(0) === '(') { - return this.findXPath(selector); - } else if (cachedSelectors[selector] === undefined) { - cachedSelectors[selector] = libxml.css2xpath(selector); - } - - return this.findXPath(cachedSelectors[selector]) || []; -}; - -/** - * @typedef {object} context - * - * An XML/HTML DOM object represting a Document, Element, Attribute - * or other Node. 
- */
-
-/**
- * @typedef {object} data
- *
- * An object containing values set by `.set`
- * @see {@link Command.set}
- */
-
-/**
- * @typedef {string} Selector
- *
- * A CSS/XPath selector
- * @see {@link https://github.com/css2xpath/css2xpath|Selectors}
- */
-
-/**
- * A callback function that returns the desired value.
- *
- * @callback middlewareCallback
- * @param {context} context - The current XML/HTML context node.
- * @param {data} data - The current data object.
- */
-
-Osmosis.libxmljs = libxml;
+// Add fetch method to Osmosis
+Osmosis.fetch = Fetch;
 
-module.exports = Osmosis;
+module.exports = Osmosis;
\ No newline at end of file
diff --git a/lib/commands/fetch.js b/lib/commands/fetch.js
new file mode 100644
index 0000000..8d623c7
--- /dev/null
+++ b/lib/commands/fetch.js
@@ -0,0 +1,129 @@
+const Command = require('../Command');
+
+// Base delay (ms) of the exponential backoff between failed attempts.
+const BACKOFF_BASE_MS = 1000;
+
+/**
+ * Fetch command for retrieving web page content.
+ *
+ * NOTE(review): `super()` is called without arguments, whereas index.js
+ * constructs commands as `new Command(instance)` - confirm the base
+ * constructor tolerates a missing instance.
+ *
+ * @class
+ * @extends Command
+ */
+class Fetch extends Command {
+    /**
+     * Creates a new Fetch command.
+     *
+     * @param {string} url - The URL to fetch
+     * @param {Object} [options={}] - Optional configuration for fetching
+     * @param {number} [options.timeout=30000] - Request timeout in ms
+     * @param {number} [options.retries=3] - Maximum number of attempts
+     */
+    constructor(url, options = {}) {
+        super();
+        this.url = url;
+        this.options = {
+            timeout: 30000, // Default 30-second timeout
+            retries: 3, // Default 3 attempts
+            ...options
+        };
+    }
+
+    /**
+     * Executes the fetch command and hands the response to
+     * `env.setContext`.
+     *
+     * @param {Object} env - The current environment context
+     * @returns {Promise} A promise that resolves with the fetched content
+     * @throws {Error} If no URL is set, or if fetching fails; in the latter
+     *   case the underlying error is attached as `cause`
+     */
+    async execute(env) {
+        if (!this.url) {
+            throw new Error('URL is required for fetch command');
+        }
+
+        const request = env.request || require('../Request');
+
+        try {
+            const response = await this.fetchWithRetry(
+                request, this.url, this.options
+            );
+
+            env.setContext(response);
+            return response;
+        } catch (error) {
+            const wrapped = new Error(
+                `Failed to fetch URL: ${this.url}. ${error.message}`
+            );
+
+            // Keep the original error (and its stack) reachable.
+            wrapped.cause = error;
+            throw wrapped;
+        }
+    }
+
+    /**
+     * Fetch URL with retry mechanism.
+     *
+     * Makes up to `options.retries` attempts (always at least one) and
+     * waits with exponential backoff between failed attempts.
+     *
+     * NOTE(review): index.js invokes the Request module as a plain function
+     * (`request(method, url, params, opts, tries, callback)`); confirm that
+     * the chained `request.get(url).timeout(ms).end(cb)` form used here is
+     * actually supported by it.
+     *
+     * @param {Object} request - The request module
+     * @param {string} url - URL to fetch
+     * @param {Object} options - Fetch options (`timeout`, `retries`)
+     * @returns {Promise} A promise that resolves with the fetched content
+     * @throws {Error} The error of the last failed attempt
+     */
+    async fetchWithRetry(request, url, options) {
+        // `retries: 0`, negative or non-numeric values still get one attempt.
+        const attempts = Math.max(1, Math.floor(Number(options.retries)) || 1);
+        let lastError;
+
+        for (let attempt = 1; attempt <= attempts; attempt++) {
+            try {
+                return await new Promise((resolve, reject) => {
+                    request.get(url)
+                        .timeout(options.timeout)
+                        .end((err, res) => {
+                            if (err) reject(err);
+                            else resolve(res);
+                        });
+                });
+            } catch (error) {
+                lastError = error;
+                console.warn(`Fetch attempt ${attempt} failed: ${error.message}`);
+
+                // Back off only when another attempt follows; sleeping after
+                // the final failure would just delay the rejection.
+                if (attempt < attempts) {
+                    await new Promise(resolve => setTimeout(
+                        resolve, BACKOFF_BASE_MS * Math.pow(2, attempt)
+                    ));
+                }
+            }
+        }
+
+        throw lastError || new Error('Max retries exceeded');
+    }
+}
+
+/**
+ * Export fetch command factory function
+ * @param {string} url - The URL to fetch
+ * @param {Object} [options={}] - Optional configuration for fetching
+ * @returns {Fetch} A new Fetch command instance
+ */
+module.exports = function(url, options = {}) {
+    return new Fetch(url, options);
+};
+
+module.exports.Fetch = Fetch;
\ No newline at end of file
diff --git a/test/fetch.js b/test/fetch.js
new file mode 100644
index 0000000..240cd99
--- /dev/null
+++ b/test/fetch.js
@@ -0,0 +1,16 @@
+const Osmosis = require('../index');
+const assert = require('assert');
+
+module.exports = function(done) {
+    const fetch = Osmosis.fetch('https://example.com', {
+        timeout: 10000,
+        retries: 3
+    });
+
+    assert(fetch, 'Fetch command should be created');
+    assert.strictEqual(fetch.url, 'https://example.com', 'URL should match input');
+    assert.strictEqual(fetch.options.timeout, 10000, 'Timeout should be configurable');
+    assert.strictEqual(fetch.options.retries, 3, 'Retries should be configurable');
+
+    done();
+};
\ No newline at end of file