Note: hasFs is a key-existence test on the fs module (CoffeeScript `of`, compiled to JavaScript `in`); CoffeeScript's `in` is array membership and is always false for the fs module object, which would disable the filesystem branch even under Node.
diff --git a/lib/stopwords.js b/lib/stopwords.js index 2a1888d..811be2a 100644 --- a/lib/stopwords.js +++ b/lib/stopwords.js @@ -1,24 +1,82 @@ // Generated by CoffeeScript 2.0.0-beta7 void function () { - var _, cache, candiateWords, fs, getFilePath, path, removePunctuation, stopwords; + var _, cache, candiateWords, fs, getFilePath, getStopwords, path, removePunctuation, stopwords; path = require('path'); fs = require('fs'); _ = require('lodash'); cache = {}; + getStopwords = function (lang) { + switch (lang) { + case 'ar': + return require('../data/stopwords/stopwords-ar.txt'); + case 'bg': + return require('../data/stopwords/stopwords-bg.txt'); + case 'cs': + return require('../data/stopwords/stopwords-cs.txt'); + case 'da': + return require('../data/stopwords/stopwords-da.txt'); + case 'de': + return require('../data/stopwords/stopwords-de.txt'); + case 'en': + return require('../data/stopwords/stopwords-en.txt'); + case 'es': + return require('../data/stopwords/stopwords-es.txt'); + case 'fi': + return require('../data/stopwords/stopwords-fi.txt'); + case 'fr': + return require('../data/stopwords/stopwords-fr.txt'); + case 'hu': + return require('../data/stopwords/stopwords-hu.txt'); + case 'id': + return require('../data/stopwords/stopwords-id.txt'); + case 'it': + return require('../data/stopwords/stopwords-it.txt'); + case 'ko': + return require('../data/stopwords/stopwords-ko.txt'); + case 'nb': + return require('../data/stopwords/stopwords-nb.txt'); + case 'nl': + return require('../data/stopwords/stopwords-nl.txt'); + case 'no': + return require('../data/stopwords/stopwords-no.txt'); + case 'pl': + return require('../data/stopwords/stopwords-pl.txt'); + case 'pt': + return require('../data/stopwords/stopwords-pt.txt'); + case 'ru': + return require('../data/stopwords/stopwords-ru.txt'); + case 'sv': + return require('../data/stopwords/stopwords-sv.txt'); + case 'th': + return require('../data/stopwords/stopwords-th.txt'); + case 'tr': + return
require('../data/stopwords/stopwords-tr.txt'); + case 'zh': + return require('../data/stopwords/stopwords-zh.txt'); + default: + return require('../data/stopwords/stopwords-en.txt'); + } + }; getFilePath = function (language) { return path.join(__dirname, '..', 'data', 'stopwords', 'stopwords-' + language + '.txt'); }; module.exports = stopwords = function (content, language) { - var count, filePath, overlappingStopwords, stopWords, strippedInput, words; + var count, filePath, hasFs, overlappingStopwords, stopWords, strippedInput, words; if (null == language) language = 'en'; - filePath = getFilePath(language); - if (!fs.existsSync(filePath)) { - console.error("WARNING: No stopwords file found for '" + language + "' - defaulting to English!"); - filePath = getFilePath('en'); + hasFs = 'existsSync' in fs; + if (hasFs) { + filePath = getFilePath(language); + if (!fs.existsSync(filePath)) { + console.error("WARNING: No stopwords file found for '" + language + "' - defaulting to English!"); + filePath = getFilePath('en'); + } } if (cache.hasOwnProperty(language)) { stopWords = cache[language]; + } else if (!hasFs) { + stopWords = getStopwords(language); + cache[language] = stopWords; } else { stopWords = fs.readFileSync(filePath).toString().split('\n').filter(function (s) { return s.length > 0; diff --git a/src/stopwords.coffee b/src/stopwords.coffee index ecebd0c..4c9ac0c 100644 --- a/src/stopwords.coffee +++ b/src/stopwords.coffee @@ -4,20 +4,53 @@ _ = require('lodash') cache = {} +getStopwords = (lang) -> + switch (lang) + when 'ar' then require('../data/stopwords/stopwords-ar.txt') + when 'bg' then require('../data/stopwords/stopwords-bg.txt') + when 'cs' then
require('../data/stopwords/stopwords-cs.txt') + when 'da' then require('../data/stopwords/stopwords-da.txt') + when 'de' then require('../data/stopwords/stopwords-de.txt') + when 'en' then require('../data/stopwords/stopwords-en.txt') + when 'es' then require('../data/stopwords/stopwords-es.txt') + when 'fi' then require('../data/stopwords/stopwords-fi.txt') + when 'fr' then require('../data/stopwords/stopwords-fr.txt') + when 'hu' then require('../data/stopwords/stopwords-hu.txt') + when 'id' then require('../data/stopwords/stopwords-id.txt') + when 'it' then require('../data/stopwords/stopwords-it.txt') + when 'ko' then require('../data/stopwords/stopwords-ko.txt') + when 'nb' then require('../data/stopwords/stopwords-nb.txt') + when 'nl' then require('../data/stopwords/stopwords-nl.txt') + when 'no' then require('../data/stopwords/stopwords-no.txt') + when 'pl' then require('../data/stopwords/stopwords-pl.txt') + when 'pt' then require('../data/stopwords/stopwords-pt.txt') + when 'ru' then require('../data/stopwords/stopwords-ru.txt') + when 'sv' then require('../data/stopwords/stopwords-sv.txt') + when 'th' then require('../data/stopwords/stopwords-th.txt') + when 'tr' then require('../data/stopwords/stopwords-tr.txt') + when 'zh' then require('../data/stopwords/stopwords-zh.txt') + else require('../data/stopwords/stopwords-en.txt') + getFilePath = (language) -> path.join(__dirname, "..", "data", "stopwords", "stopwords-#{language}.txt") # Given a language, loads a list of stop words for that language # and then returns which of those words exist in the given content module.exports = stopwords = (content, language = 'en') -> - filePath = getFilePath(language) + hasFs = 'existsSync' of fs + + if hasFs + filePath = getFilePath(language) - if !fs.existsSync(filePath) - console.error("WARNING: No stopwords file found for '#{language}' - defaulting to English!") - filePath = getFilePath('en') + if !fs.existsSync(filePath) + console.error("WARNING: No stopwords file
found for '#{language}' - defaulting to English!") + filePath = getFilePath('en') if cache.hasOwnProperty(language) stopWords = cache[language] + else if !hasFs + stopWords = getStopwords(language) + cache[language] = stopWords else stopWords = fs.readFileSync(filePath).toString().split('\n') .filter((s) -> s.length > 0)