Skip to content

Commit

Permalink
Merge pull request #446 from pelias/joxit/feat/multi-lang-index
Browse files Browse the repository at this point in the history
Add index for all wof languages (when different from the default language)
  • Loading branch information
orangejulius authored May 20, 2019
2 parents b831c23 + 505997a commit 9a3fc2c
Show file tree
Hide file tree
Showing 6 changed files with 384 additions and 5 deletions.
24 changes: 23 additions & 1 deletion src/components/extractFields.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
const through2 = require('through2');
const _ = require('lodash');
const util = require('util');
const iso639 = require('../helpers/iso639');

// hierarchy in importance-descending order of population fields
const population_hierarchy = [
Expand All @@ -26,6 +27,8 @@ const NAME_ALIAS_FIELDS = [
'label:%s_x_preferred'
];

// Matches localized name properties of the form
//   name:<3 lowercase letters>_x_preferred   /   label:<3 lowercase letters>_x_variant
// The pattern is anchored on both ends so that only keys that are *exactly* of
// this shape are accepted: a key that merely contains such a fragment (extra
// namespace prefix, or an extra suffix after preferred/variant) is rejected.
// Capture group 1 is the prefix (name|label), group 2 the kind (preferred|variant).
const WOF_NAMES_REGEX = /^(name|label):[a-z]{3}_x_(preferred|variant)$/;

// this function is used to verify that a US county QS altname is available
function isUsCounty(base_record, wof_country, qs_a2_alt) {
return 'US' === wof_country &&
Expand Down Expand Up @@ -131,6 +134,23 @@ function getNameAliases(properties) {
return concatArrayFields(properties, nameFields);
}

/**
 * Collects the localized names of a record, grouped by two-letter language code.
 *
 * Every property whose key matches WOF_NAMES_REGEX contributes its names to the
 * bucket obtained by translating the key's three-letter code through the
 * iso639 table. Keys whose code has no entry in that table are ignored, names
 * rejected by the default-name filter are left out, and a name is stored only
 * once per language. Languages that end up with no names are not present in
 * the result.
 *
 * @param {string|string[]} defaultName - default name(s) of the record; when
 *   falsy, no name is filtered out. The filter uses `indexOf`, so a string
 *   default is matched as a substring and an array default element by element.
 * @param {Object} properties - properties of the record.
 * @returns {Object} map of two-letter language code -> array of unique names.
 */
function getMultiLangNames(defaultName, properties) {
  const namesByLang = {};

  Object.keys(properties).forEach(key => {
    const match = WOF_NAMES_REGEX.exec(key);
    if (!match) {
      return;
    }

    // The three-letter code sits right after the "name:" / "label:" prefix
    // that the regex matched; locate it from the match itself rather than from
    // the first ':' of the key, so both always agree.
    const codeStart = match.index + match[1].length + 1;
    const lang = iso639[key.substring(codeStart, codeStart + 3)];
    if (!lang) {
      return;
    }

    // Name properties are expected to be arrays, but a bare string (or a
    // null/other value) must not abort the whole import with a TypeError.
    const rawNames = properties[key];
    let candidates = [];
    if (Array.isArray(rawNames)) {
      candidates = rawNames;
    } else if (typeof rawNames === 'string' && rawNames.length > 0) {
      candidates = [rawNames];
    }

    // remove the names already found in the default name
    const names = candidates.filter(name => !defaultName || defaultName.indexOf(name) < 0);
    if (names.length === 0) {
      return;
    }

    // merge with names from other keys of the same language, keeping first occurrences only
    namesByLang[lang] = Array.from(new Set((namesByLang[lang] || []).concat(names)));
  });

  return namesByLang;
}

function getAbbreviation(properties) {
if (properties['wof:placetype'] === 'country' && properties['wof:country']) {
return properties['wof:country'];
Expand Down Expand Up @@ -167,10 +187,12 @@ function getHierarchies(id, properties) {
*/
module.exports.create = function map_fields_stream() {
return through2.obj(function(json_object, enc, callback) {
const default_names = getName(json_object.properties);
var record = {
id: json_object.id,
name: getName(json_object.properties),
name: default_names,
name_aliases: getNameAliases(json_object.properties),
name_langs: getMultiLangNames(default_names, json_object.properties),
abbreviation: getAbbreviation(json_object.properties),
place_type: json_object.properties['wof:placetype'],
lat: getLat(json_object.properties),
Expand Down
207 changes: 207 additions & 0 deletions src/helpers/iso639.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
// Lookup table from three-letter language codes (keys) to the corresponding
// two-letter ISO 639-1 codes (values).
// Based on https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
//
// Some languages are listed under two different three-letter keys that map to
// the same two-letter value (e.g. 'fra' and 'fre' both give 'fr'), so either
// spelling resolves to the same language. A code that is not a key of this
// object yields `undefined` on lookup.
module.exports = {
'abk': 'ab',
'aar': 'aa',
'afr': 'af',
'aka': 'ak',
'alb': 'sq',
'sqi': 'sq',
'amh': 'am',
'ara': 'ar',
'arg': 'an',
'hye': 'hy',
'arm': 'hy',
'asm': 'as',
'ava': 'av',
'ave': 'ae',
'aym': 'ay',
'aze': 'az',
'bam': 'bm',
'bak': 'ba',
'eus': 'eu',
'baq': 'eu',
'bel': 'be',
'ben': 'bn',
'bih': 'bh',
'bis': 'bi',
'bos': 'bs',
'bre': 'br',
'bul': 'bg',
'mya': 'my',
'bur': 'my',
'cat': 'ca',
'cha': 'ch',
'che': 'ce',
'nya': 'ny',
'chi': 'zh',
'zho': 'zh',
'chv': 'cv',
'cor': 'kw',
'cos': 'co',
'cre': 'cr',
'hrv': 'hr',
'ces': 'cs',
'cze': 'cs',
'dan': 'da',
'div': 'dv',
'nld': 'nl',
'dut': 'nl',
'dzo': 'dz',
'eng': 'en',
'epo': 'eo',
'est': 'et',
'ewe': 'ee',
'fao': 'fo',
'fij': 'fj',
'fin': 'fi',
'fra': 'fr',
'fre': 'fr',
'ful': 'ff',
'glg': 'gl',
'kat': 'ka',
'geo': 'ka',
'deu': 'de',
'ger': 'de',
'ell': 'el',
'gre': 'el',
'grn': 'gn',
'guj': 'gu',
'hat': 'ht',
'hau': 'ha',
'heb': 'he',
'her': 'hz',
'hin': 'hi',
'hmo': 'ho',
'hun': 'hu',
'ina': 'ia',
'ind': 'id',
'ile': 'ie',
'gle': 'ga',
'ibo': 'ig',
'ipk': 'ik',
'ido': 'io',
'isl': 'is',
'ice': 'is',
'ita': 'it',
'iku': 'iu',
'jpn': 'ja',
'jav': 'jv',
'kal': 'kl',
'kan': 'kn',
'kau': 'kr',
'kas': 'ks',
'kaz': 'kk',
'khm': 'km',
'kik': 'ki',
'kin': 'rw',
'kir': 'ky',
'kom': 'kv',
'kon': 'kg',
'kor': 'ko',
'kur': 'ku',
'kua': 'kj',
'lat': 'la',
'ltz': 'lb',
'lug': 'lg',
'lim': 'li',
'lin': 'ln',
'lao': 'lo',
'lit': 'lt',
'lub': 'lu',
'lav': 'lv',
'glv': 'gv',
'mkd': 'mk',
'mac': 'mk',
'mlg': 'mg',
'may': 'ms',
'msa': 'ms',
'mal': 'ml',
'mlt': 'mt',
'mri': 'mi',
'mao': 'mi',
'mar': 'mr',
'mah': 'mh',
'mon': 'mn',
'nau': 'na',
'nav': 'nv',
'nde': 'nd',
'nep': 'ne',
'ndo': 'ng',
'nob': 'nb',
'nno': 'nn',
'nor': 'no',
'iii': 'ii',
'nbl': 'nr',
'oci': 'oc',
'oji': 'oj',
'chu': 'cu',
'orm': 'om',
'ori': 'or',
'oss': 'os',
'pan': 'pa',
'pli': 'pi',
'per': 'fa',
'fas': 'fa',
'pol': 'pl',
'pus': 'ps',
'por': 'pt',
'que': 'qu',
'roh': 'rm',
'run': 'rn',
'ron': 'ro',
'rum': 'ro',
'rus': 'ru',
'san': 'sa',
'srd': 'sc',
'snd': 'sd',
'sme': 'se',
'smo': 'sm',
'sag': 'sg',
'srp': 'sr',
'gla': 'gd',
'sna': 'sn',
'sin': 'si',
'slk': 'sk',
'slo': 'sk',
'slv': 'sl',
'som': 'so',
'sot': 'st',
'spa': 'es',
'sun': 'su',
'swa': 'sw',
'ssw': 'ss',
'swe': 'sv',
'tam': 'ta',
'tel': 'te',
'tgk': 'tg',
'tha': 'th',
'tir': 'ti',
'bod': 'bo',
'tib': 'bo',
'tuk': 'tk',
'tgl': 'tl',
'tsn': 'tn',
'ton': 'to',
'tur': 'tr',
'tso': 'ts',
'tat': 'tt',
'twi': 'tw',
'tah': 'ty',
'uig': 'ug',
'ukr': 'uk',
'urd': 'ur',
'uzb': 'uz',
'ven': 've',
'vie': 'vi',
'vol': 'vo',
'wln': 'wa',
'cym': 'cy',
'wel': 'cy',
'wol': 'wo',
'fry': 'fy',
'xho': 'xh',
'yid': 'yi',
'yor': 'yo',
'zha': 'za',
'zul': 'zu'
};
25 changes: 21 additions & 4 deletions src/peliasDocGenerators.js
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,18 @@ function assignField(hierarchyElement, wofDoc) {

}

/**
 * Registers the per-language names of a record on a document.
 *
 * For each language of `name_langs`, the first entry of its list is passed to
 * `wofDoc.setName` and every following entry to `wofDoc.setNameAlias`, in list
 * order. Languages with an empty list trigger no call.
 *
 * @param {Object} wofDoc - object exposing `setName(lang, name)` and `setNameAlias(lang, name)`.
 * @param {Object} name_langs - map of language code -> list of names.
 */
function addMultiLangAliases(wofDoc, name_langs) {
  for (const langCode in name_langs) {
    const names = name_langs[langCode];
    let position = 0;

    while (position < names.length) {
      const name = names[position];

      if (position > 0) {
        // every entry after the first is an alternative for that language
        wofDoc.setNameAlias(langCode, name);
      } else {
        // the first entry is the name for that language
        wofDoc.setName(langCode, name);
      }

      position += 1;
    }
  }
}

// method that extracts the logic for Document creation. `hierarchy` is optional
function setupDocument(record, hierarchy) {
var wofDoc = new Document( 'whosonfirst', record.place_type, record.id );
Expand All @@ -73,10 +85,15 @@ function setupDocument(record, hierarchy) {
}

// index name aliases for all other records (where available)
else if (record.name_aliases.length) {
record.name_aliases.forEach(alias => {
wofDoc.setNameAlias('default', alias);
});
else {
if (record.name_aliases.length) {
record.name_aliases.forEach(alias => {
wofDoc.setNameAlias('default', alias);
});
}
if (record.name_langs) {
addMultiLangAliases(wofDoc, record.name_langs);
}
}
}
wofDoc.setCentroid({ lat: record.lat, lon: record.lon });
Expand Down
Loading

0 comments on commit 9a3fc2c

Please sign in to comment.