move content from osm-search/Nominatim repository
mtmail committed Jul 1, 2020
1 parent f1f2569 commit 403060d
Showing 8 changed files with 2,218 additions and 0 deletions.
63 changes: 63 additions & 0 deletions README.md
@@ -0,0 +1,63 @@
## Add Wikipedia and Wikidata to Nominatim

OSM contributors frequently tag items with links to Wikipedia and Wikidata. Nominatim can use the page ranking of Wikipedia pages to help indicate the relative importance of OSM features. It does this by calculating an importance score between 0 and 1, based on the number of inlinks to the article about a location. If two places share a name and one is more important than the other, the Wikipedia score usually points to the correct one.
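
As a sketch of the idea (the inlink tally column is assumed here to be named `totalcount` and computed by `import_wikipedia.sh`; the exact formula in the scripts may differ):

```
-- log-scale each article's inlink count against the best-linked article,
-- yielding an importance score between 0 and 1
UPDATE wikipedia_article
   SET importance = LOG(totalcount)
                    / LOG((SELECT MAX(totalcount) FROM wikipedia_article));
```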

These scripts extract and prepare both Wikipedia page rank and Wikidata links for use in Nominatim.

#### Create a new PostgreSQL database for processing

Because the initial and intermediate tables are large, processing is best done in an external database:
```
CREATE DATABASE wikiprocessingdb;
```
---
Wikipedia
---

Processing these data requires a large amount of disk space (~1TB) and considerable time (>24 hours).

#### Import & Process Wikipedia tables

This step downloads [Wikipedia](https://dumps.wikimedia.org/) page-data SQL dumps, converts them to PostgreSQL, and imports them; pagelink information from the Wikipedia language sites is then used to calculate importance scores.
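
Each language's dump goes through the same download-convert-import pipeline; a minimal sketch for a single language (the `enwiki` file name is an example, and `mysql2pgsql.perl` ships with these scripts):

```
# fetch one dump table and stream it into the processing database
wget --quiet --no-clobber https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-page.sql.gz
gzip -dc enwiki-latest-page.sql.gz \
  | ./mysql2pgsql.perl /dev/stdin /dev/stdout \
  | psql --quiet wikiprocessingdb
```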

- The script processes data from whatever set of Wikipedia languages is specified in the initial languages array

- Note that processing the top 40 Wikipedia languages can take over a day and will add nearly 1TB to the processing database. The final output tables will be approximately 11GB and 2GB in size.

To download, convert, and import the data, then process summary statistics and compute importance scores, run:
```
./import_wikipedia.sh
```
---
Wikidata
---

This script downloads and processes Wikidata to enrich the previously created Wikipedia tables for use in Nominatim.

#### Import & Process Wikidata

This step downloads [Wikidata](https://dumps.wikimedia.org/wikidatawiki/) SQL dumps, converts them to PostgreSQL, imports them, and processes them for loading into the Nominatim database. It also uses the Wikidata Query Service API to discover and include place types.
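
The place-type discovery issues one SPARQL query per entry in `wikidata_place_types.txt`; the query below is the one sent by `import_wikidata.sh`, shown with `Q515` ("city") as an example QID:

```
# wdt:P31 = "instance of", wdt:P279 = "subclass of"; the * paths walk the class
# hierarchy, so this matches items that are instances of Q515 or of any of its
# transitive subclasses
SELECT ?item WHERE { ?item wdt:P31*/wdt:P279* wd:Q515 . }
```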

- The script presumes that the user has already processed the Wikipedia tables as described above

- The script requires `wikidata_place_types.txt` and `wikidata_place_type_levels.csv`

- The script requires the [jq JSON parser](https://stedolan.github.io/jq/)

- The script processes data from whatever set of Wikipedia languages is listed in `languages.txt` (see the example after this list)

- The script queries the Wikidata Query Service API and imports all instances of the place types listed in `wikidata_place_types.txt`

- The script updates the `wikipedia_article` table with the extracted Wikidata
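
For reference, `languages.txt` holds one Wikipedia language code per line (the codes shown here are examples):

```
en
de
fr
```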

By including Wikidata in the `wikipedia_article` table, new connections can be made on the fly from the Nominatim `placex` table to `wikipedia_article` importance scores.
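
A hedged sketch of such an on-the-fly lookup (this assumes a standard Nominatim `placex` table whose `extratags` hstore carries the feature's `wikidata` tag; exact column names can vary by Nominatim version):

```
-- fetch the importance score for every OSM feature tagged with a wikidata ID
SELECT p.osm_type, p.osm_id, wa.importance
  FROM placex p
  JOIN wikipedia_article wa
    ON wa.wd_page_title = p.extratags->'wikidata'
 WHERE p.extratags ? 'wikidata';
```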

To download, convert, and import the data, then process required items, run:
```
./import_wikidata.sh
```


License
-------
The source code is available under a GPLv2 license.
274 changes: 274 additions & 0 deletions import_wikidata.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,274 @@
#!/bin/bash

psqlcmd() {
psql --quiet wikiprocessingdb
}

mysql2pgsqlcmd() {
./mysql2pgsql.perl /dev/stdin /dev/stdout
}

download() {
echo "Downloading $1"
wget --quiet --no-clobber --tries 3 "$1"
}

# languages to process (refer to List of Wikipedias here: https://en.wikipedia.org/wiki/List_of_Wikipedias)
# requires Bash 4.0
readarray -t LANGUAGES < languages.txt



echo "====================================================================="
echo "Download wikidata dump tables"
echo "====================================================================="

# 114M wikidatawiki-latest-geo_tags.sql.gz
# 1.7G wikidatawiki-latest-page.sql.gz
# 1.2G wikidatawiki-latest-wb_items_per_site.sql.gz
download https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-geo_tags.sql.gz
download https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-page.sql.gz
download https://dumps.wikimedia.org/wikidatawiki/latest/wikidatawiki-latest-wb_items_per_site.sql.gz




echo "====================================================================="
echo "Import wikidata dump tables"
echo "====================================================================="

echo "Importing wikidatawiki-latest-geo_tags"
gzip -dc wikidatawiki-latest-geo_tags.sql.gz | mysql2pgsqlcmd | psqlcmd

echo "Importing wikidatawiki-latest-page"
gzip -dc wikidatawiki-latest-page.sql.gz | mysql2pgsqlcmd | psqlcmd

echo "Importing wikidatawiki-latest-wb_items_per_site"
gzip -dc wikidatawiki-latest-wb_items_per_site.sql.gz | mysql2pgsqlcmd | psqlcmd






echo "====================================================================="
echo "Get wikidata places from wikidata query API"
echo "====================================================================="

echo "Number of place types:"
wc -l wikidata_place_types.txt

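# For each QID in wikidata_place_types.txt, ask the Wikidata Query Service for
# every item that is an instance (wdt:P31) of that type or of one of its
# transitive subclasses (wdt:P279), then flatten the JSON answer into
# "<item>,<type QID>" lines in wikidata_place_dump.csv.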
while read F ; do
echo "Querying for place type $F..."
wget --quiet "https://query.wikidata.org/bigdata/namespace/wdq/sparql?format=json&query=SELECT ?item WHERE{?item wdt:P31*/wdt:P279*wd:$F;}" -O $F.json
jq -r '.results | .[] | .[] | [.item.value] | @csv' $F.json >> $F.txt
awk -v qid=$F '{print $0 ","qid}' $F.txt | sed -e 's!"http://www.wikidata.org/entity/!!' | sed 's/"//g' >> $F.csv
cat $F.csv >> wikidata_place_dump.csv
rm $F.json $F.txt $F.csv
done < wikidata_place_types.txt




echo "====================================================================="
echo "Import wikidata places"
echo "====================================================================="

echo "CREATE TABLE wikidata_place_dump (
item text,
instance_of text
);" | psqlcmd

echo "COPY wikidata_place_dump (item, instance_of)
FROM '/srv/nominatim/Nominatim/data-sources/wikipedia-wikidata/wikidata_place_dump.csv'
DELIMITER ','
CSV
;" | psqlcmd

echo "CREATE TABLE wikidata_place_type_levels (
place_type text,
level integer
);" | psqlcmd

echo "COPY wikidata_place_type_levels (place_type, level)
FROM '/srv/nominatim/Nominatim/data-sources/wikipedia-wikidata/wikidata_place_type_levels.csv'
DELIMITER ','
CSV
HEADER
;" | psqlcmd




echo "====================================================================="
echo "Create derived tables"
echo "====================================================================="

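# Keep only primary, on-Earth coordinates; drop out-of-range values and the
# 0/0 "null island" points, which are almost always mistagged.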
echo "CREATE TABLE geo_earth_primary AS
SELECT gt_page_id,
gt_lat,
gt_lon
FROM geo_tags
WHERE gt_globe = 'earth'
AND gt_primary = 1
AND NOT( gt_lat < -90
OR gt_lat > 90
OR gt_lon < -180
OR gt_lon > 180
OR gt_lat=0
OR gt_lon=0)
;" | psqlcmd

echo "CREATE TABLE geo_earth_wikidata AS
SELECT DISTINCT geo_earth_primary.gt_page_id,
geo_earth_primary.gt_lat,
geo_earth_primary.gt_lon,
page.page_title,
page.page_namespace
FROM geo_earth_primary
LEFT OUTER JOIN page
ON (geo_earth_primary.gt_page_id = page.page_id)
ORDER BY geo_earth_primary.gt_page_id
;" | psqlcmd

echo "ALTER TABLE wikidata_place_dump
ADD COLUMN ont_level integer,
ADD COLUMN lat numeric(11,8),
ADD COLUMN lon numeric(11,8)
;" | psqlcmd

echo "UPDATE wikidata_place_dump
SET ont_level = wikidata_place_type_levels.level
FROM wikidata_place_type_levels
WHERE wikidata_place_dump.instance_of = wikidata_place_type_levels.place_type
;" | psqlcmd

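# Collapse wikidata_place_dump to a single row per item; an item can match
# several place types, so duplicates are frequent here.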
echo "CREATE TABLE wikidata_places
AS
SELECT DISTINCT ON (item) item,
instance_of,
MAX(ont_level) AS ont_level,
lat,
lon
FROM wikidata_place_dump
GROUP BY item,
instance_of,
ont_level,
lat,
lon
ORDER BY item
;" | psqlcmd

echo "UPDATE wikidata_places
SET lat = geo_earth_wikidata.gt_lat,
lon = geo_earth_wikidata.gt_lon
FROM geo_earth_wikidata
WHERE wikidata_places.item = geo_earth_wikidata.page_title
;" | psqlcmd




echo "====================================================================="
echo "Process language pages"
echo "====================================================================="


echo "CREATE TABLE wikidata_pages (
item text,
instance_of text,
lat numeric(11,8),
lon numeric(11,8),
ips_site_page text,
language text
);" | psqlcmd

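# One pass per language: look up each place's page title on that language's
# wiki via wb_items_per_site, then append the rows to wikidata_pages.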
for i in "${LANGUAGES[@]}"
do
echo "CREATE TABLE wikidata_${i}_pages AS
SELECT wikidata_places.item,
wikidata_places.instance_of,
wikidata_places.lat,
wikidata_places.lon,
wb_items_per_site.ips_site_page
FROM wikidata_places
LEFT JOIN wb_items_per_site
ON (CAST (( LTRIM(wikidata_places.item, 'Q')) AS INTEGER) = wb_items_per_site.ips_item_id)
WHERE ips_site_id = '${i}wiki'
AND LEFT(wikidata_places.item,1) = 'Q'
ORDER BY wikidata_places.item
;" | psqlcmd

echo "ALTER TABLE wikidata_${i}_pages
ADD COLUMN language text
;" | psqlcmd

echo "UPDATE wikidata_${i}_pages
SET language = '${i}'
;" | psqlcmd

echo "INSERT INTO wikidata_pages
SELECT item,
instance_of,
lat,
lon,
ips_site_page,
language
FROM wikidata_${i}_pages
;" | psqlcmd
done

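# Wikipedia page titles use underscores instead of spaces; normalize the
# site-page names so they later join against wikipedia_article.title.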
echo "ALTER TABLE wikidata_pages
ADD COLUMN wp_page_title text
;" | psqlcmd
echo "UPDATE wikidata_pages
SET wp_page_title = REPLACE(ips_site_page, ' ', '_')
;" | psqlcmd
echo "ALTER TABLE wikidata_pages
DROP COLUMN ips_site_page
;" | psqlcmd




echo "====================================================================="
echo "Add wikidata to wikipedia_article table"
echo "====================================================================="

echo "UPDATE wikipedia_article
SET lat = wikidata_pages.lat,
lon = wikidata_pages.lon,
wd_page_title = wikidata_pages.item,
instance_of = wikidata_pages.instance_of
FROM wikidata_pages
WHERE wikipedia_article.language = wikidata_pages.language
AND wikipedia_article.title = wikidata_pages.wp_page_title
;" | psqlcmd

echo "CREATE TABLE wikipedia_article_slim
AS
SELECT * FROM wikipedia_article
WHERE wikidata_id IS NOT NULL
;" | psqlcmd

echo "ALTER TABLE wikipedia_article
RENAME TO wikipedia_article_full
;" | psqlcmd

echo "ALTER TABLE wikipedia_article_slim
RENAME TO wikipedia_article
;" | psqlcmd




echo "====================================================================="
echo "Dropping intermediate tables"
echo "====================================================================="

echo "DROP TABLE wikidata_place_dump;" | psqlcmd
echo "DROP TABLE geo_earth_primary;" | psqlcmd
for i in "${LANGUAGES[@]}"
do
echo "DROP TABLE wikidata_${i}_pages;" | psqlcmd
done