new config/ directory, subdivide downloaded/ directory (#19)
mtmail authored Jul 3, 2022
1 parent e893273 commit c4d925d
Showing 10 changed files with 34 additions and 17 deletions.
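For orientation, the directory layout after this change looks roughly as follows (a sketch assembled from the paths touched in the diff below; anything not named there is an assumption):

config/                           checked-in configuration, e.g. config/languages.txt and config/wikidata_place_types.txt
$BUILDID/downloaded/wikidata/     files fetched by the wikidata download and API steps (with a tmp/ subdirectory used while fetching)
$BUILDID/downloaded/wikipedia/    files fetched by the wikipedia download step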
8 changes: 5 additions & 3 deletions complete_run.sh
@@ -7,14 +7,16 @@

./install_dependencies.sh

export BUILDID=wiki_build_202207
export LANGUAGES=$(grep -v '^#' languages.txt | tr "\n" ",")
export BUILDID=wiki_build_20220620
export LANGUAGES=$(grep -v '^#' config/languages.txt | tr "\n" ",")
export DATABASE_NAME=wikiprocessingdb

./steps/wikipedia_download.sh
./steps/wikidata_download.sh
./steps/wikidata_api_fetch_placetypes.sh

# dropdb wikiprocessingdb
# dropdb $DATABASE_NAME
createdb $DATABASE_NAME
./steps/wikipedia_import.sh
./steps/wikidata_import.sh

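A minimal sketch of what the LANGUAGES export now produces, assuming config/languages.txt lists one language code per line with '#' comment lines (the file contents here are invented; bar and cy are borrowed from the scripts' own defaults):

$ cat config/languages.txt
# wikipedia language editions to process
bar
cy
$ grep -v '^#' config/languages.txt | tr "\n" ","
bar,cy,

The trailing comma is harmless; the download and import steps split the list back into an array, as sketched after wikipedia_import.sh below.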
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion install_dependencies.sh
@@ -4,4 +4,4 @@
# Tested on Ubuntu-20
#

sudo apt-get install -y wget perl
sudo apt-get install -y wget perl coreutils
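coreutils provides realpath and du, which the reworked steps below rely on; a quick, optional way to confirm all four tools are on the PATH (not part of the repository's scripts):

command -v wget perl realpath du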
25 changes: 18 additions & 7 deletions steps/wikidata_api_fetch_placetypes.sh
@@ -1,5 +1,11 @@
#!/bin/bash

# set defaults
: ${BUILDID:=latest}

DOWNLOADED_PATH="$BUILDID/downloaded/wikidata"
TEMP_PATH=$DOWNLOADED_PATH/tmp

echo "====================================================================="
echo "Get wikidata places from wikidata query API"
echo "====================================================================="
@@ -50,9 +56,11 @@ echo "====================================================================="
# https://ja.wikipedia.org/wiki/%E3%82%81%E3%81%8C%E3%81%B2%E3%82%89%E3%82%B9%E3%82%AD%E3%83%BC%E5%A0%B4
# so we leave them in.

mkdir -p $DOWNLOADED_PATH $TEMP_PATH

echo "Number of place types:"
wc -l wikidata_place_types.txt
echo '' > wikidata_place_dump.csv
wc -l config/wikidata_place_types.txt
echo '' > $DOWNLOADED_PATH/wikidata_place_dump.csv

while read PT_LINE ; do
QID=$(echo $PT_LINE | sed 's/;.*//' )
@@ -89,8 +97,8 @@ while read PT_LINE ; do
grep -e "[[:space:]]0$" | \
cut -f2 | \
sort | \
awk -v qid=$QID '{print $0 ","qid}' > $QID.csv
wc -l $QID.csv
awk -v qid=$QID '{print $0 ","qid}' > $TEMP_PATH/$QID.csv
wc -l $TEMP_PATH/$QID.csv

# output example:
# Q97774986,Q130003
@@ -100,6 +108,9 @@ while read PT_LINE ; do
# Q992902,Q130003
# Q995986,Q130003

cat $QID.csv >> wikidata_place_dump.csv
rm $QID.csv
done < wikidata_place_types.txt
cat $TEMP_PATH/$QID.csv >> $DOWNLOADED_PATH/wikidata_place_dump.csv
rm $TEMP_PATH/$QID.csv
done < config/wikidata_place_types.txt

cp config/wikidata_place_types_levels.txt $DOWNLOADED_PATH
rmdir $TEMP_PATH
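A worked sketch of one loop iteration, assuming each line of config/wikidata_place_types.txt has the form "QID;label" (the ski-area label is an assumption; the item QIDs come from the example output shown in the diff). The ": ${BUILDID:=latest}" default at the top only kicks in when complete_run.sh has not already exported a BUILDID.

$ head -1 config/wikidata_place_types.txt
Q130003;ski area
# the query for that place type fills $TEMP_PATH/$QID.csv with item,instance_of pairs:
Q97774986,Q130003
Q992902,Q130003
# each per-type file is appended to $DOWNLOADED_PATH/wikidata_place_dump.csv and then removed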
3 changes: 2 additions & 1 deletion steps/wikidata_download.sh
@@ -13,7 +13,7 @@ echo "====================================================================="
: ${WIKIDATA_DATE:=20220620}


DOWNLOADED_PATH="$BUILDID/downloaded"
DOWNLOADED_PATH="$BUILDID/downloaded/wikidata"

download() {
if [ -e "$2" ]; then
@@ -28,6 +28,7 @@ download() {
rm -f "$2"
exit 1
fi
du -h "$2" | cut -f1
}

# 114M downloaded/geo_tags.sql.gz
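The added du line simply reports each fetched file's size; for example, with the geo_tags dump mentioned in the comment above (the 114M figure comes from that comment, and the path assumes the default BUILDID of "latest"):

$ du -h latest/downloaded/wikidata/geo_tags.sql.gz | cut -f1
114M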
8 changes: 5 additions & 3 deletions steps/wikidata_import.sh
@@ -4,7 +4,9 @@
: ${BUILDID:=latest}
: ${DATABASE_NAME:=wikiprocessingdb}

DOWNLOADED_PATH="$BUILDID/downloaded"
DOWNLOADED_PATH="$BUILDID/downloaded/wikidata"
# postgresql's COPY requires full path
DOWNLOADED_PATH_ABS=$(realpath "$DOWNLOADED_PATH")

psqlcmd() {
psql --quiet $DATABASE_NAME
@@ -44,7 +46,7 @@ echo "CREATE TABLE wikidata_place_dump (
);" | psqlcmd

echo "COPY wikidata_place_dump (item, instance_of)
FROM '$PWD/wikidata_place_dump.csv'
FROM '$DOWNLOADED_PATH_ABS/wikidata_place_dump.csv'
DELIMITER ','
CSV
;" | psqlcmd
@@ -55,7 +57,7 @@ echo "CREATE TABLE wikidata_place_type_levels (
);" | psqlcmd

echo "COPY wikidata_place_type_levels (place_type, level)
FROM '$PWD/wikidata_place_type_levels.csv'
FROM '$DOWNLOADED_PATH_ABS/wikidata_place_type_levels.csv'
DELIMITER ','
CSV
HEADER
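COPY ... FROM is executed by the PostgreSQL server, so a path relative to the client's shell would not resolve reliably; the script therefore resolves the download directory with realpath instead of prefixing $PWD, which only worked while the CSV files still lived in the repository root. A minimal sketch of the effect (the checkout location in the output is invented):

$ BUILDID=latest
$ realpath "$BUILDID/downloaded/wikidata"
/srv/wikipedia-wikidata/latest/downloaded/wikidata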
3 changes: 2 additions & 1 deletion steps/wikipedia_download.sh
@@ -16,7 +16,7 @@ LANGUAGES_ARRAY=($(echo $LANGUAGES | tr ',' ' '))
: ${WIKIPEDIA_DATE:=20220620}


DOWNLOADED_PATH="$BUILDID/downloaded"
DOWNLOADED_PATH="$BUILDID/downloaded/wikipedia"


download() {
@@ -32,6 +32,7 @@ download() {
rm -f "$2"
exit 1
fi
du -h "$2" | cut -f1
}

for LANG in "${LANGUAGES_ARRAY[@]}"
2 changes: 1 addition & 1 deletion steps/wikipedia_import.sh
@@ -6,7 +6,7 @@
: ${LANGUAGES:=bar,cy}
LANGUAGES_ARRAY=($(echo $LANGUAGES | tr ',' ' '))

DOWNLOADED_PATH="$BUILDID/downloaded"
DOWNLOADED_PATH="$BUILDID/downloaded/wikipedia"

psqlcmd() {
psql --quiet $DATABASE_NAME |& \
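For completeness, a sketch of how the comma-separated LANGUAGES value is turned back into a bash array here and in wikipedia_download.sh (the value is assumed, matching the scripts' bar,cy default):

LANGUAGES="bar,cy,"
LANGUAGES_ARRAY=($(echo $LANGUAGES | tr ',' ' '))
echo "${#LANGUAGES_ARRAY[@]}"   # prints 2; the trailing comma from complete_run.sh is dropped
echo "${LANGUAGES_ARRAY[1]}"    # prints cy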
