
Commit

July wrapup (#77)
* assume postgresql already installed

* update timing information
mtmail authored Aug 1, 2023
1 parent 44754e6 commit e08b5a8
Showing 4 changed files with 16 additions and 37 deletions.
11 changes: 5 additions & 6 deletions README.md
@@ -21,7 +21,7 @@ in the results match the search terms).

Wikipedia publishes [dumps](https://meta.wikimedia.org/wiki/Data_dumps) of their databases once per month.

-To run one build you need 160GB of disc space (of which 105GB Postgresql database). The scripts process
+To run one build you need 150GB of disc space (of which 90GB Postgresql database). The scripts process
39 languages and output 4 files. Runtime is approximately 9 hours on a 4 core, 4GB RAM machine with SSD
discs.

@@ -243,7 +243,7 @@ the `languages.txt` file to only run a small language (e.g. Bulgarian) first.
1. latest\_available\_data
Prints a date. Wikipedia exports take many days, then mirrors are sometimes slow copying them. It's not
-uncommon for an export starting Jan/1st to only be full ready Jan/20th.
+uncommon for an export starting Jan/1st to only be full ready Jan/10th or later.
2. wikipedia_download (1h)
@@ -257,7 +257,7 @@ uncommon for an export starting Jan/1st to only be full ready Jan/20th.
Runs 300 SPARQL queries against wikidata servers. Output is 5GB.
-5. wikipedia_sql2csv (4:00h)
+5. wikipedia_sql2csv (4:20h)
The MySQL SQL files get parsed sequentially and we try to exclude as much data (rows,
columns) as possible. Output is 75% smaller than input. Any work done here cuts
@@ -295,14 +295,13 @@ uncommon for an export starting Jan/1st to only be full ready Jan/20th.
enredirect | 10,804,606 rows | 599 MB
```
-8. wikipedia\_process, wikidata\_process (3:00h)
+8. wikipedia\_process, wikidata\_process (2:30h)
Postgresql is great joining large datasets together, especially if not all
data fits into RAM.
```
set counts (0:15h)
-set othercounts (2:30h)
+set othercounts (2:20h)
Create and fill wikipedia_article_full (0.03h)
Create derived tables (0.03h)
Process language pages (0.03h)
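
Step 4 of the pipeline ("wikidata queries", mentioned in the hunk above) fires roughly 300 SPARQL queries at the Wikidata servers. As a rough sketch only, and not one of the project's actual queries, a minimal request against the public query service could look like this (the endpoint URL, query and User-Agent string are illustrative assumptions):

```python
# Illustrative only: a single small query against the public Wikidata SPARQL
# endpoint. The project's real queries, batching and retry handling differ.
import requests

WDQS_URL = "https://query.wikidata.org/sparql"

QUERY = """
SELECT ?item ?sitelinks WHERE {
  ?item wdt:P31 wd:Q515 .               # instances of "city"
  ?item wikibase:sitelinks ?sitelinks .  # number of sitelinks per item
}
LIMIT 5
"""

response = requests.get(
    WDQS_URL,
    params={"query": QUERY, "format": "json"},
    headers={"User-Agent": "wikidata-importance-example/0.1 (demo)"},
    timeout=60,
)
response.raise_for_status()

# Standard SPARQL JSON results format: results -> bindings -> variable -> value
for binding in response.json()["results"]["bindings"]:
    print(binding["item"]["value"], binding["sitelinks"]["value"])
```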
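Step 8 relies on PostgreSQL to join the imported per-language tables. The following is a minimal sketch of the kind of join-and-count query that stage depends on; the table and column names (enpage, enpagelinks, pl_title, count) are assumptions inferred from the CSV columns documented in steps/wikipedia_sql2csv.sh, not the repository's actual schema:

```python
# Rough sketch, not the project's code: join assumed per-language tables
# (enpage, enpagelinks -- names inferred from the CSV columns, may differ)
# to attach incoming-link counts to article titles.
import psycopg2

# assumed DATABASE_NAME, following complete_run.sh's BUILDID pattern
conn = psycopg2.connect(dbname="wikimedia_build_20230801")
with conn, conn.cursor() as cur:
    cur.execute(
        """
        SELECT p.page_title, pl.count AS incoming_links
        FROM enpage p
        JOIN enpagelinks pl ON pl.pl_title = p.page_title
        ORDER BY pl.count DESC
        LIMIT 10
        """
    )
    for title, incoming_links in cur.fetchall():
        print(title, incoming_links)
conn.close()
```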
3 changes: 1 addition & 2 deletions complete_run.sh
@@ -20,7 +20,6 @@ export BUILDID=wikimedia_build_$(date +"%Y%m%d")
export LANGUAGES=$(grep -v '^#' config/languages.txt | tr "\n" ",")
# export LANGUAGES=de,nl
export DATABASE_NAME=$BUILDID
-export DATABASE_TABLESPACE=extraspace # default is pg_default

./steps/wikipedia_download.sh
./steps/wikidata_download.sh
@@ -30,7 +29,7 @@ export DATABASE_TABLESPACE=extraspace # default is pg_default
./steps/wikidata_sql2csv.sh

# dropdb --if-exists $DATABASE_NAME
-createdb --tablespace=$DATABASE_TABLESPACE $DATABASE_NAME
+createdb $DATABASE_NAME
./steps/wikipedia_import.sh
./steps/wikidata_import.sh

19 changes: 0 additions & 19 deletions install_dependencies.sh
@@ -7,25 +7,6 @@
sudo apt-get install -y postgresql-14
sudo -u postgres createuser -s $USER

-# The database can be 100GB or more. If you want to create it on a separate
-# drive you can try:
-#
-# sudo -u postgres psql -c 'SELECT * FROM pg_tablespace;'
-# # oid | spcname | spcowner | spcacl | spcoptions
-# #------+------------+----------+--------+------------
-# # 1663 | pg_default | 10 | |
-# # 1664 | pg_global | 10 | |
-#
-# EXTRASPACE_PATH=/mnt/HC_Volume_21300566/postgres-data
-# sudo mkdir -p $EXTRASPACE_PATH
-# sudo chown postgres $EXTRASPACE_PATH
-# sudo chgrp postgres $EXTRASPACE_PATH
-#
-# sudo -u postgres psql -c "CREATE TABLESPACE extraspace LOCATION '$EXTRASPACE_PATH';"
-# sudo -u postgres psql -c 'SELECT * FROM pg_tablespace;'



sudo apt-get install -y wget coreutils nodejs jq moreutils pigz
sudo apt-get install -y python3-dev python3-pip python3-setuptools build-essential

20 changes: 10 additions & 10 deletions steps/wikipedia_sql2csv.sh
@@ -35,11 +35,11 @@ do
# Only interested in page_namespace == 0 (articles)
# English wikipedia:
# input 1.9GB compressed
-# output 190MB compressed
+# output 200MB compressed
# Output columns: page_id, page_title

unpigz -c $DOWNLOADED_PATH/$LANG/page.sql.gz | \
-./bin/mysqldump_to_csv.py | \
+bin/mysqldump_to_csv.py | \
bin/filter_page.py | \
pigz -9 > $CONVERTED_PATH/$LANG/pages.csv.gz

@@ -53,13 +53,13 @@ do
#
# Only interested in pl_namespace == 0 (articles)
# English wikipedia:
-# input 6.8GB compressed (54GB uncompressed)
-# output 450MB compressed (3.1GB uncompressed)
+# input 6.8GB compressed
+# output 200MB compressed
# Output columns: pl_title, count

unpigz -c $DOWNLOADED_PATH/$LANG/pagelinks.sql.gz | \
-./bin/mysqldump_to_csv.py | \
-/usr/bin/time -v bin/filter_pagelinks.py | \
+bin/mysqldump_to_csv.py | \
+bin/filter_pagelinks.py | \
pigz -9 > $CONVERTED_PATH/$LANG/pagelinks.csv.gz


@@ -73,10 +73,10 @@ do
# Output is sorted by lang
# English wikipedia:
# input 400MB compressed (1.5GB uncompressed)
-# output 380MB compressed (1.3GB uncompressed)
+# output 310MB compressed (1.3GB uncompressed)

unpigz -c $DOWNLOADED_PATH/${LANG}/langlinks.sql.gz | \
-./bin/mysqldump_to_csv.py | \
+bin/mysqldump_to_csv.py | \
bin/filter_langlinks.py | \
pigz -9 > $CONVERTED_PATH/$LANG/langlinks.csv.gz

@@ -93,10 +93,10 @@ do
# Output columns: rd_from_page_id, rd_title
# English wikipedia:
# input 140MB compressed (530MB uncompressed)
-# output 100MB compressed (300MB uncompressed)
+# output 120MB compressed (300MB uncompressed)

unpigz -c $DOWNLOADED_PATH/$LANG/redirect.sql.gz | \
-./bin/mysqldump_to_csv.py | \
+bin/mysqldump_to_csv.py | \
bin/filter_redirect.py | \
pigz -9 > $CONVERTED_PATH/$LANG/redirect.csv.gz

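Each pipeline in this file unpacks a MySQL dump, converts it to CSV and filters it down as early as possible. As a hypothetical stand-in for a filter such as bin/filter_page.py (not the actual script), the core logic could be as small as this, assuming the standard MediaWiki `page` column order of page_id, page_namespace, page_title:

```python
#!/usr/bin/env python3
# Hypothetical stand-in for a filter like bin/filter_page.py (not the actual
# script): read the CSV produced by mysqldump_to_csv.py on stdin, keep only
# main-namespace pages and print page_id,page_title on stdout.
#
# Column positions assume the standard MediaWiki `page` table layout:
# page_id, page_namespace, page_title, ...
import csv
import sys

reader = csv.reader(sys.stdin)
writer = csv.writer(sys.stdout)

for row in reader:
    if len(row) < 3:
        continue          # skip malformed or truncated rows
    page_id, namespace, title = row[0], row[1], row[2]
    if namespace != "0":
        continue          # articles only (namespace 0)
    writer.writerow([page_id, title])
```

Dropping rows and columns at this stage is what keeps the later PostgreSQL import and joins manageable.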

