Skip to content

Commit 781a975

Browse files
committed
Add support for Commons files
Rewritten in C++ because the C++ library BBHash helps avoid loading entire keyfile into RAM.
1 parent 1ded838 commit 781a975

10 files changed

+1928
-109
lines changed

.gitignore

+2-6
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,2 @@
1-
commonscats-in-commons.txt
2-
commonscats-in-osm.xml
3-
*.gz
4-
*.geojson
5-
*.tsv
6-
*.pbf
1+
data/*
2+
out/*

LICENSE

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
Copyright (c) 2023 Lennard Hofmann
1+
Copyright (c) 2015 Guillaume Rizk (BBHash)
2+
Copyright (c) 2023 Lennard Hofmann (CommonsChecker4OSM)
23

34
Permission is hereby granted, free of charge, to any person obtaining a copy
45
of this software and associated documentation files (the "Software"), to deal

Makefile

+41-22
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,53 @@
1-
.PHONY: geojson
2-
geojson: out.geojson out-maproulette.geojson
1+
yesterday != date -d yesterday +"%Y%m%d"
32

4-
.PHONY: tsv
5-
tsv: out.tsv
3+
CXX ?= g++
4+
CFLAGS = -O3 -std=c++17 -lpthread
65

7-
planet.pbf:
8-
@echo "Missing planet.pbf: See https://wiki.openstreetmap.org/wiki/Planet.osm#Downloading for how to download an Osmium-compatible planet file." >&2; exit 1
9-
# You could add a command here to download planet.pbf with a BitTorrent client
6+
main: src/main.cpp src/BooPHF.h
7+
$(CXX) -o $@ $< $(CFLAGS)
108

11-
commonswiki-all-titles.gz: FORCE
12-
curl -z commonswiki-all-titles.gz https://dumps.wikimedia.org/commonswiki/latest/commonswiki-latest-all-titles.gz -o commonswiki-all-titles.gz
139

14-
commonscats-in-commons.txt: commonswiki-all-titles.gz
10+
### NECESSARY INTERMEDIATE FILES: --------------------------------
11+
12+
data/planet.pbf:
13+
@echo "Missing data/planet.pbf: See https://wiki.openstreetmap.org/wiki/Planet.osm#Downloading for how to download an Osmium-compatible planet file." >&2; exit 1
14+
# You could add a command here to download data/planet.pbf with a BitTorrent client
15+
16+
data/commonswiki-all-titles.gz: FORCE
17+
curl -z data/commonswiki-all-titles.gz https://dumps.wikimedia.org/commonswiki/latest/commonswiki-latest-all-titles.gz -o data/commonswiki-all-titles.gz
18+
19+
data/commonswiki-cats.txt: data/commonswiki-all-titles.gz
1520
# namespace 14 is "Category:"
16-
zcat commonswiki-all-titles.gz | grep ^14 | cut -f2 > commonscats-in-commons.txt
21+
zcat data/commonswiki-all-titles.gz | grep ^14 | cut -f2 > data/commonswiki-cats.txt
22+
23+
data/commonswiki-files.txt.gz: FORCE
24+
curl -z data/commonswiki-files.txt.gz https://dumps.wikimedia.org/other/mediatitles/$(yesterday)/commonswiki-$(yesterday)-all-media-titles.gz -o data/commonswiki-files.txt.gz
25+
26+
data/commonswiki-files.txt: data/commonswiki-files.txt.gz
27+
gunzip --keep data/commonswiki-files.txt.gz
28+
29+
data/planet-filtered.tsv: data/planet.pbf
30+
osmium tags-filter -R data/planet.pbf 'nwr/wikimedia_commons' -f xml | ./xml2tsv.py > data/planet-filtered.tsv
31+
32+
data/planet-filtered.geojson: data/planet.pbf
33+
osmium tags-filter -t data/planet.pbf 'nwr/wikimedia_commons' -o data/planet-filtered.pbf --overwrite
34+
osmium export data/planet-filtered.pbf -c src/config.json -o data/planet-filtered.geojson -f jsonseq --overwrite
35+
1736

18-
commonscats-in-osm.xml: planet.pbf
19-
osmium tags-filter -R planet.pbf 'nwr/wikimedia_commons=Category:*' -o commonscats-in-osm.xml --overwrite
37+
### OUTPUT FILES: ------------------------------------------------
2038

21-
planet-filtered.geojson: planet.pbf
22-
osmium tags-filter -t planet.pbf 'nwr/wikimedia_commons=Category:*' -o planet-filtered.pbf --overwrite
23-
osmium export planet-filtered.pbf -c config.json -o planet-filtered.geojson -f jsonseq --overwrite
39+
out/cats.tsv: main data/commonswiki-cats.txt data/planet-filtered.tsv
40+
# ./main.py Category: data/commonswiki-cats.txt data/planet-filtered.tsv > out/cats.tsv
41+
./main Category: $$(wc -l data/commonswiki-cats.txt) data/planet-filtered.tsv $$(nproc) > out/cats.tsv
2442

25-
out.tsv: commonscats-in-osm.xml commonscats-in-commons.txt
26-
./main.py out.tsv
43+
out/cats.geojson: main data/commonswiki-cats.txt data/planet-filtered.geojson
44+
# ./main.py Category: data/commonswiki-cats.txt data/planet-filtered.geojson > out/cats.geojson
45+
./main Category: $$(wc -l data/commonswiki-cats.txt) data/planet-filtered.geojson $$(nproc) > out/cats.geojson
2746

28-
out.geojson: planet-filtered.geojson commonscats-in-commons.txt
29-
./main.py out.geojson
47+
out/files.tsv: main data/commonswiki-files.txt data/planet-filtered.tsv
48+
./main File: $$(wc -l data/commonswiki-files.txt) data/planet-filtered.tsv $$(nproc) > out/files.tsv
3049

31-
out-maproulette.geojson: out.geojson
32-
./to_maproulette.py out.geojson > out-maproulette.geojson
50+
out/files.geojson: main data/commonswiki-files.txt data/planet-filtered.geojson
51+
./main File: $$(wc -l data/commonswiki-files.txt) data/planet-filtered.geojson $$(nproc) > out/files.geojson
3352

3453
FORCE: ;

README.md

+53-28
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,78 @@
1-
# CommonsCatOSM
1+
# CommonsChecker4OSM
22

3-
Finds OpenStreetMap elements tagged with `wikimedia_commons=Category:*` where the category does not exist on Wikimedia Commons.
3+
Finds OpenStreetMap elements tagged with `wikimedia_commons=Category:*` (or `wikimedia_commons=File:*`) where the category (or file) does not exist on Wikimedia Commons.
4+
5+
## How does it work
6+
7+
CommonsChecker4OSM filters a list of OSM elements tagged with `wikimedia_commons=*` against a list of valid Commons categories or files. It is basically an `fgrep -f` implementation that processes a 5-gigabyte pattern file efficiently using the [BBHash](https://github.com/rizkg/BBHash) minimal perfect hashing library. A Makefile is provided to download and prepare the input files. A Python script [main.py](./main.py) is included that may help you understand the equivalent C++ code, but it loads the entire pattern file into RAM.
48

59
## How to use
610

7-
To minimize false positives, wait until Wikimedia releases a new data dump (usually at the third day of every month). You can get notified by subscribing to [the RSS feed](https://dumps.wikimedia.org/commonswiki/latest/commonswiki-latest-all-titles.gz-rss.xml).
11+
If you want to find invalid categories, you should wait until Wikimedia releases a new data dump (usually on the third day of every month) to minimize false positives. You can get notified by subscribing to [the RSS feed](https://dumps.wikimedia.org/commonswiki/latest/commonswiki-latest-all-titles.gz-rss.xml). If you want to find invalid files, you do not need to wait—the data dump is released every day.
12+
13+
CommonsChecker4OSM has only been tested on Linux. It should work on Windows, but preparing the input files will be annoying. Step-by-step guide:
814

915
1. Install [Osmium Tool](https://osmcode.org/osmium-tool/)
1016
2. Clone this repository
11-
3. Download required files into the same directory:
17+
3. Inside the cloned repository, create the `data` and `out` folders
18+
3. Download required files into the `data` directory:
1219
* `planet.pbf`: an Osmium-compatible [OpenStreetMap planet](https://wiki.openstreetmap.org/wiki/Planet.osm)
13-
* [commonswiki-latest-all-titles.gz](https://dumps.wikimedia.org/commonswiki/latest/commonswiki-latest-all-titles.gz) (1.1 GB as of 2023)
14-
4. Choose an output format (see below). If you use a Unix-like operating system, you can run `make tsv` or `make geojson`. Otherwise, run the commands given below.
20+
4. Choose an output format (see below). Run either `make out/cats.tsv`, `make out/cats.geojson`, `make out/files.tsv`, or `make out/files.geojson` (output filename must be one of these hardcoded strings).
1521

1622
## Output formats
1723

1824
### Tab-separated values (TSV)
1925

20-
Output contains invalid category names and the OpenStreetMap ID of the node, way, or relation with that category. This is the fastest format to produce.
26+
Output contains the OSM identifier (node, way, or relation ID) and the `wikimedia_commons` value, separated by a tab. This is the fastest format to produce.
2127

2228
```console
23-
# Make sure you have 10min to spare
24-
$ osmium tags-filter -R planet.pbf 'nwr/wikimedia_commons=Category:*' -o commonscats-in-osm.xml
25-
$ python main.py out.tsv
26-
$ cat out.tsv
27-
n/1573735855 Conservatoire_National_de_V%C3%A9hicules_Historiques
28-
n/286133524 ref:sprockhoff No. 465
29-
n/3022117073 Wildwiesenwarte;Category:Views from the Wildwiesenwarte
30-
n/306593910 Prince George pub, Brighton Good pictures Advanced...
31-
n/6426478285 Dorfkirche_Mechow_(Kyritz)?uselang=de
32-
w/297069904 https://commons.wikimedia.org/wiki/Category:Gr%C3%BCner_Graben_14_(G%C3%B6rlitz)
33-
w/320276921 Ballyellen Upper Lock
34-
w/474166824 Nages-et-Solorgues#/media/File:Fontaine_Ranquet.jpg
35-
r/12931220 Brandenburger Straße 36;Riedelsberger Weg 2 (Bayreuth)
29+
$ make out/cats.tsv
30+
# osmium takes 10min for entire planet
31+
$ cat out/cats.tsv
32+
n/1573735855 Category:Conservatoire_National_de_V%C3%A9hicules_Historiques
33+
n/286133524 Category:ref:sprockhoff No. 465
34+
n/3022117073 Category:Wildwiesenwarte;Category:Views from the Wildwiesenwarte
35+
n/306593910 Category:Prince George pub, Brighton Good pictures Advanced...
36+
n/6426478285 Category:Dorfkirche_Mechow_(Kyritz)?uselang=de
37+
w/297069904 Category:https://commons.wikimedia.org/wiki/Category:Gr%C3%BCner_Graben_14_(G%C3%B6rlitz)
38+
w/320276921 Category: Ballyellen Upper Lock
39+
w/474166824 Category:Nages-et-Solorgues#/media/File:Fontaine_Ranquet.jpg
40+
r/12931220 Category:Brandenburger Straße 36;Riedelsberger Weg 2 (Bayreuth)
3641
```
3742

3843
### Line-by-Line GeoJSON
3944

4045
[This format](https://learn.maproulette.org/documentation/line-by-line-geojson/) can be used to create a challenge on maproulette.org. It might lack a few categories that are present in the TSV format.
4146

4247
```console
43-
# Make sure you have 3GB RAM and 25min to spare
44-
$ osmium tags-filter -t planet.pbf 'nwr/wikimedia_commons=Category:*' -o planet-filtered.pbf
45-
$ osmium export planet-filtered.pbf -c config.json -o planet-filtered.geojson -f jsonseq
46-
$ python main.py out.geojson
47-
$ cat out.geojson
48-
{"type":"Feature","geometry":{"type":"Point","coordinates":[-2.0835284,53.3600557]},"properties":{"@type":"node","@id":29947059,"wikimedia_commons":"Category:Help Category:Middlewood railway station"}}
49-
{"type":"Feature","geometry":{"type":"LineString","coordinates":[[10.1212064,54.3247979],[10.120334100000001,54.3242942],[10.1192733,54.3236981],[10.1199922,54.3233703],[10.1204628,54.3231298],[10.1209243,54.3228965],[10.1222211,54.3236181],[10.1212064,54.3247979]]},"properties":{"@type":"way","@id":9408975,"wikimedia_commons":"Category:Wilhelmplatz (Kiel)"}}
50-
$ python to_maproulette.py out.geojson
48+
$ make out/cats.geojson
49+
# osmium takes 22min for entire planet
50+
$ cat out/cats.geojson
5151
{"type":"FeatureCollection","features":[{"type":"Feature","geometry":{"type":"Point","coordinates":[-2.0835284,53.3600557]},"properties":{"@type":"node","@id":29947059,"wikimedia_commons":"Category:Help Category:Middlewood railway station"}}]}
5252
{"type":"FeatureCollection","features":[{"type":"Feature","geometry":{"type":"LineString","coordinates":[[10.1212064,54.3247979],[10.120334100000001,54.3242942],[10.1192733,54.3236981],[10.1199922,54.3233703],[10.1204628,54.3231298],[10.1209243,54.3228965],[10.1222211,54.3236181],[10.1212064,54.3247979]]},"properties":{"@type":"way","@id":9408975,"wikimedia_commons":"Category:Wilhelmplatz (Kiel)"}}]}
5353
```
54+
55+
## License
56+
57+
CommonsChecker4OSM and BBHash are licensed under the MIT license:
58+
59+
Copyright (c) 2015 Guillaume Rizk (BBHash)
60+
Copyright (c) 2023 Lennard Hofmann (CommonsChecker4OSM)
61+
62+
Permission is hereby granted, free of charge, to any person obtaining a copy
63+
of this software and associated documentation files (the "Software"), to deal
64+
in the Software without restriction, including without limitation the rights
65+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
66+
copies of the Software, and to permit persons to whom the Software is
67+
furnished to do so, subject to the following conditions:
68+
69+
The above copyright notice and this permission notice shall be included in all
70+
copies or substantial portions of the Software.
71+
72+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
73+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
74+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
75+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
76+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
77+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
78+
SOFTWARE.

main.py

+47-43
Original file line numberDiff line numberDiff line change
@@ -1,60 +1,64 @@
11
#!/usr/bin/env python3
2-
import xml.etree.ElementTree as ET
3-
import gzip
42
import sys
53
import json
64

7-
VALID_CATS = "commonscats-in-commons.txt"
8-
9-
def load_valid_categories():
10-
cats = set()
11-
try:
12-
with open(VALID_CATS) as f:
13-
for line in f:
14-
cats.add(line.rstrip('\n'))
15-
except FileNotFoundError:
16-
with gzip.open("commonswiki-latest-all-titles.gz", "rt") as f_in, open(VALID_CATS, "w") as f_out:
17-
for line in f_in:
18-
ns, category_name = line.split('\t', 1)
19-
if ns == '14': # "Category:" namespace
20-
cats.add(category_name.rstrip('\n'))
21-
f_out.write(category_name)
22-
return cats
23-
24-
25-
def tsv(cats, outfile):
26-
with open(outfile, "w") as f:
27-
tree = ET.parse("commonscats-in-osm.xml")
28-
for elem in tree.findall(".//*tag[@k='wikimedia_commons']/.."):
29-
wikimedia_commons = elem.find("tag[@k='wikimedia_commons']").get('v').removeprefix("Category:")
30-
if wikimedia_commons.replace(' ', '_') not in cats:
31-
f.write(f"{elem.tag[:1]}/{elem.get('id')}\t{wikimedia_commons}\n")
32-
33-
34-
def geojson(cats, outfile):
35-
with open(outfile, "wb") as f:
36-
for line in open("planet-filtered.geojson", "rb"):
37-
wikimedia_commons = json.loads(line[1:])["properties"]["wikimedia_commons"]
38-
if wikimedia_commons.removeprefix("Category:").replace(' ', '_') not in cats:
39-
f.write(line)
5+
class InvalidCommonsTagDetector:
6+
def __init__(self, prefix, keyfile):
7+
self.prefix = prefix
8+
with open(keyfile) as f:
9+
self.titles = {line for line in f}
10+
11+
def detect(self, str):
12+
return (str.startswith(self.prefix) and
13+
str.removeprefix(self.prefix).replace(' ', '_') not in self.titles)
14+
15+
16+
def print_tsv(detector: InvalidCommonsTagDetector, tsvfile):
17+
for line in open(tsvfile, "r"):
18+
fields = line.split('\t')
19+
if len(fields) != 2:
20+
sys.stderr.write("malformed input line: " + line)
21+
continue
22+
if detector.detect(fields[1]):
23+
sys.stdout.write(line)
24+
25+
26+
def print_geojson(detector: InvalidCommonsTagDetector, geojsonfile):
27+
for line in open(geojsonfile, "rb"):
28+
if detector.detect(json.loads(line[1:])["properties"]["wikimedia_commons"] + '\n'):
29+
# MapRoulette-friendly output format:
30+
sys.stdout.buffer.write(b'\x1e{"type":"FeatureCollection","features":[%s]}\n'
31+
% line[1:].rstrip(b'\n'))
4032

4133

4234
def usage():
43-
print("""Usage: ./main.py outfile.geojson
44-
./main.py outfile.tsv""",
35+
print("""Usage: ./main.py <prefix> <keyfile> <osmfile>
36+
37+
Arguments:
38+
<prefix>: namespace prefix (e.g. "Category:" or "File:")
39+
<keyfile>: list of MediaWiki page titles (one per line)
40+
<osmfile>: .tsv or .geojson file generated with make
41+
42+
Example:
43+
./main.py Category: data/commonswiki-cats.txt data/planet-filtered.tsv""",
4544
file=sys.stderr)
4645
exit(1)
4746

4847

4948
def main():
50-
if len(sys.argv) != 2:
49+
if len(sys.argv) != 4:
5150
usage()
5251

53-
outfile = sys.argv[1]
54-
if outfile.endswith(".tsv"):
55-
tsv(load_valid_categories(), outfile)
56-
elif outfile.endswith(".geojson"):
57-
geojson(load_valid_categories(), outfile)
52+
prefix = sys.argv[1]
53+
keyfile = sys.argv[2]
54+
osmfile = sys.argv[3]
55+
56+
if osmfile.endswith(".tsv"):
57+
detector = InvalidCommonsTagDetector(prefix, keyfile)
58+
print_tsv(detector, osmfile)
59+
elif osmfile.endswith(".geojson"):
60+
detector = InvalidCommonsTagDetector(prefix, keyfile)
61+
print_geojson(detector, osmfile)
5862
else:
5963
usage()
6064

0 commit comments

Comments
 (0)