-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_wiki_markup.py
112 lines (104 loc) · 3.67 KB
/
parse_wiki_markup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#
# Copyright by Jussi Kujala ([email protected])
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
#
#
#
# Parses wikipedia dumps to "text-wiki" format.
# Output has one article per line, in format "title tab string-escaped article content"
# String-escaped means that control characters such as new line are written as \n
#
# usage: cat wiki.xml | python parse_wiki_markup.py > corpus.txt
#
# note: dividing program to functions is SO outdated :)
# note: many wiki markup constructs such as &quot; or &hyphen; are not parsed, but the output is on average quite good
# note: recursive [[ x | [[ y | z ]] ]] is not parsed correctly!
# todo: fix these remaining problems!
import cStringIO
import re
import sys
# Used to discard some metadata: matches the markers that open or close a
# discarded region -- "&lt;" / "&gt;" (escaped tag brackets) and the
# "{{" / "}}" template delimiters.
re_all = re.compile(r"&[lg]t;|\{\{|\}\}")


def strip_markup(text):
    """Return *text* without templates and escaped tags.

    Scans *text* for the markers matched by ``re_all`` and keeps only the
    stretches that lie outside every open region:

    * everything between ``{{`` and ``}}`` is discarded (nesting is tracked);
    * everything between ``&lt;`` and ``&gt;`` is discarded, i.e. the tag
      itself, e.g. ``&lt;center&gt;``;
    * the text immediately preceding a closing tag (``&lt;/``) is discarded
      as well, which removes the content between a pair of tags.

    If a region is still open when the text ends, the trailing text is
    dropped.  Other constructs such as ``&quot;`` are not handled.
    """
    kept = []
    lt_depth = 0     # number of "&lt;" currently open
    block_depth = 0  # number of "{{" currently open
    pos = 0          # end of the previous marker
    for m in re_all.finditer(text):
        loc = m.start()
        token = m.group()
        # startswith() cannot run past the end of the text, unlike indexing
        # text[loc + 4] when "&lt;" is the very last thing in the article.
        is_closing_tag = text.startswith("&lt;/", loc)
        if lt_depth == 0 and block_depth == 0 and not is_closing_tag:
            kept.append(text[pos:loc])
        if token == "{{":
            block_depth += 1
        elif token == "}}":
            block_depth = max(0, block_depth - 1)
        elif block_depth == 0:
            # Tag brackets are only counted outside of templates.
            if token == "&gt;":
                lt_depth = max(0, lt_depth - 1)
            else:
                lt_depth += 1
        pos = m.end()
    if lt_depth == 0 and block_depth == 0:
        kept.append(text[pos:])
    return "".join(kept)


def flatten_links(text):
    """Return *text* with ``[[...]]`` links replaced by their visible text.

    ``[[target]]`` becomes ``target`` and ``[[target|label]]`` becomes
    ``label`` (the part after the last ``|``).  A link that is never closed
    extends to the end of the text; a ``[[`` with nothing after it is dropped.

    Note: recursive ``[[ x | [[ y | z ]] ]]`` is not parsed correctly -- the
    link is cut at the first ``]]``; only a leading ``[[`` on the label is
    stripped.
    """
    out = []
    pos = 0
    size = len(text)
    while True:
        start = text.find("[[", pos)
        if start == -1:
            out.append(text[pos:])
            break
        out.append(text[pos:start])
        inner = start + 2
        if inner >= size:
            # "[[" was the last thing in the text: nothing left to emit.
            break
        close = text.find("]]", inner)
        if close == -1:
            link = text[inner:]
            pos = size
        else:
            link = text[inner:close]
            pos = close + 2
        if "|" in link:
            label = link.split("|")[-1]
            if label.startswith("[["):
                label = label[2:]
            out.append(label)
        else:
            out.append(link)
    return "".join(out)


def main():
    """Convert a wikipedia XML dump on stdin to "text-wiki" lines on stdout.

    For every article whose collected text is longer than 20 characters one
    line ``title<TAB>content`` is written, where the content has had
    apostrophes, templates, tags and link markup removed and is
    string-escaped.  Python 2 only: relies on the "string-escape" codec.
    """
    inside = False   # True while between a <text ...> line and </text>
    title = None
    chunks = []      # lines of the current article
    for line in sys.stdin:
        cont = False
        if '<title>' in line and '</title>' in line:
            title = line[line.find('<title>') + 7:line.find('</title>')]
        if '<text' in line:
            inside = True
            cont = True  # the opening line itself is not collected
            # NOTE(review): nesting reconstructed -- presumably the skipped
            # opening line may start a template, so its "{{" is re-added to
            # keep the depth tracking balanced; confirm against the original.
            if line.count('{{') > line.count('}}'):
                chunks.append("{{")
        if '</text>' in line:
            body = "".join(chunks)
            if title is not None and len(body) > 20:
                # write article content; apostrophes are discarded first
                body = flatten_links(strip_markup(body.replace("'", "")))
                sys.stdout.write(title + '\t')
                # The codec escapes byte by byte, so escaping the whole
                # article at once equals escaping it piece by piece.
                sys.stdout.write(body.encode("string-escape"))
                sys.stdout.write('\n')
            # Reset the per-article state, also for skipped articles.
            title = None
            chunks = []
            inside = False
        if inside and not cont:
            chunks.append(line)


if __name__ == "__main__":
    main()