-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathwp-extract.groovy
More file actions
executable file
·229 lines (194 loc) · 7.36 KB
/
wp-extract.groovy
File metadata and controls
executable file
·229 lines (194 loc) · 7.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
#!/usr/bin/env groovy
import groovy.time.TimeCategory
import groovy.transform.Field
import groovy.xml.XmlSlurper
import groovy.xml.XmlUtil
import java.text.SimpleDateFormat
@Grab('com.opencsv:opencsv:4.5')
import com.opencsv.CSVWriter;
// configure settings here
// Constants used when writing the output files.
@Field final SEPARATOR = ","   // NOTE(review): not referenced in the visible part of this script
@Field final ENCODING = "UTF-8"

// Get the settings for this batch
// These start empty and are filled in from the console prompts at startup.
@Field def batch = ''
@Field def contentRoot = ''
@Field def dateFormat = ''
@Field def assetsRoot = ''
@Field def domain = ''
@Field def template = ''
@Field def download = true

// CSV writers for the generated data files; opened during setup.
@Field def assetsCsv = null
@Field def fileMappings = null
@Field def replacements = null
@Field def pageCsv = null

// Output directory (second command-line argument) and the generated config files.
@Field def targetDir = null
@Field def filterXml = null
@Field def replacementCfgFile = null
// Capture the start time; the closing summary reports the elapsed time from it.
def start = new Date()
if (args.length < 2) {
    println '\nUsage: groovy wp-extract.groovy [wxr-file] [target]\n'
    System.exit(1)
}
println "Pre-processing WordPress Export ${args[0]} to ${args[1]}"
targetDir = new File(args[1])

// Every batch setting is read interactively, so fail fast with a clear message
// when no console is attached (e.g. input is piped) instead of hitting a
// NullPointerException on the first prompt.
def console = System.console()
if (console == null) {
    System.err.println 'No interactive console available; run this script from a terminal'
    System.exit(1)
}

batch = console.readLine("Batch ID (Wordpress): ") ?: "Wordpress"
contentRoot = console.readLine("Content Root (Required): ")
assert contentRoot
dateFormat = console.readLine("Post Date Format (ex: yyyy/MM/): ")
assetsRoot = console.readLine("Assets Root (Required): ")
assert assetsRoot
domain = console.readLine("Domain Name (Required): ")
assert domain
template = console.readLine("Migration Template (Required): ")
assert template
// equalsIgnoreCase is null-safe, so end-of-input simply counts as "not Y".
download = 'Y'.equalsIgnoreCase(console.readLine("Download Attachments (Y/N): "))
println "Loading into\n\tContent Root: ${contentRoot}\n\tAssets Root: ${assetsRoot}\nUsing\n\tDomain: ${domain}\n\tTemplate: ${template}"
assert 'Y'.equalsIgnoreCase(console.readLine("Continue (Y/N): "))

println 'Setting up folders...'
// java.io.File takes the parent directory first: new File(File parent, String child).
// mkdirs() also creates targetDir itself when it does not exist yet.
def batchDir = new File(targetDir, batch)
batchDir.mkdirs()

println 'Writing data files...'
// Opens a CSV writer on a file inside the batch folder using the script encoding.
def openCsv = { String fileName ->
    new CSVWriter(new OutputStreamWriter(new FileOutputStream(new File(batchDir, fileName)), ENCODING))
}
assetsCsv = openCsv('asset-metadata.csv')
assetsCsv.writeNext(['assetPath','dc:title{{String}}','dc:description{{String}}'] as String[])
fileMappings = openCsv('file-mappings.csv')
fileMappings.writeNext(['Status','Source','Target'] as String[])
replacements = openCsv('replacements.csv')
replacements.writeNext(['Status','Source','Target'] as String[])
pageCsv = openCsv('page-mappings.csv')
pageCsv.writeNext(['Status','Source Path','New Url','New Path','Template','Legacy Url','Redirects','Subnav Root?','Page Title','Page Description','Batch'] as String[])

filterXml = new File(batchDir, 'filter.xml')
updateFilterXml()
replacementCfgFile = new File(batchDir, 'replacement-config.json')
updateReplacementCfgFile()

println "Parsing ${args[0]}..."
// Non-validating, namespace-aware parse of the export file.
def inXml = new XmlSlurper(false,true).parseText(new File(args[0]).getText(ENCODING))
/**
 * Writes the workspace filter definition to {@code filterXml}.
 *
 * The filter covers the content root (excluding its own jcr:content node)
 * and the assets root, both taken from the batch settings.
 */
void updateFilterXml() {
    // Single-quoted text block: the ${{...}} markers are literal placeholders,
    // not Groovy interpolation, and are substituted below.
    def filterCfg = '''<?xml version="1.0" encoding="UTF-8"?>
<workspaceFilter version="1.0">
<filter root="${{contentRootPath}}">
<exclude pattern="${{contentRootPath}}/jcr:content" />
</filter>
<filter root="${{contentDamRoot}}"/>
</workspaceFilter>
'''
    filterCfg = filterCfg.replace('${{contentRootPath}}', contentRoot)
    filterCfg = filterCfg.replace('${{contentDamRoot}}', assetsRoot)
    // Write with an explicit charset so the bytes match the encoding named in
    // the XML declaration rather than the platform default.
    filterXml.write(filterCfg, ENCODING)
}
/**
 * Writes the replacement configuration JSON to {@code replacementCfgFile}.
 *
 * The configuration declares which CSV columns act as the source and target
 * keys for replacements.csv and page-mappings.csv.
 */
void updateReplacementCfgFile() {
    // Static content; the \t escapes become literal tab characters in the file.
    replacementCfgFile.text = '''{
\t"replacements.csv": [{
\t\t"mode": "mapping",
\t\t"sourceKey": "Source",
\t\t"targetKey": "Target"
\t}],
\t"page-mappings.csv": [
\t\t{
\t\t\t"mode": "mapping",
\t\t\t"sourceKey": "Legacy Url",
\t\t\t"targetKey": "New Url"
\t\t},
\t\t{
\t\t\t"mode": "mapping",
\t\t\t"sourceKey": "Source Path",
\t\t\t"targetKey": "New Url"
\t\t}
\t]
}
'''
}
/**
 * Downloads a URL into the work/source folder below {@code targetDir}.
 *
 * Nothing is written when {@code fetch} returns null (non-200 response).
 *
 * @param url       the address to download
 * @param localPath the path, relative to work/source, to store the content at
 */
void downloadFile(String url, String localPath) {
    // java.io.File takes the parent directory first: new File(File parent, String child).
    def file = new File(targetDir, "work/source/${localPath}")
    println "Downloading ${url} to ${file}"
    println "Creating parent folder..."
    file.getParentFile().mkdirs()

    def source = fetch(url)
    if (source == null) {
        // fetch has already logged the response code; avoid creating an empty file.
        println "Skipping ${url} as it could not be retrieved"
        return
    }

    println "Writing to file: ${file}"
    // withStream / withOutputStream close both ends even when the copy fails.
    // Copy into the stream that was opened rather than appending through a
    // second handle on the same file.
    source.withStream { input ->
        file.withOutputStream { output ->
            output << input
        }
    }
}
/**
 * Opens a connection to the URL and returns its response body stream.
 *
 * @param url the address to request
 * @return the response stream when the response code is 200, otherwise null
 */
InputStream fetch(String url){
    def connection = new URL(url).openConnection()
    // The request identifies itself with a curl user agent string.
    connection.setRequestProperty('User-Agent', 'curl/7.35.0')

    def status = connection.responseCode
    if (status != 200) {
        println "Retrieved invalid response code ${status} from ${url}"
        return null
    }
    return connection.inputStream
}
/**
 * Processes one attachment item from the export.
 *
 * Optionally downloads the attachment, then records it in
 * asset-metadata.csv, file-mappings.csv and replacements.csv.
 *
 * @param item the parsed item element for the attachment
 */
void handleAttachment(Object item){
    println "Handling attachment ${item.post_name}"

    def attachmentUrl = item.attachment_url.text()
    // Source path is the attachment URL with the configured domain removed;
    // the target path drops the uploads folder and is placed under the assets root.
    def oldPath = attachmentUrl.replace(domain,'')
    def newPath = "${assetsRoot}${oldPath.replace('wp-content/uploads/','')}"

    if (download) {
        downloadFile(attachmentUrl, oldPath)
    }

    // Title has line breaks removed; description is the trimmed encoded text.
    def metadataRow = [
        newPath,
        item.title.text().replaceAll("[\n\r]", "").trim(),
        item.encoded.text().trim()
    ]
    println "Adding entry ${metadataRow} to asset-metadata.csv for ${oldPath}"
    assetsCsv.writeNext(metadataRow as String[])
    assetsCsv.flush()

    println "Adding entry to file-mappings.csv for ${oldPath}"
    fileMappings.writeNext(['Migrate',oldPath,newPath] as String[])
    fileMappings.flush()

    println "Adding entry to replacements.csv for ${item.post_id}"
    // Two replacement rows per attachment: one keyed by the wp-image-<id>
    // token and one keyed by the full attachment URL.
    [ 'wp-image-'+item.post_id, attachmentUrl ].each { sourceKey ->
        replacements.writeNext(['Migrate',sourceKey,newPath] as String[])
    }
    replacements.flush()
}
/**
 * Processes one post or page item from the export.
 *
 * Drafts are skipped. Otherwise the item is serialized to
 * work/source/&lt;post_name&gt;.xml below {@code targetDir} and a row is added to
 * page-mappings.csv.
 *
 * @param item the parsed item element for the post or page
 */
void handlePost(Object item) {
    println "Handling post ${item.post_name}"
    if (item.status.text() == 'draft') {
        println "Skipping ${item.post_name} as it is in draft..."
        return
    }

    // java.io.File takes the parent directory first: new File(File parent, String child).
    def itemFile = new File(targetDir, "work/source/${item.post_name}.xml")
    itemFile.getParentFile().mkdirs()
    println "Saving item to: ${itemFile}..."
    // withWriter closes the file when done and writes in the script encoding
    // instead of the platform default.
    itemFile.withWriter(ENCODING) { writer ->
        XmlUtil.serialize(item, writer)
    }

    println "Adding line to page mappings for: ${item.post_name}.xml"
    // HH = hour of day (0-23); hh would read the value as a 12-hour clock hour.
    def date = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(item.post_date.text())
    def path = "${contentRoot}/${new SimpleDateFormat(dateFormat).format(date)}${item.post_name}"
    println "Mapping to new path: ${path}..."

    pageCsv.writeNext([
        'Migrate',
        "${item.post_name}.xml",
        "${path}.html",
        path,
        template,
        item.link.text().replace(domain,''),
        '',
        'No',
        item.title.text().replaceAll("[\n\r]", "").trim(),
        // Second "encoded" element of the item is used as the page description.
        item.encoded[1].text().replaceAll("[\n\r]", "").trim(),
        batch
    ] as String[])
    pageCsv.flush()
}
// Walk every item in the export, dispatch on its post type and keep running
// totals for the closing summary.
def attachments = 0
def posts = 0
def other = 0
try {
    inXml.channel.item.each { item ->
        def postType = item.post_type.text()
        println "Handling item of type ${postType}..."
        if ('attachment' == postType) {
            handleAttachment(item)
            attachments++
        } else if ('post' == postType || 'page' == postType) {
            handlePost(item)
            posts++
        } else {
            println "Unable to handle ${postType}!!"
            other++
        }
    }
} finally {
    // Close the CSV writers so buffered rows (including header rows written
    // without a later flush) reach disk and the file handles are released.
    [assetsCsv, fileMappings, replacements, pageCsv].each { it?.close() }
}
if (other > 0) {
    println "${other} unknown files ignored!"
}
println "${posts} posts and ${attachments} attachments downloaded in ${TimeCategory.minus(new Date(), start)}"