Skip to content
This repository was archived by the owner on Feb 27, 2023. It is now read-only.

Commit c7e5812

Browse files
author
Scott Stafford
committed
Merge branch '279-parseContent' into dev
2 parents 47cff29 + 057c016 commit c7e5812

File tree

5 files changed

+91
-1
lines changed

5 files changed

+91
-1
lines changed

file/build.gradle

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@ apply plugin: "com.jfrog.bintray"
22

33
dependencies {
44
compile "org.springframework.batch:spring-batch-core:$springBatchVersion"
5-
testCompile "junit:junit:4.12"
5+
compile "com.marklogic:marklogic-client-api:4.0.3"
6+
compile "org.apache.tika:tika-parsers:1.17"
7+
compile project(":infrastructure")
68
}
79

810
ext {
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
package com.marklogic.spring.batch.item.file;
2+
3+
import com.marklogic.client.io.InputSourceHandle;
4+
import com.marklogic.client.io.marker.AbstractWriteHandle;
5+
import com.marklogic.spring.batch.item.file.support.TikaParser;
6+
import com.marklogic.spring.batch.item.processor.AbstractMarkLogicItemProcessor;
7+
import com.marklogic.spring.batch.item.processor.support.UriGenerator;
8+
import org.springframework.core.io.Resource;
9+
import org.xml.sax.InputSource;
10+
11+
import java.io.StringReader;
12+
13+
public class TikaParserItemProcessor extends AbstractMarkLogicItemProcessor<Resource> {
14+
15+
public TikaParserItemProcessor() {
16+
super();
17+
}
18+
19+
public TikaParserItemProcessor(UriGenerator uriGenerator) {
20+
super(uriGenerator);
21+
}
22+
23+
@Override
24+
public AbstractWriteHandle getContentHandle(Resource item) throws Exception {
25+
String parsedContent = TikaParser.parseToXML(item.getInputStream());
26+
InputSource inputSource = new InputSource(new StringReader(parsedContent.toString()));
27+
InputSourceHandle handle = new InputSourceHandle(inputSource);
28+
return handle;
29+
}
30+
31+
32+
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
package com.marklogic.spring.batch.item.file.support;
2+
3+
import org.apache.tika.exception.TikaException;
4+
import org.apache.tika.metadata.Metadata;
5+
import org.apache.tika.parser.AutoDetectParser;
6+
import org.apache.tika.sax.ToXMLContentHandler;
7+
import org.xml.sax.ContentHandler;
8+
import org.xml.sax.SAXException;
9+
10+
import java.io.IOException;
11+
import java.io.InputStream;
12+
13+
public class TikaParser {
14+
15+
public static String parseToXML(InputStream inputStream) throws IOException, SAXException, TikaException {
16+
ContentHandler handler = new ToXMLContentHandler();
17+
AutoDetectParser parser = new AutoDetectParser();
18+
Metadata metadata = new Metadata();
19+
parser.parse(inputStream, handler, metadata);
20+
return handler.toString();
21+
}
22+
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
package com.marklogic.spring.batch.item.file.support;
2+
3+
import org.junit.Before;
4+
import org.junit.Test;
5+
import org.springframework.core.io.ClassPathResource;
6+
import org.springframework.core.io.Resource;
7+
8+
import static org.hamcrest.MatcherAssert.assertThat;
9+
import static org.hamcrest.CoreMatchers.*;
10+
11+
public class TikaParserTest {
12+
13+
private Resource wordDocument;
14+
private String parsedXml;
15+
16+
@Test
17+
public void parseWordDocumentTest() throws Exception {
18+
givenWordDocument("word/test-1.docx");
19+
whenDocumentIsParsed();
20+
thenContainsText();
21+
}
22+
23+
public void givenWordDocument(String path) {
24+
wordDocument = new ClassPathResource(path);
25+
}
26+
27+
public void whenDocumentIsParsed() throws Exception {
28+
parsedXml = TikaParser.parseToXML(wordDocument.getInputStream());
29+
}
30+
31+
public void thenContainsText() {
32+
assertThat(parsedXml, containsString("The quick brown fox"));
33+
}
34+
}
11.1 KB
Binary file not shown.

0 commit comments

Comments
 (0)