This repository was archived by the owner on Feb 27, 2023. It is now read-only.
File tree Expand file tree Collapse file tree 5 files changed +91
-1
lines changed
main/java/com/marklogic/spring/batch/item/file
java/com/marklogic/spring/batch/item/file/support Expand file tree Collapse file tree 5 files changed +91
-1
lines changed Original file line number Diff line number Diff line change @@ -2,7 +2,9 @@ apply plugin: "com.jfrog.bintray"
22
33dependencies {
44 compile " org.springframework.batch:spring-batch-core:$springBatchVersion "
5- testCompile " junit:junit:4.12"
5+ compile " com.marklogic:marklogic-client-api:4.0.3"
6+ compile " org.apache.tika:tika-parsers:1.17"
7+ compile project(" :infrastructure" )
68}
79
810ext {
Original file line number Diff line number Diff line change 1+ package com .marklogic .spring .batch .item .file ;
2+
3+ import com .marklogic .client .io .InputSourceHandle ;
4+ import com .marklogic .client .io .marker .AbstractWriteHandle ;
5+ import com .marklogic .spring .batch .item .file .support .TikaParser ;
6+ import com .marklogic .spring .batch .item .processor .AbstractMarkLogicItemProcessor ;
7+ import com .marklogic .spring .batch .item .processor .support .UriGenerator ;
8+ import org .springframework .core .io .Resource ;
9+ import org .xml .sax .InputSource ;
10+
11+ import java .io .StringReader ;
12+
13+ public class TikaParserItemProcessor extends AbstractMarkLogicItemProcessor <Resource > {
14+
15+ public TikaParserItemProcessor () {
16+ super ();
17+ }
18+
19+ public TikaParserItemProcessor (UriGenerator uriGenerator ) {
20+ super (uriGenerator );
21+ }
22+
23+ @ Override
24+ public AbstractWriteHandle getContentHandle (Resource item ) throws Exception {
25+ String parsedContent = TikaParser .parseToXML (item .getInputStream ());
26+ InputSource inputSource = new InputSource (new StringReader (parsedContent .toString ()));
27+ InputSourceHandle handle = new InputSourceHandle (inputSource );
28+ return handle ;
29+ }
30+
31+
32+ }
Original file line number Diff line number Diff line change 1+ package com .marklogic .spring .batch .item .file .support ;
2+
3+ import org .apache .tika .exception .TikaException ;
4+ import org .apache .tika .metadata .Metadata ;
5+ import org .apache .tika .parser .AutoDetectParser ;
6+ import org .apache .tika .sax .ToXMLContentHandler ;
7+ import org .xml .sax .ContentHandler ;
8+ import org .xml .sax .SAXException ;
9+
10+ import java .io .IOException ;
11+ import java .io .InputStream ;
12+
13+ public class TikaParser {
14+
15+ public static String parseToXML (InputStream inputStream ) throws IOException , SAXException , TikaException {
16+ ContentHandler handler = new ToXMLContentHandler ();
17+ AutoDetectParser parser = new AutoDetectParser ();
18+ Metadata metadata = new Metadata ();
19+ parser .parse (inputStream , handler , metadata );
20+ return handler .toString ();
21+ }
22+ }
Original file line number Diff line number Diff line change 1+ package com .marklogic .spring .batch .item .file .support ;
2+
3+ import org .junit .Before ;
4+ import org .junit .Test ;
5+ import org .springframework .core .io .ClassPathResource ;
6+ import org .springframework .core .io .Resource ;
7+
8+ import static org .hamcrest .MatcherAssert .assertThat ;
9+ import static org .hamcrest .CoreMatchers .*;
10+
11+ public class TikaParserTest {
12+
13+ private Resource wordDocument ;
14+ private String parsedXml ;
15+
16+ @ Test
17+ public void parseWordDocumentTest () throws Exception {
18+ givenWordDocument ("word/test-1.docx" );
19+ whenDocumentIsParsed ();
20+ thenContainsText ();
21+ }
22+
23+ public void givenWordDocument (String path ) {
24+ wordDocument = new ClassPathResource (path );
25+ }
26+
27+ public void whenDocumentIsParsed () throws Exception {
28+ parsedXml = TikaParser .parseToXML (wordDocument .getInputStream ());
29+ }
30+
31+ public void thenContainsText () {
32+ assertThat (parsedXml , containsString ("The quick brown fox" ));
33+ }
34+ }
You can’t perform that action at this time.
0 commit comments