Skip to content
This repository was archived by the owner on Feb 27, 2023. It is now read-only.

Commit 3f7472b

Browse files
authored
Merge pull request #155 from sastafford/feature/99-naturalLanguageProcessing
Feature/99 natural language processing
2 parents c7bc5cc + c931b36 commit 3f7472b

22 files changed

+548
-1
lines changed

build.gradle

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,4 +107,4 @@ task testExamples(type: GradleBuild) {
107107
tasks = [ 'test' ]
108108
}
109109

110-
task testAll(dependsOn:[':core:test', 'jobs:test', 'testExamples'] )
110+
task testAll(dependsOn:[':core:test', 'examples:test', 'testExamples'] )
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
plugins {
2+
id "com.marklogic.ml-gradle" version "2.3.3"
3+
id "java"
4+
id "application"
5+
}
6+
7+
repositories {
8+
jcenter()
9+
mavenLocal()
10+
}
11+
12+
dependencies {
13+
compile "com.marklogic:marklogic-spring-batch-core:0.6.0"
14+
compile "com.marklogic:ml-javaclient-util:2.9.1"
15+
compile "org.apache.opennlp:opennlp-tools:1.6.0"
16+
17+
testCompile "com.marklogic:marklogic-spring-batch-test:0.6.0"
18+
19+
}
20+
21+
distributions {
22+
main {
23+
baseName = 'baseJob'
24+
}
25+
}
26+
27+
mainClassName = "com.marklogic.spring.batch.Main"
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
artifactId=entityEnrichment
2+
3+
mlHost=oscar
4+
mlAppName=marklogic-spring-batch-test
5+
mlJobRepositoryName=marklogic-spring-batch-test
6+
mlRestPort=8200
7+
mlUsername=admin
8+
mlPassword=admin
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
rootProject.name="entityEnrichment"
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
package example;
2+
3+
import com.marklogic.client.DatabaseClient;
4+
import com.marklogic.client.document.ServerTransform;
5+
import com.marklogic.client.document.XMLDocumentManager;
6+
import com.marklogic.client.io.StringHandle;
7+
import com.marklogic.client.query.CountedDistinctValue;
8+
import opennlp.tools.namefind.NameFinderME;
9+
import opennlp.tools.namefind.TokenNameFinderModel;
10+
import opennlp.tools.tokenize.Tokenizer;
11+
import opennlp.tools.tokenize.TokenizerME;
12+
import opennlp.tools.tokenize.TokenizerModel;
13+
import opennlp.tools.util.Span;
14+
import org.slf4j.Logger;
15+
import org.slf4j.LoggerFactory;
16+
import org.springframework.batch.item.ItemProcessor;
17+
import org.w3c.dom.Document;
18+
import org.w3c.dom.Element;
19+
20+
import javax.xml.parsers.DocumentBuilder;
21+
import javax.xml.parsers.DocumentBuilderFactory;
22+
import javax.xml.transform.OutputKeys;
23+
import javax.xml.transform.Transformer;
24+
import javax.xml.transform.TransformerFactory;
25+
import javax.xml.transform.dom.DOMSource;
26+
import javax.xml.transform.stream.StreamResult;
27+
import java.io.FileInputStream;
28+
import java.io.InputStream;
29+
import java.io.StringWriter;
30+
31+
public class EntityEnrichmentItemProcessor implements ItemProcessor<CountedDistinctValue, String[]> {
32+
33+
protected final Logger logger = LoggerFactory.getLogger(getClass());
34+
35+
private DatabaseClient databaseClient;
36+
private DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
37+
private final String TOKENIZER_MODEL_FILE_PATH;
38+
private final String NAMED_ENTITY_FILE_PATH;
39+
40+
public EntityEnrichmentItemProcessor(
41+
DatabaseClient client,
42+
String tokenizerModelFilePath,
43+
String namedEntityModelFilePath) {
44+
this.databaseClient = client;
45+
this.TOKENIZER_MODEL_FILE_PATH = tokenizerModelFilePath;
46+
this.NAMED_ENTITY_FILE_PATH = namedEntityModelFilePath;
47+
}
48+
49+
//Assumes that the item being passed in is a document uri
50+
@Override
51+
public String[] process(CountedDistinctValue item) throws Exception {
52+
XMLDocumentManager docMgr = databaseClient.newXMLDocumentManager();
53+
String uri = item.get("xs:string", String.class);
54+
StringHandle handle = docMgr.read(uri, new StringHandle());
55+
InputStream tokenModel = new FileInputStream(TOKENIZER_MODEL_FILE_PATH);
56+
TokenizerModel model = new TokenizerModel(tokenModel);
57+
Tokenizer tokenizer = new TokenizerME(model);
58+
String[] tokens = tokenizer.tokenize(handle.get());
59+
60+
InputStream namedEntityModel = new FileInputStream(NAMED_ENTITY_FILE_PATH);
61+
TokenNameFinderModel nameFinderModel = new TokenNameFinderModel(namedEntityModel);
62+
NameFinderME nameFinder = new NameFinderME(nameFinderModel);
63+
64+
Span[] spans = nameFinder.find(tokens);
65+
DocumentBuilder builder = dbf.newDocumentBuilder();
66+
Document doc = builder.newDocument();
67+
68+
Element root = doc.createElement("nameFinder");
69+
doc.appendChild(root);
70+
71+
StringBuilder stringBuilder = new StringBuilder("");
72+
for ( Span s : spans ) {
73+
logger.debug("Token Start: " + Integer.toString(s.getStart()));
74+
logger.debug("Token End: " + Integer.toString(s.getEnd()));
75+
String name = "";
76+
for (int i = s.getStart(); i < s.getEnd(); i++) {
77+
name = name + tokens[i] + " ";
78+
}
79+
name = name.substring(0, name.length()-1);
80+
logger.info(name);
81+
Element elName = doc.createElement("name");
82+
elName.setTextContent(name);
83+
root.appendChild(elName);
84+
}
85+
86+
TransformerFactory tf = TransformerFactory.newInstance();
87+
Transformer transformer = tf.newTransformer();
88+
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
89+
StringWriter writer = new StringWriter();
90+
transformer.transform(new DOMSource(doc), new StreamResult(writer));
91+
String output = writer.getBuffer().toString().replaceAll("\n|\r", "");
92+
93+
94+
String[] info = new String[2];
95+
info[0] = uri;
96+
info[1] = output;
97+
return info;
98+
}
99+
100+
}
101+
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
package example;
2+
3+
import com.marklogic.client.query.CountedDistinctValue;
4+
import com.marklogic.spring.batch.config.AbstractMarkLogicBatchConfig;
5+
import org.springframework.batch.core.Job;
6+
import org.springframework.batch.core.Step;
7+
import org.springframework.batch.core.configuration.annotation.JobScope;
8+
import org.springframework.batch.item.ItemProcessor;
9+
import org.springframework.batch.item.ItemReader;
10+
import org.springframework.batch.item.ItemWriter;
11+
import org.springframework.beans.factory.annotation.Value;
12+
import org.springframework.context.EnvironmentAware;
13+
import org.springframework.context.annotation.Bean;
14+
import org.springframework.core.env.Environment;
15+
16+
public class EntityEnrichmentJobConfig extends AbstractMarkLogicBatchConfig implements EnvironmentAware {
17+
18+
private Environment env;
19+
20+
private final String JOB_NAME = "entityEnrichmentJob";
21+
22+
@Bean
23+
public Job job(Step step) {
24+
return jobBuilderFactory.get(JOB_NAME).start(step).build();
25+
}
26+
27+
@Bean
28+
@JobScope
29+
public Step step(@Value("#{jobParameters['tokenizer_model']}") String tokenizerModel,
30+
@Value("#{jobParameters['named_entity_model']}") String namedEntityModel) {
31+
32+
ItemReader<CountedDistinctValue> reader = new ValuesItemReader(getDatabaseClient());
33+
ItemProcessor<CountedDistinctValue, String[]> processor =
34+
new EntityEnrichmentItemProcessor(getDatabaseClient(), tokenizerModel, namedEntityModel);
35+
ItemWriter<String[]> writer = new MarkLogicPatchItemWriter(getDatabaseClient());
36+
37+
38+
return stepBuilderFactory.get("step1")
39+
.<CountedDistinctValue, String[]>chunk(getChunkSize())
40+
.reader(reader)
41+
.processor(processor)
42+
.writer(writer)
43+
.build();
44+
}
45+
46+
@Override
47+
public void setEnvironment(Environment environment) {
48+
this.env = environment;
49+
}
50+
}
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
package example;
2+
3+
import com.marklogic.client.DatabaseClient;
4+
import com.marklogic.client.document.DocumentPatchBuilder;
5+
import com.marklogic.client.document.XMLDocumentManager;
6+
import com.marklogic.client.io.marker.DocumentPatchHandle;
7+
import com.marklogic.client.util.EditableNamespaceContext;
8+
import org.slf4j.Logger;
9+
import org.slf4j.LoggerFactory;
10+
import org.springframework.batch.item.ItemWriter;
11+
12+
import java.util.List;
13+
14+
public class MarkLogicPatchItemWriter implements ItemWriter<String[]> {
15+
protected final Logger logger = LoggerFactory.getLogger(getClass());
16+
17+
private DatabaseClient databaseClient;
18+
19+
public MarkLogicPatchItemWriter(DatabaseClient client) {
20+
this.databaseClient = client;
21+
}
22+
23+
@Override
24+
public void write(List<? extends String[]> items) throws Exception {
25+
for (String[] item : items) {
26+
String uri = item[0];
27+
String xmlPatch = item[1];
28+
logger.info(uri);
29+
XMLDocumentManager docMgr = databaseClient.newXMLDocumentManager();
30+
EditableNamespaceContext namespaces = new EditableNamespaceContext();
31+
namespaces.put("html", "http://www.w3.org/1999/xhtml");
32+
33+
DocumentPatchBuilder xmlPatchBldr = docMgr.newPatchBuilder();
34+
xmlPatchBldr.setNamespaces(namespaces);
35+
36+
//note the root element is referenced in the first parameter of this call, you may need to change based on your document
37+
DocumentPatchHandle patchHandle = xmlPatchBldr.insertFragment("/doc", DocumentPatchBuilder.Position.LAST_CHILD, xmlPatch).build();
38+
docMgr.patch(uri, patchHandle);
39+
}
40+
41+
}
42+
}
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
package example;
2+
3+
import opennlp.tools.tokenize.Tokenizer;
4+
import opennlp.tools.tokenize.TokenizerME;
5+
import opennlp.tools.tokenize.TokenizerModel;
6+
import org.springframework.context.annotation.Bean;
7+
import org.springframework.context.annotation.Configuration;
8+
9+
import java.io.FileInputStream;
10+
import java.io.FileNotFoundException;
11+
import java.io.IOException;
12+
import java.io.InputStream;
13+
14+
/**
15+
* Created by sstafford on 8/16/2016.
16+
*/
17+
@Configuration
18+
public class NaturalLanguageProcessorConfig {
19+
20+
@Bean
21+
public Tokenizer getTokenizer() throws FileNotFoundException, IOException {
22+
InputStream modelIn = new FileInputStream("src/main/resources/nlp/tokenizer/en-token.bin");
23+
TokenizerModel model = new TokenizerModel(modelIn);
24+
return new TokenizerME(model);
25+
}
26+
27+
}
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
package example;
2+
3+
import com.marklogic.client.DatabaseClient;
4+
import com.marklogic.client.admin.QueryOptionsManager;
5+
import com.marklogic.client.helper.LoggingObject;
6+
import com.marklogic.client.io.StringHandle;
7+
import com.marklogic.client.io.ValuesHandle;
8+
import com.marklogic.client.query.*;
9+
import org.springframework.batch.item.*;
10+
11+
import java.util.ArrayList;
12+
import java.util.Arrays;
13+
import java.util.List;
14+
import java.util.ListIterator;
15+
16+
public class ValuesItemReader extends LoggingObject implements ItemReader<CountedDistinctValue>, ItemStream {
17+
18+
private DatabaseClient databaseClient;
19+
private List<CountedDistinctValue> values;
20+
private QueryManager queryMgr;
21+
private ListIterator<CountedDistinctValue> itr;
22+
private int start;
23+
private String uriQuery;
24+
25+
public int getLength() {
26+
return values.size();
27+
}
28+
29+
public ValuesItemReader(DatabaseClient client) {
30+
this.databaseClient = client;
31+
start = 1;
32+
String uriQueryOptions =
33+
"<options xmlns=\"http://marklogic.com/appservices/search\">\n" +
34+
" <search-option>unfiltered</search-option>\n" +
35+
" <quality-weight>0</quality-weight>\n" +
36+
" <values name=\"uris\">\n" +
37+
" <uri/>\n" +
38+
" </values>\n" +
39+
"</options>";
40+
QueryOptionsManager qoManager=
41+
databaseClient.newServerConfigManager().newQueryOptionsManager();
42+
qoManager.writeOptions("uris", new StringHandle(uriQueryOptions));
43+
}
44+
45+
@Override
46+
public CountedDistinctValue read() throws Exception, UnexpectedInputException, ParseException, NonTransientResourceException {
47+
return itr.hasNext() ? itr.next() : null;
48+
}
49+
50+
@Override
51+
public void open(ExecutionContext executionContext) throws ItemStreamException {
52+
queryMgr = databaseClient.newQueryManager();
53+
54+
ValuesDefinition vdef = queryMgr.newValuesDefinition("uris", "uris");
55+
StructuredQueryBuilder qb = new StructuredQueryBuilder();
56+
ValueQueryDefinition qDef = qb.collection("sourceXML");
57+
vdef.setQueryDefinition(qDef);
58+
59+
ValuesHandle results = queryMgr.values(vdef, new ValuesHandle(), start);
60+
values = new ArrayList<CountedDistinctValue>(Arrays.asList(results.getValues()));
61+
itr = values.listIterator();
62+
return;
63+
}
64+
65+
@Override
66+
public void update(ExecutionContext executionContext) throws ItemStreamException {
67+
68+
}
69+
70+
@Override
71+
public void close() throws ItemStreamException {
72+
73+
}
74+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
*.bin filter=lfs diff=lfs merge=lfs -text

0 commit comments

Comments
 (0)