Skip to content

Commit

Permalink
2.3.1
Browse files Browse the repository at this point in the history
Change the Tika fork parser overload that takes a filename input: instead of streaming the file's content over the
socket, send only the filename so that the Tika fork main process can open the file with a FileInputStream directly.
This cuts out some unnecessary socket communication.
  • Loading branch information
nddipiazza committed Jun 13, 2020
1 parent 952163a commit fa9ab78
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 18 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
2.3.1

Change the Tika fork parser overload that takes a filename input: instead of streaming the file's content over the
socket, send only the filename so that the Tika fork main process can open the file with a FileInputStream directly.
This cuts out some unnecessary socket communication.

2.3

bump to tika 1.24.1 instead of using the custom build
Expand Down
2 changes: 1 addition & 1 deletion gradle.properties
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# version
version=2.3
version=2.3.1

# depsre
tikaVersion=1.24.1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.ByteArrayInputStream;
import java.io.EOFException;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
Expand Down Expand Up @@ -54,14 +54,14 @@ public Metadata parse(String baseUri,
OutputStream contentOutputStream,
long abortAfterMs,
long maxBytesToParse) throws InterruptedException, ExecutionException, TimeoutException, IOException {
try (FileInputStream fis = new FileInputStream(filename)) {
return parse(baseUri,
contentType,
fis,
contentOutputStream,
abortAfterMs,
maxBytesToParse);
}
ByteArrayInputStream bais = new ByteArrayInputStream(filename.getBytes());
return parseImpl(baseUri,
contentType,
bais,
contentOutputStream,
abortAfterMs,
maxBytesToParse,
true);
}

public Metadata parse(String baseUri,
Expand All @@ -70,11 +70,27 @@ public Metadata parse(String baseUri,
OutputStream contentOutputStream,
long abortAfterMs,
long maxBytesToParse) throws InterruptedException, ExecutionException, TimeoutException {
return parseImpl(baseUri,
contentType,
contentInStream,
contentOutputStream,
abortAfterMs,
maxBytesToParse,
false);
}

private Metadata parseImpl(String baseUri,
String contentType,
InputStream contentInStream,
OutputStream contentOutputStream,
long abortAfterMs,
long maxBytesToParse,
boolean inputIsFilename) throws InterruptedException, ExecutionException, TimeoutException {
ExecutorService es = Executors.newFixedThreadPool(3, new TikaRunnerThreadFactory());
try {
es.submit(() -> {
try {
writeContent(baseUri, contentType, contentInPort, contentInStream);
writeContent(baseUri, contentType, contentInPort, contentInStream, inputIsFilename);
} catch (Exception e) {
throw new RuntimeException("Failed to send content stream to forked Tika parser JVM", e);
}
Expand Down Expand Up @@ -140,13 +156,16 @@ public Metadata parse(String baseUri,
private void writeContent(String baseUri,
String contentType,
int port,
InputStream contentInStream) throws Exception {
InputStream contentInStream,
boolean inputIsFilename) throws Exception {
Socket socket = getSocket(InetAddress.getLocalHost().getHostAddress(), port);
try (OutputStream out = socket.getOutputStream()) {
out.write(baseUri.getBytes());
out.write('\n');
out.write(contentType.getBytes());
out.write('\n');
out.write(String.valueOf(inputIsFilename).getBytes());
out.write('\n');
long numChars;
do {
numChars = IOUtils.copy(contentInStream, out);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
Expand Down Expand Up @@ -325,31 +327,46 @@ private void parseFile(OutputStream metadataOutputStream, OutputStream contentOu

String baseUri = "";
String contentType = "";
String inputIsFilenameStr = "";
int nextChar;
while ((nextChar = inputStream.read()) != '\n') {
baseUri += (char)nextChar;
}
while ((nextChar = inputStream.read()) != '\n') {
contentType += (char)nextChar;
}

LOG.info("Next file to parse baseUri={}, contentType={}", baseUri, contentType);
while ((nextChar = inputStream.read()) != '\n') {
inputIsFilenameStr += (char)nextChar;
}

if (StringUtils.isNotBlank(contentType)) {
metadata.set(Metadata.CONTENT_TYPE, contentType);
}

TikaInputStream tikaInputStream = TikaInputStream.get(inputStream);
if (Boolean.parseBoolean(inputIsFilenameStr)) {
String tmpFilePath = IOUtils.toString(inputStream);
try (FileInputStream fis = new FileInputStream(tmpFilePath)) {
LOG.info("Next file to parse baseUri={}, contentType={}, tmpFilePath={}", baseUri, contentType, tmpFilePath);
doTikaParse(contentOutputStream, context, metadata, compositeParser, objectOutputStream, baseUri, TikaInputStream.get(fis));
}
} else {
LOG.info("Next file to parse baseUri={}, contentType={}", baseUri, contentType);

TikaParsingHandler contentHandler = getContentHandler(baseUri, contentOutputStream, extractHtmlLinks);
compositeParser.parse(tikaInputStream, contentHandler, metadata, context);
doTikaParse(contentOutputStream, context, metadata, compositeParser, objectOutputStream, baseUri, TikaInputStream.get(inputStream));
}

objectOutputStream.writeObject(metadata);
} finally {
contentOutputStream.close();
}
}

/**
 * Parses a single input with the supplied composite parser and then serializes the metadata.
 *
 * <p>A content handler is obtained from {@code getContentHandler} using the base URI, the content
 * output stream and this instance's {@code extractHtmlLinks} setting. After the parse call returns,
 * the {@code metadata} object is written to {@code objectOutputStream}.
 *
 * <p>NOTE(review): the calling code visible in this change also appears to invoke
 * {@code objectOutputStream.writeObject(metadata)} after this method returns — confirm the metadata
 * is not being serialized twice.
 *
 * @param contentOutputStream stream handed to the content handler
 * @param context             parse context passed through to the parser
 * @param metadata            metadata passed to the parser and written out afterwards
 * @param compositeParser     parser that performs the actual parse
 * @param objectOutputStream  stream the metadata object is written to once parsing completes
 * @param baseUri             base URI handed to the content handler
 * @param tikaInputStream2    input to parse
 * @throws TikaException declared by this method; presumably propagated from the parse call
 * @throws IOException   declared by this method; may come from parsing or from writing the metadata
 * @throws SAXException  declared by this method; presumably propagated from the parse call
 */
private void doTikaParse(OutputStream contentOutputStream,
                         ParseContext context,
                         Metadata metadata,
                         CompositeParser compositeParser,
                         ObjectOutputStream objectOutputStream,
                         String baseUri,
                         TikaInputStream tikaInputStream2) throws TikaException, IOException, SAXException {
    final TikaParsingHandler handler = getContentHandler(baseUri, contentOutputStream, extractHtmlLinks);
    compositeParser.parse(tikaInputStream2, handler, metadata, context);
    objectOutputStream.writeObject(metadata);
}

/**
* Runs the external tika parsing server.
*/
Expand Down
Binary file added tika-fork-main/test-files/encrypted.ppt
Binary file not shown.

0 comments on commit fa9ab78

Please sign in to comment.