Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions extractous-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,19 @@ readme = "README.md"
keywords = ["unstructured", "tika", "text", "pdf", "parser"]
categories = ["parsing", "text-processing"]

[features]
stream-attachguard = []

[[bench]]
name = "extractor"
harness = false
required-features = []

[dependencies]
libc = { version = "0.2.158" }
jni = { version = "0.21.1",features = ["invocation"] }
jni = { version = "0.21.1", features = ["invocation"] }
thiserror = { version = "1.0.63" }
bytemuck = { version = "1.17.1"}
bytemuck = { version = "1.17.1" }
# String enums
strum = { version = "0.26.2" }
strum_macros = { version = "0.26.2" }
Expand Down
2 changes: 1 addition & 1 deletion extractous-core/benches/extractor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ fn extract_to_stream(c: &mut Criterion) {
c.bench_function("extract_to_stream", |b| {
b.iter(|| {
// Extract the provided file content to a stream
let stream = extractor.extract_file(file_path).unwrap();
let (stream, _) = extractor.extract_file(file_path).unwrap();
// Because stream implements std::io::Read trait we can perform buffered reading
// For example we can use it to create a BufReader
let mut reader = BufReader::new(stream);
Expand Down
1 change: 0 additions & 1 deletion extractous-core/src/extractor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,6 @@ impl Extractor {
self.xml_output,
)
}

}

#[cfg(test)]
Expand Down
8 changes: 4 additions & 4 deletions extractous-core/src/tika/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@ pub(crate) fn vm() -> &'static JavaVM {
GRAAL_VM.get_or_init(create_vm_isolate)
}

fn get_vm_attach_current_thread<'local>() -> ExtractResult<AttachGuard<'local>> {
fn get_vm_attach_current_thread() -> ExtractResult<AttachGuard<'static>> {
// Attaching a thread that is already attached is a no-op. Good to have this in case this method
// is called from another thread
let env = vm().attach_current_thread()?;
Ok(env)
}

fn parse_to_stream(
mut env: AttachGuard,
mut env: AttachGuard<'static>,
data_source_val: JValue,
char_set: &CharSet,
pdf_conf: &PdfParserConfig,
Expand Down Expand Up @@ -60,7 +60,7 @@ fn parse_to_stream(

// Create and process the JReaderResult
let result = JReaderResult::new(&mut env, call_result_obj)?;
let j_reader = JReaderInputStream::new(&mut env, result.java_reader)?;
let j_reader = JReaderInputStream::new(env, result.java_reader)?;

Ok((StreamReader { inner: j_reader }, result.metadata))
}
Expand All @@ -71,7 +71,7 @@ pub fn parse_file(
pdf_conf: &PdfParserConfig,
office_conf: &OfficeParserConfig,
ocr_conf: &TesseractOcrConfig,
as_xml: bool
as_xml: bool,
) -> ExtractResult<(StreamReader, Metadata)> {
let mut env = get_vm_attach_current_thread()?;

Expand Down
18 changes: 9 additions & 9 deletions extractous-core/src/tika/wrappers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,31 +8,31 @@ use crate::{Metadata, OfficeParserConfig, PdfParserConfig, TesseractOcrConfig, D
use bytemuck::cast_slice_mut;
use jni::objects::{GlobalRef, JByteArray, JObject, JValue};
use jni::sys::jsize;
use jni::JNIEnv;
use jni::{AttachGuard, JNIEnv};

/// Wrapper for [`JObject`]s that contain `org.apache.commons.io.input.ReaderInputStream`.
///
/// It saves a [`GlobalRef`] to the Java object, which is cleared when the last `GlobalRef` is
/// dropped.
/// Implements the [`Drop`] trait to properly close the
/// `org.apache.commons.io.input.ReaderInputStream`.
///
/// NOTE(review): `#[derive(Clone)]` requires every field to be `Clone`. Confirm that
/// `AttachGuard` satisfies this; otherwise enabling the `stream-attachguard` feature will
/// fail to compile.
#[derive(Clone)]
pub struct JReaderInputStream {
// Global reference to the wrapped Java stream object.
internal: GlobalRef,
// Global reference to a Java byte array created alongside the stream in `new`.
buffer: GlobalRef,
// Length (in elements) requested for `buffer` when it was created.
capacity: jsize,
// Only present with the `stream-attachguard` feature: stores the guard passed to `new`.
// Presumably this keeps the creating thread attached to the JVM for as long as the
// stream lives — TODO confirm against the `jni` crate's `AttachGuard` semantics.
#[cfg(feature = "stream-attachguard")]
_guard: AttachGuard<'static>,
}

impl JReaderInputStream {
pub(crate) fn new<'local>(
env: &mut JNIEnv<'local>,
obj: JObject<'local>,
) -> ExtractResult<Self> {
pub(crate) fn new(guard: AttachGuard<'static>, obj: JObject<'_>) -> ExtractResult<Self> {
// Creates new jbyte array
let capacity = DEFAULT_BUF_SIZE as jsize;
let jbyte_array = env.new_byte_array(capacity)?;
let jbyte_array = guard.new_byte_array(capacity)?;

Ok(Self {
internal: env.new_global_ref(obj)?,
buffer: env.new_global_ref(jbyte_array)?,
internal: guard.new_global_ref(obj)?,
buffer: guard.new_global_ref(jbyte_array)?,
capacity,
#[cfg(feature = "stream-attachguard")]
_guard: guard,
})
}

Expand Down