Skip to content

Commit fdbec9d

Browse files
committed
Fix memory leak in pdfOCR
DEVSIX-8683
1 parent d6aff8e commit fdbec9d

File tree

7 files changed

+83
-36
lines changed

pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/ImagePreprocessingUtil.java

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,9 @@ static BufferedImage readAsPixAndConvertToBufferedImage(
137137
final File inputImage)
138138
throws IOException {
139139
Pix pix = TesseractOcrUtil.readPixFromFile(inputImage);
140-
return TesseractOcrUtil.convertPixToImage(pix);
140+
BufferedImage img = TesseractOcrUtil.convertPixToImage(pix);
141+
TesseractOcrUtil.destroyPix(pix);
142+
return img;
141143
}
142144

143145
/**
@@ -155,7 +157,7 @@ static Pix preprocessImage(final File inputFile,
155157
final int pageNumber,
156158
final ImagePreprocessingOptions imagePreprocessingOptions)
157159
throws PdfOcrTesseract4Exception {
158-
Pix pix = null;
160+
Pix pix;
159161
// read image
160162
if (isTiffImage(inputFile)) {
161163
pix = TesseractOcrUtil.readPixPageFromTiff(inputFile,
@@ -168,7 +170,11 @@ static Pix preprocessImage(final File inputFile,
168170
PdfOcrTesseract4ExceptionMessageConstant.CANNOT_READ_PROVIDED_IMAGE)
169171
.setMessageParams(inputFile.getAbsolutePath());
170172
}
171-
return TesseractOcrUtil.preprocessPix(pix, imagePreprocessingOptions);
173+
Pix preprocessedPix = TesseractOcrUtil.preprocessPix(pix, imagePreprocessingOptions);
174+
if (!TesseractOcrUtil.samePix(pix, preprocessedPix)) {
175+
TesseractOcrUtil.destroyPix(pix);
176+
}
177+
return preprocessedPix;
172178
}
173179

174180
/**

pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/LeptonicaWrapper.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,14 @@ public static void lept_free(Pointer pointer) {
8989
Leptonica1.lept_free(pointer);
9090
}
9191

92+
public static void pixFreeData(Pix pix) {
93+
if (JDK_MAJOR_VERSION < LEPTONICA_NOT_SUPPORTED_JDK_VERSION) {
94+
Leptonica.INSTANCE.pixFreeData(pix);
95+
return;
96+
}
97+
Leptonica1.pixFreeData(pix);
98+
}
99+
92100
public static void pixWriteMem(PointerByReference pointer, NativeSizeByReference nativeSize, Pix pix, int i) {
93101
if (JDK_MAJOR_VERSION < LEPTONICA_NOT_SUPPORTED_JDK_VERSION) {
94102
Leptonica.INSTANCE.pixWriteMem(pointer, nativeSize, pix, i);

pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4ExecutableOcrEngine.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,7 @@ private String preprocessImage(final File inputImage,
427427
img);
428428
}
429429
}
430+
TesseractOcrUtil.destroyPix(pix);
430431
}
431432
if (!getTesseract4OcrEngineProperties().isPreprocessingImages()
432433
|| !Files.exists(Paths.get(tmpFileName))) {

pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/Tesseract4LibOcrEngine.java

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ This file is part of the iText (R) project.
4545
import com.itextpdf.pdfocr.tesseract4.exceptions.PdfOcrInputTesseract4Exception;
4646
import com.itextpdf.pdfocr.tesseract4.logs.Tesseract4LogMessageConstant;
4747

48+
import net.sourceforge.lept4j.Pix;
4849
import net.sourceforge.tess4j.ITesseract;
4950
import net.sourceforge.tess4j.TesseractException;
5051
import org.slf4j.LoggerFactory;
@@ -309,12 +310,14 @@ private String getOcrResultForSinglePage(final File inputImage,
309310
// preprocess if required
310311
if (getTesseract4OcrEngineProperties().isPreprocessingImages()) {
311312
// preprocess and try to ocr
313+
Pix pix = ImagePreprocessingUtil
314+
.preprocessImage(inputImage, pageNumber,
315+
getTesseract4OcrEngineProperties().getImagePreprocessingOptions());
312316
result = new TesseractOcrUtil().getOcrResultAsString(
313317
getTesseractInstance(),
314-
ImagePreprocessingUtil
315-
.preprocessImage(inputImage, pageNumber,
316-
getTesseract4OcrEngineProperties().getImagePreprocessingOptions()),
318+
pix,
317319
outputFormat);
320+
TesseractOcrUtil.destroyPix(pix);
318321
}
319322
if (result == null) {
320323
BufferedImage bufferedImage = ImagePreprocessingUtil

pdfocr-tesseract4/src/main/java/com/itextpdf/pdfocr/tesseract4/TesseractOcrUtil.java

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,14 @@ static Pix readPixPageFromTiff(final File inputFile,
135135
static Pix preprocessPix(final Pix pix,
136136
final ImagePreprocessingOptions imagePreprocessingOptions) {
137137
Pix pix1 = convertToGrayscale(pix);
138-
pix1 = otsuImageThresholding(pix1, imagePreprocessingOptions);
139-
return pix1;
138+
Pix pix2 = otsuImageThresholding(pix1, imagePreprocessingOptions);
139+
if (!TesseractOcrUtil.samePix(pix, pix1) && !TesseractOcrUtil.samePix(pix1, pix2)) {
140+
// pix1 is cleaned only if it's unique here.
141+
// If it points to the same memory as pix then it should be cleaned higher in the call stack.
142+
// If it points to the same memory as pix2 then it is still required.
143+
TesseractOcrUtil.destroyPix(pix1);
144+
}
145+
return pix2;
140146
}
141147

142148
/**
@@ -175,7 +181,6 @@ static Pix convertToGrayscale(final Pix pix) {
175181
static Pix otsuImageThresholding(final Pix inputPix,
176182
final ImagePreprocessingOptions imagePreprocessingOptions) {
177183
if (inputPix != null) {
178-
Pix binarizedPix = null;
179184
if (inputPix.d == 8) {
180185
PointerByReference pointer = new PointerByReference();
181186
LeptonicaWrapper
@@ -189,18 +194,16 @@ static Pix otsuImageThresholding(final Pix inputPix,
189194
getOtsuAdaptiveThresholdSmoothingTileSize(inputPix.h,
190195
imagePreprocessingOptions.isSmoothTiling()),
191196
0,null, pointer);
192-
binarizedPix = new Pix(pointer.getValue());
197+
Pix binarizedPix = new Pix(pointer.getValue());
193198
if (binarizedPix.w > 0 && binarizedPix.h > 0) {
194-
// destroying original pix
195-
destroyPix(inputPix);
196199
return binarizedPix;
197200
} else {
198201
final String logMessage = MessageFormatUtil.format(
199202
Tesseract4LogMessageConstant.CANNOT_BINARIZE_IMAGE,
200203
inputPix.d);
201204
LOGGER.info(logMessage);
202205
// destroying created PointerByReference object
203-
destroyPix(binarizedPix);
206+
TesseractOcrUtil.destroyPix(binarizedPix);
204207
return inputPix;
205208
}
206209
} else {
@@ -251,8 +254,27 @@ static int getImagePixelColor(BufferedImage image, int x, int y) {
251254
*/
252255
static void destroyPix(Pix inputPix) {
253256
if (inputPix != null) {
254-
LeptonicaWrapper.lept_free(inputPix.getPointer());
257+
LeptonicaWrapper.pixFreeData(inputPix);
258+
}
259+
}
260+
261+
/**
262+
* Checks whether two {@link Pix} objects refer to the same content.
263+
*
264+
* @param pix1 the object to compare
265+
* @param pix2 the object to compare
266+
* @return {@code true} if objects refer to the same content, {@code false} otherwise
267+
*/
268+
static boolean samePix(Pix pix1, Pix pix2) {
269+
if (pix1 == pix2) {
270+
return true;
271+
}
272+
if (pix1 == null || pix2 == null) {
273+
return false;
255274
}
275+
276+
// Both not null
277+
return pix1.getPointer().equals(pix2.getPointer());
256278
}
257279

258280
/**
@@ -668,7 +690,11 @@ static Pix readPix(final byte[] imageBytes) {
668690
}
669691
if (pix != null) {
670692
int rotation = detectRotation(imageBytes);
671-
pix = rotate(pix, rotation);
693+
Pix rotatedPix = rotate(pix, rotation);
694+
if (!TesseractOcrUtil.samePix(rotatedPix, pix)) {
695+
TesseractOcrUtil.destroyPix(pix);
696+
}
697+
pix = rotatedPix;
672698
}
673699
return pix;
674700
}
@@ -801,7 +827,7 @@ static ImageData applyRotation(final ImageData imageData) {
801827
);
802828
}
803829
} finally {
804-
destroyPix(pix);
830+
TesseractOcrUtil.destroyPix(pix);
805831
}
806832
return newImageData;
807833
}

pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/ImagePreprocessingUtilTest.java

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ This file is part of the iText (R) project.
3434
import java.io.File;
3535
import java.io.IOException;
3636

37+
import net.sourceforge.lept4j.Pix;
3738
import org.junit.jupiter.api.Assertions;
3839
import org.junit.jupiter.api.Test;
3940

@@ -63,32 +64,32 @@ public void testImagePreprocessingOptions() throws IOException {
6364
String processedImg = getTargetDirectory() + "thai_02_processed.jpg";
6465
String compareImg = TEST_IMAGES_DIRECTORY + "thai_02_cmp_01.jpg";
6566

66-
TesseractOcrUtil.savePixToPngFile(processedImg,
67-
ImagePreprocessingUtil.preprocessImage(new File(sourceImg),
68-
1,
69-
new ImagePreprocessingOptions())
70-
);
67+
Pix pix = ImagePreprocessingUtil.preprocessImage(new File(sourceImg),
68+
1,
69+
new ImagePreprocessingOptions());
70+
TesseractOcrUtil.savePixToPngFile(processedImg, pix);
71+
TesseractOcrUtil.destroyPix(pix);
7172
compareImagesWithPrecision(compareImg, processedImg, 0.1);
7273

7374
compareImg = TEST_IMAGES_DIRECTORY + "thai_02_cmp_02.jpg";
74-
TesseractOcrUtil.savePixToPngFile(processedImg,
75-
ImagePreprocessingUtil.preprocessImage(new File(sourceImg),
76-
1,
77-
new ImagePreprocessingOptions()
78-
.setTileWidth(300)
79-
.setTileHeight(300))
80-
);
75+
pix = ImagePreprocessingUtil.preprocessImage(new File(sourceImg),
76+
1,
77+
new ImagePreprocessingOptions()
78+
.setTileWidth(300)
79+
.setTileHeight(300));
80+
TesseractOcrUtil.savePixToPngFile(processedImg, pix);
81+
TesseractOcrUtil.destroyPix(pix);
8182
compareImagesWithPrecision(compareImg, processedImg, 0.1);
8283

8384
compareImg = TEST_IMAGES_DIRECTORY + "thai_02_cmp_03.jpg";
84-
TesseractOcrUtil.savePixToPngFile(processedImg,
85-
ImagePreprocessingUtil.preprocessImage(new File(sourceImg),
86-
1,
87-
new ImagePreprocessingOptions()
88-
.setTileWidth(300)
89-
.setTileHeight(300)
90-
.setSmoothTiling(false))
91-
);
85+
pix = ImagePreprocessingUtil.preprocessImage(new File(sourceImg),
86+
1,
87+
new ImagePreprocessingOptions()
88+
.setTileWidth(300)
89+
.setTileHeight(300)
90+
.setSmoothTiling(false));
91+
TesseractOcrUtil.savePixToPngFile(processedImg, pix);
92+
TesseractOcrUtil.destroyPix(pix);
9293
compareImagesWithPrecision(compareImg, processedImg, 0.1);
9394
}
9495

pdfocr-tesseract4/src/test/java/com/itextpdf/pdfocr/tesseract4/TesseractOcrUtilTest.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ public void testTesseract4OcrForPix()
6161
String result = new TesseractOcrUtil().getOcrResultAsString(
6262
tesseract4LibOcrEngine.getTesseractInstance(),
6363
pix, OutputFormat.TXT);
64+
TesseractOcrUtil.destroyPix(pix);
6465
Assertions.assertTrue(result.contains(expected));
6566
}
6667

@@ -182,6 +183,7 @@ public void testPixSavingAsPng() {
182183
Assertions.assertFalse(Files.exists(Paths.get(tmpFileName)));
183184
Pix pix = TesseractOcrUtil.readPix(new File(path));
184185
TesseractOcrUtil.savePixToPngFile(tmpFileName, pix);
186+
TesseractOcrUtil.destroyPix(pix);
185187
Assertions.assertTrue(Files.exists(Paths.get(tmpFileName)));
186188
TesseractHelper.deleteFile(tmpFileName);
187189
Assertions.assertFalse(Files.exists(Paths.get(tmpFileName)));

0 commit comments

Comments
 (0)