Skip to content

Commit

Permalink
This closes #14
Browse files Browse the repository at this point in the history
  • Loading branch information
JonathanLink committed Aug 2, 2017
1 parent 908c35e commit 09bcc6f
Showing 1 changed file with 36 additions and 34 deletions.
70 changes: 36 additions & 34 deletions PDFLayoutTextStripper.java
Original file line number Diff line number Diff line change
@@ -1,24 +1,26 @@
package pdftest.pt;

/*
* Author: Jonathan Link
* Email: jonathanlink[d o t]email[a t]gmail[d o t]com
* Date of creation: 13.11.2014
* Version: 2.1
* Description:
*
*
* Version 2.1 uses PDFBox 2.x. Version 1.0 used PDFBox 1.8.x
* Acknowledgement to James Sullivan for version 2.0
*
* What does it DO:
* This object converts the content of a PDF file into a String.
* The layout of the text is reproduced as closely as possible to the layout of the input PDF.
*
*
* What does it NOT DO:
* Vertical text in the PDF file is not handled yet.
*
*
* I would appreciate any feedback you could offer. (see my email address above)
*
*
* LICENSE:
*
*
* The MIT License (MIT)
*
* Copyright (c) 2014-2015 Jonathan Link
Expand All @@ -29,18 +31,18 @@
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*
*/

import java.io.IOException;
Expand Down Expand Up @@ -72,7 +74,7 @@ public PDFLayoutTextStripper() throws IOException {

@Override
public void processPage(PDPage page) throws IOException {
PDRectangle pageRectangle = page.getMediaBox();
PDRectangle pageRectangle = page.getMediaBox();
if (pageRectangle!= null) {
this.setCurrentPageWidth(pageRectangle.getWidth());
super.processPage(page);
Expand All @@ -85,9 +87,9 @@ public void processPage(PDPage page) throws IOException {
/*
 * Processes the characters collected for the current page.
 *
 * For every article list returned by super.getCharactersByArticle(), the
 * list is sorted via sortTextPositionList and then handed, as an iterator,
 * to iterateThroughTextList. Once all articles have been processed, the
 * accumulated text lines (getTextLineList) are passed to writeToOutputStream.
 *
 * Each article is sorted and iterated exactly once; the list variable is
 * declared a single time per loop iteration.
 *
 * Throws IOException as declared by the overridden method and by
 * writeToOutputStream.
 */
protected void writePage() throws IOException {
    List<List<TextPosition>> charactersByArticle = super.getCharactersByArticle();
    for (List<TextPosition> textList : charactersByArticle) {
        // Sorting happens in place, so the iterator below sees the sorted order.
        this.sortTextPositionList(textList);
        this.iterateThroughTextList(textList.iterator());
    }
    this.writeToOutputStream(this.getTextLineList());
}
Expand All @@ -105,7 +107,7 @@ private void writeToOutputStream(final List<TextLine> textLineList) throws IOExc
* In order to get rid of the warning:
* TextPositionComparator class should implement Comparator<TextPosition> instead of Comparator
*/
@SuppressWarnings("unchecked")
@SuppressWarnings("unchecked")
private void sortTextPositionList(final List<TextPosition> textList) {
TextPositionComparator comparator = new TextPositionComparator();
Collections.sort(textList, comparator);
Expand All @@ -124,7 +126,7 @@ private void writeLine(final List<TextPosition> textPositionList) {
}
} else {
this.addNewLine(); // white line
}
}
}

private void iterateThroughTextList(Iterator<TextPosition> textIterator) {
Expand Down Expand Up @@ -166,12 +168,13 @@ private int getNumberOfNewLinesFromPreviousTextPosition(final TextPosition textP

float textYPosition = Math.round( textPosition.getY() );
float previousTextYPosition = Math.round( previousTextPosition.getY() );

if ( textYPosition > previousTextYPosition ) {
if ( textYPosition > previousTextYPosition && (textYPosition - previousTextYPosition > 5.5) ) {
double height = textPosition.getHeight();
int numberOfLines = (int) (Math.floor( textYPosition - previousTextYPosition) / height );
numberOfLines = Math.max(1, numberOfLines - 1); // exclude current new line
return numberOfLines ;
if (DEBUG) System.out.println(height + " " + numberOfLines);
return numberOfLines ;
} else {
return 0;
}
Expand All @@ -182,7 +185,7 @@ private TextLine addNewLine() {
textLineList.add(textLine);
return textLine;
}

/**
 * Returns the value currently stored in the previousTextPosition field.
 *
 * @return the stored previous text position (whatever the field holds)
 */
private TextPosition getPreviousTextPosition() {
    return previousTextPosition;
}
Expand All @@ -202,7 +205,7 @@ private void setCurrentPageWidth(double currentPageWidth) {
/**
 * Returns the textLineList field itself (no copy is made).
 *
 * @return the list of text lines held by this object
 */
private List<TextLine> getTextLineList() {
    return textLineList;
}

}

class TextLine {
Expand Down Expand Up @@ -233,14 +236,14 @@ public int getLineLength() {

/**
 * Returns the current content of the line field.
 *
 * @return the line string held by this object
 */
public String getLine() {
    return this.line;
}
}

private int computeIndexForCharacter(final Character character) {
private int computeIndexForCharacter(final Character character) {
int index = character.getIndex();
boolean isCharacterPartOfPreviousWord = character.isCharacterPartOfPreviousWord();
boolean isCharacterAtTheBeginningOfNewLine = character.isCharacterAtTheBeginningOfNewLine();
boolean isCharacterCloseToPreviousWord = character.isCharacterCloseToPreviousWord();

if ( !this.indexIsInBounds(index) ) {
return -1;
} else {
Expand Down Expand Up @@ -274,7 +277,7 @@ private int getNextValidIndex(int index, boolean isCharacterPartOfPreviousWord)
nextValidIndex = lastIndex + 1;
}
if ( !isCharacterPartOfPreviousWord && this.isSpaceCharacterAtIndex(index - 1) ) {
nextValidIndex = nextValidIndex + 1;
nextValidIndex = nextValidIndex + 1;
}
this.setLastIndex(nextValidIndex);
return nextValidIndex;
Expand All @@ -283,7 +286,7 @@ private int getNextValidIndex(int index, boolean isCharacterPartOfPreviousWord)
/*
 * Scans the line backwards, starting at the given index, for as long as the
 * character at the current position equals SPACE_CHARACTER.
 *
 * Returns the position immediately after the position where the scan
 * stopped: the index of the first space of the run of spaces that ends at
 * the given index, 0 if the run reaches the start of the line, or index + 1
 * if the character at index is not a space.
 *
 * The scan moves back exactly one position per iteration so that no
 * character is skipped and the result can never be negative.
 *
 * NOTE(review): index is passed straight to String.charAt, so it must be
 * smaller than the line length - presumably guaranteed by the caller; confirm.
 */
private int findMinimumIndexWithSpaceCharacterFromIndex(int index) {
    int newIndex = index;
    while (newIndex >= 0 && this.line.charAt(newIndex) == SPACE_CHARACTER) {
        newIndex--;
    }
    return newIndex + 1;
}
Expand Down Expand Up @@ -325,7 +328,7 @@ public Character(char characterValue, int index, boolean isCharacterPartOfPrevio
this.isFirstCharacterOfAWord = isFirstCharacterOfAWord;
this.isCharacterAtTheBeginningOfNewLine = isCharacterAtTheBeginningOfNewLine;
this.isCharacterCloseToPreviousWord = isCharacterPartOfASentence;
if (PDFLayoutTextStripper.DEBUG) System.out.println(this.toString());
if (PDFLayoutTextStripper.DEBUG) System.out.println(this.toString());
}

public char getCharacterValue() {
Expand Down Expand Up @@ -393,12 +396,12 @@ public Character createCharacterFromTextPosition(final TextPosition textPosition
this.isCharacterCloseToPreviousWord = this.isCharacterCloseToPreviousWord(textPosition);
char character = this.getCharacterFromTextPosition(textPosition);
int index = (int)textPosition.getX() / PDFLayoutTextStripper.OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT;
return new Character(character,
index,
isCharacterPartOfPreviousWord,
isFirstCharacterOfAWord,
isCharacterAtTheBeginningOfNewLine,
isCharacterCloseToPreviousWord);
return new Character(character,
index,
isCharacterPartOfPreviousWord,
isFirstCharacterOfAWord,
isCharacterAtTheBeginningOfNewLine,
isCharacterCloseToPreviousWord);
}

private boolean isCharacterAtTheBeginningOfNewLine(final TextPosition textPosition) {
Expand All @@ -407,7 +410,7 @@ private boolean isCharacterAtTheBeginningOfNewLine(final TextPosition textPositi
}
TextPosition previousTextPosition = this.getPreviousTextPosition();
float previousTextYPosition = previousTextPosition.getY();
return ( Math.round( textPosition.getY() ) < Math.round(previousTextYPosition) );
return ( Math.round( textPosition.getY() ) < Math.round(previousTextYPosition) );
}

private boolean isFirstCharacterOfAWord(final TextPosition textPosition) {
Expand Down Expand Up @@ -444,7 +447,7 @@ private double numberOfSpacesBetweenTwoCharacters(final TextPosition textPositio
return numberOfSpaces;
}



private char getCharacterFromTextPosition(final TextPosition textPosition) {
String string = textPosition.getUnicode();
Expand All @@ -461,4 +464,3 @@ private void setPreviousTextPosition(final TextPosition previousTextPosition) {
}

}

0 comments on commit 09bcc6f

Please sign in to comment.