Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,8 @@ evaluation/datasets/**/*.jpeg
evaluation/datasets/**/*.jpg
evaluation/datasets/**/*.pgm

*.pkl
*.pkl

/.bsp

*.jar
12 changes: 12 additions & 0 deletions .scalafix.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Scalafix rule set for this project.
# Each entry names one rule; the trailing comment on the line summarises what it does.
# RemoveUnused depends on the compiler's "unused" warnings being enabled in the build.
rules = [
ExplicitResultTypes, # Inserts type annotations for inferred public members.
NoAutoTupling, # Inserts explicit tuples for adapted argument lists for compatibility with -Yno-adapted-args.
OrganizeImports, # Organizes import statements.
RemoveUnused, # Removes unused imports and terms reported by the compiler under -Wunused.
DisableSyntax, # Reports an error for disabled features such as var or XML literals.
LeakingImplicitClassVal, # Adds 'private' to val parameters of implicit value classes.
NoValInForComprehension, # Removes deprecated val inside for-comprehension binders.
ProcedureSyntax, # Replaces deprecated Scala 2.x procedure syntax with explicit ': Unit ='.
RedundantSyntax # Removes redundant syntax such as `final` modifiers on an object.
]

36 changes: 22 additions & 14 deletions build.sbt
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
lazy val scala211 = "2.11.12"
lazy val scala212 = "2.12.16"
lazy val scala213 = "2.13.8" // Not supported yet (collections changes required in common)
lazy val supportedScalaVersions = List(scala212, scala211)
lazy val scala212 = "2.12.20"
lazy val scala213 = "2.13.15"
lazy val scala35 = "3.5.2"
lazy val supportedScalaVersions = List(scala212, scala213, scala35)

Global / onChangedBuildSource := ReloadOnSourceChanges

ThisBuild / organization := "org.allenai"
ThisBuild / description := "Scala library to extract figures, tables, and captions from scholarly documents"
ThisBuild / scalaVersion := scala212
ThisBuild / scalaVersion := scala35
ThisBuild / version := "0.1.0"
ThisBuild / semanticdbEnabled := true

// Turn on the compiler's "unused" warnings, which scalafix's RemoveUnused rule consumes.
// The flag is spelled differently per compiler line, and the build still lists Scala 2.12
// and 2.13 among its supported versions, so pick the spelling from the active scalaVersion:
//   - Scala 2.12 and older have no -W options; the equivalent is -Ywarn-unused.
//   - Scala 2.13 enables every unused check with a bare -Wunused.
//   - Scala 3 uses -Wunused:all.
scalacOptions ++= {
  CrossVersion.partialVersion(scalaVersion.value) match {
    case Some((2, minor)) if minor <= 12 => Seq("-Ywarn-unused")
    case Some((2, _))                    => Seq("-Wunused")
    case _                               => Seq("-Wunused:all")
  }
}

lazy val projectSettings = Seq(
name := "pdffigures2",
Expand All @@ -25,13 +30,14 @@ lazy val projectSettings = Seq(
bintrayOrganization := Some("allenai"),
bintrayRepository := "maven",
libraryDependencies ++= Seq(
"org.scala-lang.modules" %% "scala-parallel-collections" % "1.0.4",
"io.spray" %% "spray-json" % "1.3.6",
"com.github.scopt" %% "scopt" % "4.1.0",
"ch.qos.logback" % "logback-classic" % "1.2.11",
"org.slf4j" % "jcl-over-slf4j" % "1.7.36",
"org.apache.pdfbox" % "pdfbox" % "2.0.26",
"org.apache.pdfbox" % "fontbox" % "2.0.26",
"com.typesafe" % "config" % "1.4.2",
"ch.qos.logback" % "logback-classic" % "1.5.6",
"org.slf4j" % "jcl-over-slf4j" % "2.0.13",
"org.apache.pdfbox" % "pdfbox" % "3.0.2",
"org.apache.pdfbox" % "fontbox" % "3.0.2",
"com.typesafe" % "config" % "1.4.3",

// So PDFBox can parse more image formats
// These are disabled by default, because they are not licensed flexibly enough.
Expand All @@ -40,11 +46,11 @@ lazy val projectSettings = Seq(
// "com.levigo.jbig2" % "levigo-jbig2-imageio" % "2.0", // For handling jbig2 images

// So PDFBox can parse security enabled but still readable PDFs
"org.bouncycastle" % "bcprov-jdk18on" % "1.71",
"org.bouncycastle" % "bcmail-jdk18on" % "1.71",
"org.bouncycastle" % "bcpkix-jdk18on" % "1.71",
"org.bouncycastle" % "bcprov-jdk18on" % "1.78.1",
"org.bouncycastle" % "bcmail-jdk18on" % "1.78.1",
"org.bouncycastle" % "bcpkix-jdk18on" % "1.78.1",

"org.scalatest" %% "scalatest" % "3.2.13" % Test
"org.scalatest" %% "scalatest" % "3.2.19" % Test
),

pomExtra :=
Expand All @@ -61,11 +67,13 @@ lazy val root = (project in file("."))
.settings(projectSettings)

// Use the batch CLI as the entry point everywhere a main class is needed:
// for the `run` task, for the Compile configuration, and for the assembled jar.
Compile / run / mainClass := Some("org.allenai.pdffigures2.FigureExtractorBatchCli")
Compile / mainClass := Some("org.allenai.pdffigures2.FigureExtractorBatchCli")
assembly / mainClass := Some("org.allenai.pdffigures2.FigureExtractorBatchCli")
// Write the assembled jar to a fixed file name instead of the plugin's default output path.
assembly / assemblyOutputPath := file("pdffigures2.jar")

assembly / assemblyMergeStrategy := {
case x if x.endsWith("module-info.class") => MergeStrategy.discard
case PathList("META-INF", "versions", "9", "OSGI-INF", "MANIFEST.MF") => MergeStrategy.first
case PathList("org", "apache", "commons", xs @ _*) => MergeStrategy.first
case x =>
val oldStrategy = (assembly / assemblyMergeStrategy).value
Expand Down
2 changes: 1 addition & 1 deletion project/build.properties
Original file line number Diff line number Diff line change
@@ -1 +1 @@
sbt.version=1.7.1
sbt.version=1.10.5
3 changes: 3 additions & 0 deletions project/plugins.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,6 @@ addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "1.2.0")
addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.6")

addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.6.1")

addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.13.0")

19 changes: 10 additions & 9 deletions src/main/scala/org/allenai/pdffigures2/CaptionDetector.scala
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.allenai.pdffigures2

import org.apache.pdfbox.pdmodel.font.PDFont

import FigureType._

case class CaptionStart(
Expand All @@ -14,12 +15,12 @@ case class CaptionStart(
paragraphStart: Boolean,
lineEnd: Boolean
) {
val figId = (figType, name)
val colonMatch = numberSyntax == ":"
val periodMatch = numberSyntax == "."
val allCapsFig = header.startsWith("FIG")
val allCapsTable = header == "TABLE"
val figAbbreviated = header == "Fig."
val figId: (FigureType.FigureType, String) = (figType, name)
val colonMatch: Boolean = numberSyntax == ":"
val periodMatch: Boolean = numberSyntax == "."
val allCapsFig: Boolean = header.startsWith("FIG")
val allCapsTable: Boolean = header == "TABLE"
val figAbbreviated: Boolean = header == "Fig."
}

object CaptionDetector extends Logging {
Expand Down Expand Up @@ -54,7 +55,7 @@ object CaptionDetector extends Logging {
standardFont: PDFont,
types: Set[FigureType]
) extends CandidateFilter {
val name = s"Non Standard Font: ${types.toList}"
val name: String = s"Non Standard Font: ${types.toList}"
def accept(cc: CaptionStart): Boolean =
!types.contains(cc.figType) ||
cc.line.words.head.positions.head.getFont != standardFont
Expand Down Expand Up @@ -91,7 +92,7 @@ object CaptionDetector extends Logging {
}

private case class LeftAlignedOnly(figureOnly: Boolean) extends CandidateFilter {
val name = "Left Aligned" + (if (figureOnly) " Figures" else "")
val name: String = "Left Aligned" + (if (figureOnly) " Figures" else "")
def accept(cc: CaptionStart): Boolean = {
figureOnly && cc.figType == FigureType.Table || (if (cc.nextLine.isDefined) {
Math.abs(
Expand Down Expand Up @@ -255,7 +256,7 @@ object CaptionDetector extends Logging {
}
if (!removedAny) {
logger.debug(
s"Filtered for paragraph starts, " +
"Filtered for paragraph starts, " +
s"${groupedById.values.map(_.size).sum} remaining"
)
}
Expand Down
3 changes: 2 additions & 1 deletion src/main/scala/org/allenai/pdffigures2/DocumentLayout.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ package org.allenai.pdffigures2

import org.apache.pdfbox.pdmodel.font.PDFont

import scala.collection.{ immutable, mutable }
import scala.collection.immutable
import scala.collection.mutable

/** Store some statistics about the document as a whole */
case class DocumentLayout(
Expand Down
10 changes: 6 additions & 4 deletions src/main/scala/org/allenai/pdffigures2/FigureDetector.scala
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.allenai.pdffigures2

import scala.annotation.unused

object FigureDetector {

private val MinProposalHeight = 15
Expand Down Expand Up @@ -232,7 +234,7 @@ object FigureDetector {
*/
// TODO it would be nice to be able to do this for downwards proposals as well
private def clipUpwardRegion(
caption: Box,
@unused caption: Box,
region: Box,
graphics: Seq[Box],
otherText: Seq[Paragraph]
Expand Down Expand Up @@ -277,7 +279,7 @@ object FigureDetector {
private def scoreProposal(
proposal: Proposal,
graphics: Seq[Box],
otherText: Seq[Box],
@unused otherText: Seq[Box],
otherProposals: Seq[Proposal],
bounds: Box
): Option[Double] = {
Expand Down Expand Up @@ -536,7 +538,7 @@ object FigureDetector {
def locatedFigures(
page: PageWithBodyText,
layout: DocumentLayout,
log: Option[VisualLogger]
@unused log: Option[VisualLogger]
): PageWithFigures = {
val proposals = buildProposals(page, layout)
val proposalsWithCaptions = page.captions.zip(proposals)
Expand All @@ -560,7 +562,7 @@ object FigureDetector {
} else {
val bestConfiguration = cartesianProduct(validProposals.toList).view.zipWithIndex
.map {
case (proposalsToUse, index) =>
case (proposalsToUse, _) =>
var props = splitProposals(proposalsToUse, allContent).toList
var scored = List[Proposal]()
var scores = List[Option[Double]]()
Expand Down
25 changes: 12 additions & 13 deletions src/main/scala/org/allenai/pdffigures2/FigureExtractor.scala
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
package org.allenai.pdffigures2

import org.allenai.pdffigures2.FigureExtractor.{
Document,
DocumentContent,
DocumentWithRasterizedFigures
}
import org.allenai.pdffigures2.SectionedTextBuilder.{ DocumentSection, PdfText }

import com.typesafe.config.ConfigFactory
import org.allenai.pdffigures2.FigureExtractor.Document
import org.allenai.pdffigures2.FigureExtractor.DocumentContent
import org.allenai.pdffigures2.FigureExtractor.DocumentWithRasterizedFigures
import org.allenai.pdffigures2.SectionedTextBuilder.DocumentSection
import org.allenai.pdffigures2.SectionedTextBuilder.PdfText
import org.apache.pdfbox.Loader
import org.apache.pdfbox.io.RandomAccessReadBuffer
import org.apache.pdfbox.pdmodel.PDDocument

import java.io.InputStream
Expand Down Expand Up @@ -209,9 +208,9 @@ object FigureExtractor {
pagesWithFigures: Seq[PageWithFigures],
pagesWithoutFigures: Seq[PageWithClassifiedText]
) {
val pages = (pagesWithFigures ++ pagesWithoutFigures).sortBy(_.pageNumber)
def figures = pagesWithFigures.flatMap(_.figures)
def failedCaptions = pagesWithFigures.flatMap(_.failedCaptions)
val pages: Seq[ClassifiedPage] = (pagesWithFigures ++ pagesWithoutFigures).sortBy(_.pageNumber)
def figures: Seq[Figure] = pagesWithFigures.flatMap(_.figures)
def failedCaptions: Seq[Caption] = pagesWithFigures.flatMap(_.failedCaptions)
require(pages.head.pageNumber == 0, "Must start with page number 0")
require(
pages
Expand Down Expand Up @@ -239,9 +238,9 @@ object FigureExtractor {
private val figureExtractor = new FigureExtractor(true, true, true, true, true)

def fromInputStream(is: InputStream): Document =
fromPDDocument(PDDocument.load(is))
fromPDDocument(Loader.loadPDF(new RandomAccessReadBuffer(is)))

def fromPDDocument(pdDocument: PDDocument) =
def fromPDDocument(pdDocument: PDDocument): Document =
figureExtractor.getFiguresWithText(pdDocument)
}

Expand Down
Loading