diff --git a/.gitignore b/.gitignore
index b8c7cbb..60a07e8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,4 +9,8 @@ evaluation/datasets/**/*.jpeg
evaluation/datasets/**/*.jpg
evaluation/datasets/**/*.pgm
-*.pkl
\ No newline at end of file
+*.pkl
+
+/.bsp
+
+*.jar
diff --git a/.scalafix.conf b/.scalafix.conf
new file mode 100644
index 0000000..853bc21
--- /dev/null
+++ b/.scalafix.conf
@@ -0,0 +1,12 @@
+rules = [
+ ExplicitResultTypes, # Inserts type annotations for inferred public members.
+ NoAutoTupling, # Inserts explicit tuples for adapted argument lists for compatibility with -Yno-adapted-args.
+ OrganizeImports, # Organizes import statements.
+ RemoveUnused, # Removes unused imports and terms reported by the compiler under -Wunused.
+ DisableSyntax, # Reports an error for disabled features such as var or XML literals.
+ LeakingImplicitClassVal, # Adds 'private' to val parameters of implicit value classes.
+ NoValInForComprehension, # Removes deprecated val inside for-comprehension binders.
+ ProcedureSyntax, # Replaces deprecated Scala 2.x procedure syntax with explicit ': Unit ='.
+ RedundantSyntax # Removes redundant syntax such as `final` modifiers on an object.
+]
+
diff --git a/build.sbt b/build.sbt
index 222fecc..bdb65f1 100644
--- a/build.sbt
+++ b/build.sbt
@@ -1,14 +1,19 @@
-lazy val scala211 = "2.11.12"
-lazy val scala212 = "2.12.16"
-lazy val scala213 = "2.13.8" // Not supported yet (collections changes required in common)
-lazy val supportedScalaVersions = List(scala212, scala211)
+lazy val scala212 = "2.12.20" // Not cross-built any more: sources use 2.13+ APIs (scala.annotation.unused, lazyZip, parallel-collections module)
+lazy val scala213 = "2.13.15"
+lazy val scala35 = "3.5.2"
+lazy val supportedScalaVersions = List(scala213, scala35) // presumably feeds crossScalaVersions (not visible in this hunk)
Global / onChangedBuildSource := ReloadOnSourceChanges
ThisBuild / organization := "org.allenai"
ThisBuild / description := "Scala library to extract figures, tables, and captions from scholarly documents"
-ThisBuild / scalaVersion := scala212
+ThisBuild / scalaVersion := scala35
ThisBuild / version := "0.1.0"
+ThisBuild / semanticdbEnabled := true
+
+scalacOptions ++= ( // Scala 3 takes `-Wunused:all`; on Scala 2.13 the bare `-Wunused` flag turns the unused checks on
+  if (scalaVersion.value.startsWith("3.")) Seq("-Wunused:all") else Seq("-Wunused")
+)
lazy val projectSettings = Seq(
name := "pdffigures2",
@@ -25,13 +30,14 @@ lazy val projectSettings = Seq(
bintrayOrganization := Some("allenai"),
bintrayRepository := "maven",
libraryDependencies ++= Seq(
+ "org.scala-lang.modules" %% "scala-parallel-collections" % "1.0.4",
"io.spray" %% "spray-json" % "1.3.6",
"com.github.scopt" %% "scopt" % "4.1.0",
- "ch.qos.logback" % "logback-classic" % "1.2.11",
- "org.slf4j" % "jcl-over-slf4j" % "1.7.36",
- "org.apache.pdfbox" % "pdfbox" % "2.0.26",
- "org.apache.pdfbox" % "fontbox" % "2.0.26",
- "com.typesafe" % "config" % "1.4.2",
+ "ch.qos.logback" % "logback-classic" % "1.5.6",
+ "org.slf4j" % "jcl-over-slf4j" % "2.0.13",
+ "org.apache.pdfbox" % "pdfbox" % "3.0.2",
+ "org.apache.pdfbox" % "fontbox" % "3.0.2",
+ "com.typesafe" % "config" % "1.4.3",
// So PDFBox can parse more image formats
// These are disabled by default, because they are not licensed flexibly enough.
@@ -40,11 +46,11 @@ lazy val projectSettings = Seq(
// "com.levigo.jbig2" % "levigo-jbig2-imageio" % "2.0", // For handling jbig2 images
// So PDFBox can parse security enabled but still readable PDFs
- "org.bouncycastle" % "bcprov-jdk18on" % "1.71",
- "org.bouncycastle" % "bcmail-jdk18on" % "1.71",
- "org.bouncycastle" % "bcpkix-jdk18on" % "1.71",
+ "org.bouncycastle" % "bcprov-jdk18on" % "1.78.1",
+ "org.bouncycastle" % "bcmail-jdk18on" % "1.78.1",
+ "org.bouncycastle" % "bcpkix-jdk18on" % "1.78.1",
- "org.scalatest" %% "scalatest" % "3.2.13" % Test
+ "org.scalatest" %% "scalatest" % "3.2.19" % Test
),
pomExtra :=
@@ -61,11 +67,13 @@ lazy val root = (project in file("."))
.settings(projectSettings)
Compile / run / mainClass := Some("org.allenai.pdffigures2.FigureExtractorBatchCli")
+Compile / mainClass := Some("org.allenai.pdffigures2.FigureExtractorBatchCli")
assembly / mainClass := Some("org.allenai.pdffigures2.FigureExtractorBatchCli")
assembly / assemblyOutputPath := file("pdffigures2.jar")
assembly / assemblyMergeStrategy := {
case x if x.endsWith("module-info.class") => MergeStrategy.discard
+ case PathList("META-INF", "versions", "9", "OSGI-INF", "MANIFEST.MF") => MergeStrategy.first
case PathList("org", "apache", "commons", xs @ _*) => MergeStrategy.first
case x =>
val oldStrategy = (assembly / assemblyMergeStrategy).value
diff --git a/project/build.properties b/project/build.properties
index 22af262..db1723b 100644
--- a/project/build.properties
+++ b/project/build.properties
@@ -1 +1 @@
-sbt.version=1.7.1
+sbt.version=1.10.5
diff --git a/project/plugins.sbt b/project/plugins.sbt
index dbe12f8..f97c72e 100644
--- a/project/plugins.sbt
+++ b/project/plugins.sbt
@@ -3,3 +3,6 @@ addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "1.2.0")
addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.6")
addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.6.1")
+
+addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.13.0")
+
diff --git a/src/main/scala/org/allenai/pdffigures2/CaptionDetector.scala b/src/main/scala/org/allenai/pdffigures2/CaptionDetector.scala
index 28c29f9..0f71aa1 100644
--- a/src/main/scala/org/allenai/pdffigures2/CaptionDetector.scala
+++ b/src/main/scala/org/allenai/pdffigures2/CaptionDetector.scala
@@ -1,6 +1,7 @@
package org.allenai.pdffigures2
import org.apache.pdfbox.pdmodel.font.PDFont
+
import FigureType._
case class CaptionStart(
@@ -14,12 +15,12 @@ case class CaptionStart(
paragraphStart: Boolean,
lineEnd: Boolean
) {
- val figId = (figType, name)
- val colonMatch = numberSyntax == ":"
- val periodMatch = numberSyntax == "."
- val allCapsFig = header.startsWith("FIG")
- val allCapsTable = header == "TABLE"
- val figAbbreviated = header == "Fig."
+ val figId: (FigureType.FigureType, String) = (figType, name)
+ val colonMatch: Boolean = numberSyntax == ":"
+ val periodMatch: Boolean = numberSyntax == "."
+ val allCapsFig: Boolean = header.startsWith("FIG")
+ val allCapsTable: Boolean = header == "TABLE"
+ val figAbbreviated: Boolean = header == "Fig."
}
object CaptionDetector extends Logging {
@@ -54,7 +55,7 @@ object CaptionDetector extends Logging {
standardFont: PDFont,
types: Set[FigureType]
) extends CandidateFilter {
- val name = s"Non Standard Font: ${types.toList}"
+ val name: String = s"Non Standard Font: ${types.toList}"
def accept(cc: CaptionStart): Boolean =
!types.contains(cc.figType) ||
cc.line.words.head.positions.head.getFont != standardFont
@@ -91,7 +92,7 @@ object CaptionDetector extends Logging {
}
private case class LeftAlignedOnly(figureOnly: Boolean) extends CandidateFilter {
- val name = "Left Aligned" + (if (figureOnly) " Figures" else "")
+ val name: String = "Left Aligned" + (if (figureOnly) " Figures" else "")
def accept(cc: CaptionStart): Boolean = {
figureOnly && cc.figType == FigureType.Table || (if (cc.nextLine.isDefined) {
Math.abs(
@@ -255,7 +256,7 @@ object CaptionDetector extends Logging {
}
if (!removedAny) {
logger.debug(
- s"Filtered for paragraph starts, " +
+ "Filtered for paragraph starts, " +
s"${groupedById.values.map(_.size).sum} remaining"
)
}
diff --git a/src/main/scala/org/allenai/pdffigures2/DocumentLayout.scala b/src/main/scala/org/allenai/pdffigures2/DocumentLayout.scala
index 29eb2d5..afca258 100644
--- a/src/main/scala/org/allenai/pdffigures2/DocumentLayout.scala
+++ b/src/main/scala/org/allenai/pdffigures2/DocumentLayout.scala
@@ -2,7 +2,8 @@ package org.allenai.pdffigures2
import org.apache.pdfbox.pdmodel.font.PDFont
-import scala.collection.{ immutable, mutable }
+import scala.collection.immutable
+import scala.collection.mutable
/** Store some statistics about the document as a whole */
case class DocumentLayout(
diff --git a/src/main/scala/org/allenai/pdffigures2/FigureDetector.scala b/src/main/scala/org/allenai/pdffigures2/FigureDetector.scala
index c41292a..e99f191 100644
--- a/src/main/scala/org/allenai/pdffigures2/FigureDetector.scala
+++ b/src/main/scala/org/allenai/pdffigures2/FigureDetector.scala
@@ -1,5 +1,7 @@
package org.allenai.pdffigures2
+import scala.annotation.unused
+
object FigureDetector {
private val MinProposalHeight = 15
@@ -232,7 +234,7 @@ object FigureDetector {
*/
// TODO it would be nice to be able to do this for downwards proposals as well
private def clipUpwardRegion(
- caption: Box,
+ @unused caption: Box,
region: Box,
graphics: Seq[Box],
otherText: Seq[Paragraph]
@@ -277,7 +279,7 @@ object FigureDetector {
private def scoreProposal(
proposal: Proposal,
graphics: Seq[Box],
- otherText: Seq[Box],
+ @unused otherText: Seq[Box],
otherProposals: Seq[Proposal],
bounds: Box
): Option[Double] = {
@@ -536,7 +538,7 @@ object FigureDetector {
def locatedFigures(
page: PageWithBodyText,
layout: DocumentLayout,
- log: Option[VisualLogger]
+ @unused log: Option[VisualLogger]
): PageWithFigures = {
val proposals = buildProposals(page, layout)
val proposalsWithCaptions = page.captions.zip(proposals)
@@ -560,7 +562,7 @@ object FigureDetector {
} else {
val bestConfiguration = cartesianProduct(validProposals.toList).view.zipWithIndex
.map {
- case (proposalsToUse, index) =>
+ case (proposalsToUse, _) =>
var props = splitProposals(proposalsToUse, allContent).toList
var scored = List[Proposal]()
var scores = List[Option[Double]]()
diff --git a/src/main/scala/org/allenai/pdffigures2/FigureExtractor.scala b/src/main/scala/org/allenai/pdffigures2/FigureExtractor.scala
index 1c20f75..e8cead7 100644
--- a/src/main/scala/org/allenai/pdffigures2/FigureExtractor.scala
+++ b/src/main/scala/org/allenai/pdffigures2/FigureExtractor.scala
@@ -1,13 +1,12 @@
package org.allenai.pdffigures2
-import org.allenai.pdffigures2.FigureExtractor.{
- Document,
- DocumentContent,
- DocumentWithRasterizedFigures
-}
-import org.allenai.pdffigures2.SectionedTextBuilder.{ DocumentSection, PdfText }
-
-import com.typesafe.config.ConfigFactory
+import org.allenai.pdffigures2.FigureExtractor.Document
+import org.allenai.pdffigures2.FigureExtractor.DocumentContent
+import org.allenai.pdffigures2.FigureExtractor.DocumentWithRasterizedFigures
+import org.allenai.pdffigures2.SectionedTextBuilder.DocumentSection
+import org.allenai.pdffigures2.SectionedTextBuilder.PdfText
+import org.apache.pdfbox.Loader
+import org.apache.pdfbox.io.RandomAccessReadBuffer
import org.apache.pdfbox.pdmodel.PDDocument
import java.io.InputStream
@@ -209,9 +208,9 @@ object FigureExtractor {
pagesWithFigures: Seq[PageWithFigures],
pagesWithoutFigures: Seq[PageWithClassifiedText]
) {
- val pages = (pagesWithFigures ++ pagesWithoutFigures).sortBy(_.pageNumber)
- def figures = pagesWithFigures.flatMap(_.figures)
- def failedCaptions = pagesWithFigures.flatMap(_.failedCaptions)
+ val pages: Seq[ClassifiedPage] = (pagesWithFigures ++ pagesWithoutFigures).sortBy(_.pageNumber)
+ def figures: Seq[Figure] = pagesWithFigures.flatMap(_.figures)
+ def failedCaptions: Seq[Caption] = pagesWithFigures.flatMap(_.failedCaptions)
require(pages.head.pageNumber == 0, "Must start with page number 0")
require(
pages
@@ -239,9 +238,9 @@ object FigureExtractor {
private val figureExtractor = new FigureExtractor(true, true, true, true, true)
def fromInputStream(is: InputStream): Document =
- fromPDDocument(PDDocument.load(is))
+ fromPDDocument(Loader.loadPDF(new RandomAccessReadBuffer(is)))
- def fromPDDocument(pdDocument: PDDocument) =
+ def fromPDDocument(pdDocument: PDDocument): Document =
figureExtractor.getFiguresWithText(pdDocument)
}
diff --git a/src/main/scala/org/allenai/pdffigures2/FigureExtractorBatchCli.scala b/src/main/scala/org/allenai/pdffigures2/FigureExtractorBatchCli.scala
index 2e36b99..a86e453 100644
--- a/src/main/scala/org/allenai/pdffigures2/FigureExtractorBatchCli.scala
+++ b/src/main/scala/org/allenai/pdffigures2/FigureExtractorBatchCli.scala
@@ -1,14 +1,20 @@
package org.allenai.pdffigures2
-import java.io.File
-import java.util.concurrent.atomic.AtomicInteger
-
-import ch.qos.logback.classic.{ Level, Logger }
+import ch.qos.logback.classic.Level
+import ch.qos.logback.classic.Logger
import org.allenai.pdffigures2.FigureExtractor.DocumentWithSavedFigures
import org.allenai.pdffigures2.JsonProtocol._
+import org.apache.pdfbox.Loader
import org.apache.pdfbox.pdmodel.PDDocument
import org.slf4j.LoggerFactory
+import scopt.OptionParser
+import spray.json.RootJsonFormat
+import java.io.File
+import java.util.concurrent.ForkJoinPool
+import java.util.concurrent.atomic.AtomicInteger
+import scala.annotation.unused
+import scala.collection.parallel.CollectionConverters._
import scala.collection.parallel.ForkJoinTaskSupport
/** CLI tools to parse a batch of PDFs, and then save the figures, table, captions
@@ -23,8 +29,8 @@ object FigureExtractorBatchCli extends Logging {
timeInMillis: Long
)
case class ProcessingError(filename: String, msg: Option[String], className: String)
- implicit val processingStatisticsFormat = jsonFormat4(ProcessingStatistics.apply)
- implicit val processingErrorFormat = jsonFormat3(ProcessingError.apply)
+ implicit val processingStatisticsFormat: RootJsonFormat[ProcessingStatistics] = jsonFormat4(ProcessingStatistics.apply)
+ implicit val processingErrorFormat: RootJsonFormat[ProcessingError] = jsonFormat3(ProcessingError.apply)
case class CliConfigBatch(
inputFiles: Seq[File] = Seq(),
@@ -40,9 +46,9 @@ object FigureExtractorBatchCli extends Logging {
figureFormat: String = "png"
)
- val Parser = new scopt.OptionParser[CliConfigBatch]("figure-extractor-batch") {
- head("figure-extractor-batch")
- arg[Seq[String]]("") required () action { (i, c) =>
+ val Parser: OptionParser[CliConfigBatch] = new scopt.OptionParser[CliConfigBatch]("figure-extractor-batch") {
+ this.head("figure-extractor-batch")
+ this.arg[Seq[String]]("").required().action({ (i, c) =>
val inputFiles =
if (i.size == 1) {
val file = new File(i.head)
@@ -55,50 +61,59 @@ object FigureExtractorBatchCli extends Logging {
i.map(f => new File(f)).toList
}
c.copy(inputFiles = inputFiles)
- } text "input PDF(s) or directory containing PDFs"
- opt[Int]('i', "dpi") action { (dpi, c) =>
+ }).text("input PDF(s) or directory containing PDFs")
+
+ this.opt[Int]('i', "dpi").action({ (dpi, c) =>
c.copy(dpi = dpi)
- } text
- "DPI to save the figures in (default 150)" validate { dpi =>
+ }).text("DPI to save the figures in (default 150)").validate({ dpi =>
if (dpi > 0) success else failure("DPI must > 0")
- }
- opt[String]('s', "save-stats") action { (s, c) =>
+ })
+
+ this.opt[String]('s', "save-stats").action({ (s, c) =>
c.copy(saveStats = Some(s))
- } validate { s =>
+ }).validate({ s =>
val f = new File(s)
- if (!f.exists() || f.canWrite && !f.isDirectory) {
+ if (!f.exists() || (f.canWrite && !f.isDirectory)) {
success
} else {
failure(s"Can't write to file $s")
}
- } text "Save the errors and timing information to the given file in JSON fromat"
- opt[Int]('t', "threads") action { (t, c) =>
+ }).text("Save the errors and timing information to the given file in JSON format")
+
+ this.opt[Int]('t', "threads").action({ (t, c) =>
c.copy(threads = t)
- } validate { t =>
+ }).validate({ t =>
if (t >= 0) success else failure("Threads must be >= 0")
- } text "Number of threads to use, 0 means using Scala's default"
- opt[Unit]('e', "ignore-error") action { (_, c) =>
+ }).text("Number of threads to use, 0 means using Scala's default")
+
+ this.opt[Unit]('e', "ignore-error").action({ (_, c) =>
c.copy(ignoreErrors = true)
- } text "Don't stop on errors, errors will be logged and also saved in `save-stats` if set"
- opt[Unit]('q', "quiet") action { (_, c) =>
+ }).text("Don't stop on errors, errors will be logged and also saved in `save-stats` if set")
+
+ this.opt[Unit]('q', "quiet").action({ (_, c) =>
c.copy(debugLogging = false)
- } text "Switches logging to INFO level"
- opt[String]('d', "figure-data-prefix") action { (o, c) =>
+ }).text("Switches logging to INFO level")
+
+ this.opt[String]('d', "figure-data-prefix").action({ (o, c) =>
c.copy(figureDataPrefix = Some(o))
- } text "Save JSON figure data to '.json'"
- opt[Unit]('c', "save-regionless-captions") action { (_, c) =>
+ }).text("Save JSON figure data to '.json'")
+
+ this.opt[Unit]('c', "save-regionless-captions").action({ (_, c) =>
c.copy(saveRegionlessCaptions = true)
- } text "Include captions for which no figure regions were found in the JSON data"
- opt[String]('g', "full-text-prefix") action { (f, c) =>
+ }).text("Include captions for which no figure regions were found in the JSON data")
+
+ this.opt[String]('g', "full-text-prefix").action({ (f, c) =>
c.copy(fullTextPrefix = Some(f))
- } text "Save the document and figures into '.json"
- opt[String]('m', "figure-prefix") action { (f, c) =>
+ }).text("Save the document and figures into '.json")
+
+ this.opt[String]('m', "figure-prefix").action({ (f, c) =>
c.copy(figureImagePrefix = Some(f))
- } text "Save figures as <figure-prefix><input_filename>-<Table|Figure><Name>-<id>.png. `id` " +
- "will be 1 unless multiple figures are found with the same `Name` in `input_filename`"
- opt[String]('f', "figure-format") action { (f, c) =>
+ }).text("Save figures as <figure-prefix><input_filename>-<Table|Figure><Name>-<id>.png. `id` " +
+ "will be 1 unless multiple figures are found with the same `Name` in `input_filename`")
+
+ this.opt[String]('f', "figure-format").action({ (f, c) =>
c.copy(figureFormat = f)
- } text "Format to save figures (default png)" validate { x =>
+ }).text("Format to save figures (default png)").validate({ x =>
if (FigureRenderer.AllowedFormats.contains(x)) {
success
} else {
@@ -107,20 +122,21 @@ object FigureExtractorBatchCli extends Logging {
s"formats: ${FigureRenderer.AllowedFormats.mkString(",")}"
)
}
- }
- checkConfig { c =>
+ })
+
+ this.checkConfig({ c =>
val badFiles =
c.inputFiles.find(f => !f.exists() || f.isDirectory || !f.getName.endsWith(".pdf"))
if (badFiles.isDefined) {
failure(s"Input file ${badFiles.get.getName} is not a PDF file")
} else if (c.saveRegionlessCaptions && c.fullTextPrefix.isDefined) {
- failure(s"Can't set both save-regionless-captions and full-text")
+ failure("Can't set both save-regionless-captions and full-text")
} else if (c.fullTextPrefix.isDefined && c.figureDataPrefix.isDefined) {
- failure(s"Can't set both full-text and figure-data-prefix")
+ failure("Can't set both full-text and figure-data-prefix")
} else {
success
}
- }
+ })
}
def getFilenames(
@@ -145,7 +161,7 @@ object FigureExtractorBatchCli extends Logging {
format: String,
dpi: Int,
figures: Seq[RasterizedFigure],
- doc: PDDocument
+ @unused doc: PDDocument
): Seq[SavedFigure] = {
val filenames = getFilenames(prefix, docName, format, figures.map(_.figure))
FigureRenderer.saveRasterizedFigures(filenames.zip(figures), format, dpi)
@@ -159,7 +175,7 @@ object FigureExtractorBatchCli extends Logging {
var doc: PDDocument = null
val figureExtractor = FigureExtractor()
try {
- doc = PDDocument.load(inputFile)
+ doc = Loader.loadPDF(inputFile)
val useCairo = FigureRenderer.CairoFormat.contains(config.figureFormat)
val inputName = inputFile.getName
val truncatedName = inputName.substring(0, inputName.lastIndexOf('.'))
@@ -306,7 +322,7 @@ object FigureExtractorBatchCli extends Logging {
val parFiles = config.inputFiles.par
if (config.threads != 0) {
parFiles.tasksupport = new ForkJoinTaskSupport(
- new scala.concurrent.forkjoin.ForkJoinPool(config.threads)
+ new ForkJoinPool(config.threads)
)
}
val onPdf = new AtomicInteger(0)
@@ -332,7 +348,7 @@ object FigureExtractorBatchCli extends Logging {
case _ => None
}
if (errors.isEmpty) {
- logger.info(s"No errors")
+ logger.info("No errors")
} else {
val errorString = errors
.map {
diff --git a/src/main/scala/org/allenai/pdffigures2/FigureExtractorVisualizationCli.scala b/src/main/scala/org/allenai/pdffigures2/FigureExtractorVisualizationCli.scala
index 45a9885..4c373a8 100644
--- a/src/main/scala/org/allenai/pdffigures2/FigureExtractorVisualizationCli.scala
+++ b/src/main/scala/org/allenai/pdffigures2/FigureExtractorVisualizationCli.scala
@@ -1,8 +1,6 @@
package org.allenai.pdffigures2
-
-import org.apache.pdfbox.io.MemoryUsageSetting
-
-import org.apache.pdfbox.pdmodel.PDDocument
+import org.apache.pdfbox.Loader
+import scopt.OptionParser
import java.io.File
@@ -22,50 +20,60 @@ object FigureExtractorVisualizationCli extends Logging {
showCleanedFigureRegions: Boolean = false
)
- val Parser = new scopt.OptionParser[CliConfig]("figure-extractor-visualize") {
- head("figure-extractor-visualize")
- arg[String]("") required () action { (i, c) =>
+ val Parser: OptionParser[CliConfig] = new scopt.OptionParser[CliConfig]("figure-extractor-visualize") {
+ this.head("figure-extractor-visualize")
+
+ this.arg[String]("").required().action({ (i, c) =>
c.copy(input = Some(new File(i)))
- } validate { x =>
+ }).validate({ x =>
val f = new File(x)
if (!f.exists || f.isDirectory || !x.endsWith(".pdf")) {
failure(s"File $x is not a PDF file")
} else {
success
}
- } text "input PDF file"
- opt[Unit]('s', "show-steps") action { (_, c) =>
+ }).text("input PDF file")
+
+ this.opt[Unit]('s', "show-steps").action({ (_, c) =>
c.copy(showAllSteps = true)
- } text "Show all intermediate steps"
- opt[Unit]('g', "show-graphic-clustering") action { (_, c) =>
+ }).text("Show all intermediate steps")
+
+ this.opt[Unit]('g', "show-graphic-clustering").action({ (_, c) =>
c.copy(showGraphicsClustering = true)
- } text "Show graphical elements found and how they were clustered"
- opt[Unit]('x', "show-cleaned-figure-regions") action { (_, c) =>
+ }).text("Show graphical elements found and how they were clustered")
+
+ this.opt[Unit]('x', "show-cleaned-figure-regions").action({ (_, c) =>
c.copy(showCleanedFigureRegions = true)
- } text "Shows figure regions after being post-processed using the" +
- " rasterized PDF at the given DPI"
- opt[Unit]('e', "show-extractions") action { (_, c) =>
+ }).text("Shows figure regions after being post-processed using the" +
+ " rasterized PDF at the given DPI")
+
+ this.opt[Unit]('e', "show-extractions").action({ (_, c) =>
c.copy(showExtractions = true)
- } text "Show the bounding boxes of the text and graphics that were extracted"
- opt[Unit]('r', "show-regions") action { (_, c) =>
+ }).text("Show the bounding boxes of the text and graphics that were extracted")
+
+ this.opt[Unit]('r', "show-regions").action({ (_, c) =>
c.copy(showRegions = true)
- } text "Show the different regions the PDF was broken into"
- opt[Unit]('c', "show-captions") action { (_, c) =>
+ }).text("Show the different regions the PDF was broken into")
+
+ this.opt[Unit]('c', "show-captions").action({ (_, c) =>
c.copy(showCaptions = true)
- } text "Show the location of the captions"
- opt[Unit]('t', "show-sections") action { (_, c) =>
+ }).text("Show the location of the captions")
+
+ this.opt[Unit]('t', "show-sections").action({ (_, c) =>
c.copy(showSections = true)
- } text "Show the location of sections and paragraphs"
- opt[Int]('d', "display-dpi") action { (dpi, c) =>
+ }).text("Show the location of sections and paragraphs")
+
+ this.opt[Int]('d', "display-dpi").action({ (dpi, c) =>
c.copy(displayDpi = dpi)
- } validate { dpi =>
+ }).validate({ dpi =>
if (dpi > 0) success else failure("DPI must > 0")
- } text "DPI to display figures at (default 55)"
- opt[Seq[Int]]('p', "pages") action { (pages, c) =>
+ }).text("DPI to display figures at (default 55)")
+
+ this.opt[Seq[Int]]('p', "pages").action({ (pages, c) =>
c.copy(pages = Some(pages))
- } text "Pages to extract from (defaults to all), 1 is the first page" validate { x =>
+ }).text("Pages to extract from (defaults to all), 1 is the first page").validate({ x =>
if (x.exists(_ <= 0)) failure("A page was <= 0") else success
- }
+ })
}
def run(config: CliConfig): Unit = {
@@ -83,7 +91,7 @@ object FigureExtractorVisualizationCli extends Logging {
config.showSections,
config.showCleanedFigureRegions
)
- val doc = PDDocument.load(inputFile)
+ val doc = Loader.loadPDF(inputFile)
logger.info(s"Loading ${inputFile.getName}")
logger.info(s"Extracting figures from ${inputFile.getName}")
@@ -113,10 +121,10 @@ object FigureExtractorVisualizationCli extends Logging {
FigureExtractor().getFigures(doc, pages, Some(vLogger))
}
}
- logger.info(s"Displaying figures")
+ logger.info("Displaying figures")
vLogger.displayVisualLog(doc, config.displayDpi)
doc.close()
- logger.info(s"Finished")
+ logger.info("Finished")
}
def main(args: Array[String]): Unit = {
diff --git a/src/main/scala/org/allenai/pdffigures2/FigureRenderer.scala b/src/main/scala/org/allenai/pdffigures2/FigureRenderer.scala
index b5b178f..2935ac0 100644
--- a/src/main/scala/org/allenai/pdffigures2/FigureRenderer.scala
+++ b/src/main/scala/org/allenai/pdffigures2/FigureRenderer.scala
@@ -6,12 +6,13 @@ import spray.json._
import java.awt.image.BufferedImage
import java.io._
import javax.imageio.ImageIO
+import scala.annotation.unused
/** Methods rendering figures as images and saving those images to disk */
object FigureRenderer {
- val CairoFormat = Set("ps", "eps", "pdf", "svg")
- val AllowedFormats = CairoFormat ++ ImageIO.getWriterFormatNames
+ val CairoFormat: Set[String] = Set("ps", "eps", "pdf", "svg")
+ val AllowedFormats: Set[String] = CairoFormat ++ ImageIO.getWriterFormatNames
/** Maximum pixels to expand rasterized figure when cleaning */
private val MaxExpand = 20
@@ -99,7 +100,7 @@ object FigureRenderer {
page.figures.map(_.captionBoundary)).map(_.scale(scale))
var figureRegions = page.figures.map(_.regionBoundary).map(_.scale(scale))
val renderer = new InterruptiblePDFRenderer(doc)
- val pageImg = renderer.renderImageWithDPI(page.pageNumber, dpi)
+ val pageImg = renderer.renderImageWithDPI(page.pageNumber, dpi.toFloat)
val rasterized = page.figures.zipWithIndex.map {
case (fig, figureNumber) =>
val otherFigureRegions =
@@ -140,7 +141,7 @@ object FigureRenderer {
def saveRasterizedFigures(
figuresAndFilenames: Seq[(String, RasterizedFigure)],
format: String,
- dpi: Int
+ @unused dpi: Int
): Seq[SavedFigure] = {
require(ImageIO.getWriterFormatNames.contains(format), s"Can't save to format $format")
figuresAndFilenames.map {
@@ -152,37 +153,58 @@ object FigureRenderer {
/** Save figures to disk in a vector graphic format by shelling out to pdftocairo */
def saveFiguresAsImagesCairo(
- doc: PDDocument,
- figuresAndFilenames: Seq[(String, Figure)],
- format: String,
- dpi: Int
- ): Iterable[SavedFigure] = {
+ doc: PDDocument,
+ figuresAndFilenames: Seq[(String, Figure)],
+ format: String,
+ dpi: Int
+ ): Iterable[SavedFigure] = {
require(CairoFormat.contains(format), s"Cairo can't render to format $format")
val groupedByPage = figuresAndFilenames.groupBy(_._2.page)
- groupedByPage.flatMap {
- case (pageNum, pageFigures) =>
- val pageDoc = new PDDocument() // Save some IO by just sending cairo the relevant page
- pageDoc.addPage(doc.getPage(pageNum))
- val savedFigures = pageFigures.map {
- case (filename, fig) =>
- if (Thread.interrupted()) throw new InterruptedException()
- val box = fig.regionBoundary
- val x = Math.round(box.x1) - PadUnexpandedImage
- val y = Math.round(box.y1) - PadUnexpandedImage
- val w = Math.round(box.width) + PadUnexpandedImage * 2
- val h = Math.round(box.height) + PadUnexpandedImage * 2
- val cmdStr = s"pdftocairo -$format -r $dpi " +
- s"-x $x -y $y -H $h -W $w -paperw $w -paperh $h - $filename"
- val cmd = Runtime.getRuntime.exec(cmdStr)
- val outStream = cmd.getOutputStream
- pageDoc.save(outStream) // Stream the doc to cairo
- if (cmd.waitFor() != 0) {
- throw new IOException("Error using cairo to save a figure")
- }
- SavedFigure(fig, filename, dpi)
+    groupedByPage.flatMap { case (pageNum, pageFigures) =>
+      val pageDoc = new PDDocument() // Save some IO by just sending cairo the relevant page
+      // try/finally: release the single-page document even when a figure on it fails
+      try {
+        pageDoc.addPage(doc.getPage(pageNum))
+        pageFigures.map { case (filename, fig) =>
+          if (Thread.interrupted()) throw new InterruptedException()
+          val box = fig.regionBoundary
+          val x = Math.round(box.x1) - PadUnexpandedImage
+          val y = Math.round(box.y1) - PadUnexpandedImage
+          val w = Math.round(box.width) + PadUnexpandedImage * 2
+          val h = Math.round(box.height) + PadUnexpandedImage * 2
+
+          // Explicit argument list (no shell string), so `filename` is passed as one argument
+          val processBuilder = new ProcessBuilder(
+            "pdftocairo",
+            s"-$format",
+            "-r", dpi.toString,
+            "-x", x.toString,
+            "-y", y.toString,
+            "-H", h.toString,
+            "-W", w.toString,
+            "-paperw", w.toString,
+            "-paperh", h.toString,
+            "-", // Read from stdin
+            filename
+          )
+
+          // Merge stderr into stdout so the single drain below sees every diagnostic
+          val process = processBuilder.redirectErrorStream(true).start()
+          val outStream = process.getOutputStream
+          // Stream the doc to cairo; close stdin even on failure so cairo sees EOF
+          try pageDoc.save(outStream) finally outStream.close()
+
+          // Drain cairo's output before waiting: an unread, full pipe would block it forever
+          val cairoOutput = scala.io.Source.fromInputStream(process.getInputStream)
+          val diagnostics = try cairoOutput.mkString finally cairoOutput.close()
+          if (process.waitFor() != 0) {
+            throw new IOException(s"Error using cairo to save figure: $diagnostics")
}
- pageDoc.close()
- savedFigures
+          SavedFigure(fig, filename, dpi)
+        }
+      } finally {
+        pageDoc.close()
+      }
}
}
diff --git a/src/main/scala/org/allenai/pdffigures2/FindGraphicsRaster.scala b/src/main/scala/org/allenai/pdffigures2/FindGraphicsRaster.scala
index 3b96672..7b1b631 100644
--- a/src/main/scala/org/allenai/pdffigures2/FindGraphicsRaster.scala
+++ b/src/main/scala/org/allenai/pdffigures2/FindGraphicsRaster.scala
@@ -1,10 +1,10 @@
package org.allenai.pdffigures2
import org.apache.pdfbox.pdmodel.PDDocument
-import org.apache.pdfbox.rendering.{ ImageType, PDFRenderer }
+import org.apache.pdfbox.rendering.ImageType
+import org.apache.pdfbox.rendering.PDFRenderer
import java.awt.image.BufferedImage
-
import scala.collection.mutable
/** Finds the bounding boxes of graphical elements in a PDF by rasterizing the PDF and
@@ -22,7 +22,7 @@ object FindGraphicsRaster {
def findCCBoundingBoxes(doc: PDDocument, page: Int, remove: Iterable[Box]): List[Box] = {
val renderer = new PDFRenderer(doc)
- val img = renderer.renderImageWithDPI(page, DPI, ImageType.GRAY)
+ val img = renderer.renderImageWithDPI(page, DPI.toFloat, ImageType.GRAY)
findCCBoundingBoxes(img, remove, Threshold, DPI / 72)
}
diff --git a/src/main/scala/org/allenai/pdffigures2/FormattingTextExtractor.scala b/src/main/scala/org/allenai/pdffigures2/FormattingTextExtractor.scala
index 996fe17..3390268 100644
--- a/src/main/scala/org/allenai/pdffigures2/FormattingTextExtractor.scala
+++ b/src/main/scala/org/allenai/pdffigures2/FormattingTextExtractor.scala
@@ -51,7 +51,7 @@ object FormattingTextExtractor extends Logging {
if (nonEmptyCandidates.size >= minConsistentHeaders) {
// Check for identical text
val groupedByText = nonEmptyCandidates.map(x => x.text).groupBy(x => x)
- val (mostCommonText, count) = groupedByText.mapValues(_.size).maxBy(_._2)
+ val (mostCommonText, count) = groupedByText.view.mapValues(_.size).maxBy(_._2)
if (count >= minConsistentHeaders) {
candidates.map {
case np: Some[Paragraph] if np.get.text == mostCommonText => np
@@ -221,7 +221,7 @@ object FormattingTextExtractor extends Logging {
.find(_._2.nonEmpty)
val abstractPageNum = if (documentAbstract.isDefined) Some(documentAbstract.get._1) else None
- val textWithoutHeaders = (textPages, headers, pageNumbers).zipped.map {
+ val textWithoutHeaders = textPages.lazyZip(headers).lazyZip(pageNumbers).map {
case (textPage, header, pageNumber) =>
if (abstractPageNum.isDefined && abstractPageNum.get > textPage.pageNumber) {
logger.debug(s"Marking page ${textPage.pageNumber} as a cover page")
diff --git a/src/main/scala/org/allenai/pdffigures2/GraphicBBDetector.scala b/src/main/scala/org/allenai/pdffigures2/GraphicBBDetector.scala
index a285d27..51b3073 100644
--- a/src/main/scala/org/allenai/pdffigures2/GraphicBBDetector.scala
+++ b/src/main/scala/org/allenai/pdffigures2/GraphicBBDetector.scala
@@ -3,12 +3,13 @@ package org.allenai.pdffigures2
import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine
import org.apache.pdfbox.contentstream.operator.Operator
import org.apache.pdfbox.contentstream.operator.OperatorProcessor
-import org.apache.pdfbox.cos.{ COSBase, COSName }
+import org.apache.pdfbox.cos.COSBase
+import org.apache.pdfbox.cos.COSName
import org.apache.pdfbox.pdmodel.PDPage
import org.apache.pdfbox.pdmodel.graphics.color.PDColor
-import org.apache.pdfbox.util.Matrix
import org.apache.pdfbox.pdmodel.graphics.image.PDImage
import org.apache.pdfbox.pdmodel.graphics.state.PDSoftMask
+import org.apache.pdfbox.util.Matrix
import java.awt.Rectangle
import java.awt.geom._
@@ -51,9 +52,9 @@ object GraphicBBDetector {
class GraphicBBDetector(page: PDPage, ignoreWhite: Boolean) extends PDFGraphicsStreamEngine(page) {
var clipWindingRule: Int = -1
var linePath: GeneralPath = new GeneralPath
- var bounds = List[Rectangle]()
+ var bounds: List[Rectangle] = List[Rectangle]()
- class NullOp(val name: String) extends OperatorProcessor {
+ class NullOp(val name: String) extends OperatorProcessor(this) {
override def process(operator: Operator, operands: java.util.List[COSBase]): Unit = {}
def getName: String = name
}
@@ -90,7 +91,7 @@ class GraphicBBDetector(page: PDPage, ignoreWhite: Boolean) extends PDFGraphicsS
super.processOperator(operator, operands)
}
- override def appendRectangle(p0: Point2D, p1: Point2D, p2: Point2D, p3: Point2D) {
+ override def appendRectangle(p0: Point2D, p1: Point2D, p2: Point2D, p3: Point2D): Unit = {
linePath.moveTo(p0.getX.toFloat, p0.getY.toFloat)
linePath.lineTo(p1.getX.toFloat, p1.getY.toFloat)
linePath.lineTo(p2.getX.toFloat, p2.getY.toFloat)
@@ -111,37 +112,37 @@ class GraphicBBDetector(page: PDPage, ignoreWhite: Boolean) extends PDFGraphicsS
}
}
- override def strokePath() {
+ override def strokePath(): Unit = {
addLinePath(true, false)
linePath.reset()
}
- override def fillPath(windingRule: Int) {
+ override def fillPath(windingRule: Int): Unit = {
linePath.setWindingRule(windingRule)
addLinePath(false, true)
linePath.reset()
}
- override def fillAndStrokePath(windingRule: Int) {
+ override def fillAndStrokePath(windingRule: Int): Unit = {
linePath.setWindingRule(windingRule)
addLinePath(true, true)
linePath.reset()
}
- override def clip(windingRule: Int) = clipWindingRule = windingRule
+ override def clip(windingRule: Int): Unit = clipWindingRule = windingRule
- override def moveTo(x: Float, y: Float) = linePath.moveTo(x, y)
+ override def moveTo(x: Float, y: Float): Unit = linePath.moveTo(x, y)
- override def lineTo(x: Float, y: Float) = linePath.lineTo(x, y)
+ override def lineTo(x: Float, y: Float): Unit = linePath.lineTo(x, y)
- override def curveTo(x1: Float, y1: Float, x2: Float, y2: Float, x3: Float, y3: Float) =
+ override def curveTo(x1: Float, y1: Float, x2: Float, y2: Float, x3: Float, y3: Float): Unit =
linePath.curveTo(x1, y1, x2, y2, x3, y3)
override def getCurrentPoint: Point2D = linePath.getCurrentPoint
- override def closePath() = linePath.closePath()
+ override def closePath(): Unit = linePath.closePath()
- override def endPath() {
+ override def endPath(): Unit = {
if (clipWindingRule != -1) {
linePath.setWindingRule(clipWindingRule)
getGraphicsState.intersectClippingPath(linePath)
@@ -150,7 +151,7 @@ class GraphicBBDetector(page: PDPage, ignoreWhite: Boolean) extends PDFGraphicsS
linePath.reset()
}
- override def drawImage(pdImage: PDImage) {
+ override def drawImage(pdImage: PDImage): Unit = {
val clipBounds = getGraphicsState.getCurrentClippingPath.getBounds
if (clipBounds.getHeight * clipBounds.getWidth > 0) {
val ctm: Matrix = getGraphicsState.getCurrentTransformationMatrix
@@ -177,7 +178,7 @@ class GraphicBBDetector(page: PDPage, ignoreWhite: Boolean) extends PDFGraphicsS
}
}
- override def shadingFill(shadingName: COSName) {
+ override def shadingFill(shadingName: COSName): Unit = {
val newBound = getGraphicsState.getCurrentClippingPath.getBounds
if (newBound.getWidth > 0 && newBound.getHeight > 0) {
bounds = newBound :: bounds
diff --git a/src/main/scala/org/allenai/pdffigures2/GraphicsExtractor.scala b/src/main/scala/org/allenai/pdffigures2/GraphicsExtractor.scala
index 4ace54a..a69c68b 100644
--- a/src/main/scala/org/allenai/pdffigures2/GraphicsExtractor.scala
+++ b/src/main/scala/org/allenai/pdffigures2/GraphicsExtractor.scala
@@ -1,7 +1,6 @@
package org.allenai.pdffigures2
import org.allenai.pdffigures2.FigureExtractor.OcredPdfException
-
import org.apache.pdfbox.pdmodel.PDDocument
object GraphicsExtractor extends Logging {
diff --git a/src/main/scala/org/allenai/pdffigures2/InterruptiblePDFRenderer.scala b/src/main/scala/org/allenai/pdffigures2/InterruptiblePDFRenderer.scala
index 20cf1c1..a84860e 100644
--- a/src/main/scala/org/allenai/pdffigures2/InterruptiblePDFRenderer.scala
+++ b/src/main/scala/org/allenai/pdffigures2/InterruptiblePDFRenderer.scala
@@ -1,9 +1,11 @@
package org.allenai.pdffigures2
+import org.apache.pdfbox.contentstream.operator.Operator
import org.apache.pdfbox.cos.COSBase
import org.apache.pdfbox.pdmodel.PDDocument
-import org.apache.pdfbox.rendering.{ PDFRenderer, PageDrawer, PageDrawerParameters }
-import org.apache.pdfbox.contentstream.operator.Operator
+import org.apache.pdfbox.rendering.PDFRenderer
+import org.apache.pdfbox.rendering.PageDrawer
+import org.apache.pdfbox.rendering.PageDrawerParameters
class InterruptiblePDFRenderer(doc: PDDocument) extends PDFRenderer(doc) {
diff --git a/src/main/scala/org/allenai/pdffigures2/JsonProtocol.scala b/src/main/scala/org/allenai/pdffigures2/JsonProtocol.scala
index 2f2be94..0762a62 100644
--- a/src/main/scala/org/allenai/pdffigures2/JsonProtocol.scala
+++ b/src/main/scala/org/allenai/pdffigures2/JsonProtocol.scala
@@ -1,34 +1,34 @@
package org.allenai.pdffigures2
-import org.allenai.pdffigures2.FigureExtractor.{ Document, DocumentWithSavedFigures }
-import org.allenai.pdffigures2.SectionedTextBuilder.{ DocumentSection, PdfText }
-
+import org.allenai.pdffigures2.FigureExtractor.Document
+import org.allenai.pdffigures2.FigureExtractor.DocumentWithSavedFigures
+import org.allenai.pdffigures2.SectionedTextBuilder.DocumentSection
+import org.allenai.pdffigures2.SectionedTextBuilder.PdfText
import spray.json._
-// From https://github.com/spray/spray-json/issues/200
-// to support enum -> json conversion
-class EnumJsonConverter[T <: scala.Enumeration](enu: T) extends RootJsonFormat[T#Value] {
- override def write(obj: T#Value): JsValue = JsString(obj.toString)
+class FigureTypeJsonConverter extends RootJsonFormat[FigureType.Value] {
+ override def write(obj: FigureType.Value): JsValue = JsString(obj.toString)
- override def read(json: JsValue): T#Value = {
+ override def read(json: JsValue): FigureType.Value = {
json match {
- case JsString(txt) => enu.withName(txt)
- case somethingElse => throw DeserializationException(s"Expected a value from enum $enu instead of $somethingElse")
+ case JsString(txt) => FigureType.withName(txt)
+ case somethingElse => throw DeserializationException(s"Expected a value from enum FigureType instead of $somethingElse")
}
}
}
+
trait JsonProtocol extends DefaultJsonProtocol {
// JSON formats so we can write Figures/Captions/Documents to disk
- implicit val enumConverter = new EnumJsonConverter(FigureType)
- implicit val boxFormat = jsonFormat4(Box.apply)
- implicit val captionFormat = jsonFormat5(Caption.apply)
- implicit val figureFormat = jsonFormat7(Figure.apply)
- implicit val savedFigureFormat = jsonFormat9(SavedFigure.apply)
- implicit val documentTextFormat = jsonFormat3(PdfText.apply)
- implicit val documentSectionFormat = jsonFormat2(DocumentSection.apply)
- implicit val documentFormat = jsonFormat3(Document.apply)
- implicit val documentWithFiguresFormat = jsonFormat3(DocumentWithSavedFigures.apply)
+ implicit val enumConverter: RootJsonFormat[FigureType.Value] = new FigureTypeJsonConverter()
+ implicit val boxFormat: RootJsonFormat[Box] = jsonFormat4(Box.apply)
+ implicit val captionFormat: RootJsonFormat[Caption] = jsonFormat5(Caption.apply)
+ implicit val figureFormat: RootJsonFormat[Figure] = jsonFormat7(Figure.apply)
+ implicit val savedFigureFormat: RootJsonFormat[SavedFigure] = jsonFormat9(SavedFigure.apply)
+ implicit val documentTextFormat: RootJsonFormat[PdfText] = jsonFormat3(PdfText.apply)
+ implicit val documentSectionFormat: RootJsonFormat[DocumentSection] = jsonFormat2(DocumentSection.apply)
+ implicit val documentFormat: RootJsonFormat[Document] = jsonFormat3(Document.apply)
+ implicit val documentWithFiguresFormat: RootJsonFormat[DocumentWithSavedFigures] = jsonFormat3(DocumentWithSavedFigures.apply)
}
object JsonProtocol extends JsonProtocol
diff --git a/src/main/scala/org/allenai/pdffigures2/Logging.scala b/src/main/scala/org/allenai/pdffigures2/Logging.scala
index 2fe008d..89dc046 100644
--- a/src/main/scala/org/allenai/pdffigures2/Logging.scala
+++ b/src/main/scala/org/allenai/pdffigures2/Logging.scala
@@ -7,7 +7,9 @@ import ch.qos.logback.classic.encoder.PatternLayoutEncoder
import ch.qos.logback.classic.html.HTMLLayout
import ch.qos.logback.classic.spi.ILoggingEvent
import ch.qos.logback.core._
-import ch.qos.logback.core.encoder.{ Encoder, LayoutWrappingEncoder }
+import ch.qos.logback.core.encoder.Encoder
+import ch.qos.logback.core.encoder.LayoutWrappingEncoder
+import org.slf4j.Logger
import org.slf4j.LoggerFactory
/** This trait is meant to be mixed into a class to provide logging and logging configuration.
@@ -17,7 +19,7 @@ import org.slf4j.LoggerFactory
* not constructed unless the message will be logged.
*/
trait Logging {
- val internalLogger = LoggerFactory.getLogger(this.getClass)
+ val internalLogger: Logger = LoggerFactory.getLogger(this.getClass)
object logger {
// scalastyle:ignore
diff --git a/src/main/scala/org/allenai/pdffigures2/PageStructure.scala b/src/main/scala/org/allenai/pdffigures2/PageStructure.scala
index 8c4ae2a..cbb129a 100644
--- a/src/main/scala/org/allenai/pdffigures2/PageStructure.scala
+++ b/src/main/scala/org/allenai/pdffigures2/PageStructure.scala
@@ -70,8 +70,8 @@ case class PageWithBodyText(
override def paragraphs: Seq[Paragraph] = (bodyText ++ otherText).sorted
def nonFigureText: Seq[Paragraph] = bodyText ++ captions.map(_.paragraph)
def nonFigureContent: Seq[Box] = nonFigureText.map(_.boundary) ++ nonFigureGraphics
- def possibleFigureContent = graphics ++ otherText.map(_.boundary)
- def allContent = possibleFigureContent ++ nonFigureContent
+ def possibleFigureContent: Seq[Box] = graphics ++ otherText.map(_.boundary)
+ def allContent: Seq[Box] = possibleFigureContent ++ nonFigureContent
}
case class PageWithFigures(
diff --git a/src/main/scala/org/allenai/pdffigures2/Paragraph.scala b/src/main/scala/org/allenai/pdffigures2/Paragraph.scala
index 7010f13..7cc55e0 100644
--- a/src/main/scala/org/allenai/pdffigures2/Paragraph.scala
+++ b/src/main/scala/org/allenai/pdffigures2/Paragraph.scala
@@ -3,6 +3,7 @@ package org.allenai.pdffigures2
import org.apache.pdfbox.text.TextPosition
import java.text.Normalizer
+import scala.util.matching.Regex
/** Span of text denoted by the starting and ending line number, inclusive */
case class TextSpan(start: Int, end: Int) extends Ordered[TextSpan] {
@@ -12,7 +13,7 @@ case class TextSpan(start: Int, end: Int) extends Ordered[TextSpan] {
}
object Paragraph {
- val unprintableRegex = """[\p{Cc}\p{Cf}\p{Co}\p{Cn}]""".r
+ val unprintableRegex: Regex = """[\p{Cc}\p{Cf}\p{Co}\p{Cn}]""".r
def apply(lines: List[Line]): Paragraph = Paragraph(lines, Box.container(lines.map(_.boundary)))
diff --git a/src/main/scala/org/allenai/pdffigures2/SectionTitleExtractor.scala b/src/main/scala/org/allenai/pdffigures2/SectionTitleExtractor.scala
index a253c87..4aba88f 100644
--- a/src/main/scala/org/allenai/pdffigures2/SectionTitleExtractor.scala
+++ b/src/main/scala/org/allenai/pdffigures2/SectionTitleExtractor.scala
@@ -181,7 +181,7 @@ object SectionTitleExtractor extends Logging {
def build(line: Line): SectionTitle = {
val fountCounts = line.words.flatMap(w => w.positions.map(_.getFont)).groupBy(identity)
- val mostCommonFont = fountCounts.mapValues(_.size).maxBy(_._2)._1
+ val mostCommonFont = fountCounts.view.mapValues(_.size).maxBy(_._2)._1
val fontSizes = line.words.flatMap(w => w.positions.map(_.getFontSizeInPt))
val medianFontSize = fontSizes.sorted.drop(fontSizes.size / 2).head
SectionTitle(List(line), line.boundary, isPrefixed(line), mostCommonFont, medianFontSize)
@@ -235,7 +235,7 @@ object SectionTitleExtractor extends Logging {
): Seq[PageWithClassifiedText] = {
val (strippedTextPages, sectionHeaders) =
stripSectionTitlesFromSortedParagraphs(pages.map(_.paragraphs), layout).unzip
- (pages, strippedTextPages, sectionHeaders).zipped.map {
+ pages.lazyZip(strippedTextPages).lazyZip(sectionHeaders).map {
case (page, strippedText, pageSectionHeaders) =>
PageWithClassifiedText(
page.pageNumber,
diff --git a/src/main/scala/org/allenai/pdffigures2/SectionedTextBuilder.scala b/src/main/scala/org/allenai/pdffigures2/SectionedTextBuilder.scala
index 67c579f..8eff012 100644
--- a/src/main/scala/org/allenai/pdffigures2/SectionedTextBuilder.scala
+++ b/src/main/scala/org/allenai/pdffigures2/SectionedTextBuilder.scala
@@ -40,9 +40,9 @@ object SectionedTextBuilder {
* @param paragraphs section text broken up into paragraphs
*/
case class DocumentSection(title: Option[PdfText], paragraphs: Seq[PdfText]) {
- def titleText = title.map(_.text)
- def paragraphsText = paragraphs.map(_.text)
- def bodyText = paragraphsText.mkString("\n")
+ def titleText: Option[String] = title.map(_.text)
+ def paragraphsText: Seq[String] = paragraphs.map(_.text)
+ def bodyText: String = paragraphsText.mkString("\n")
}
@tailrec
diff --git a/src/main/scala/org/allenai/pdffigures2/TextExtractor.scala b/src/main/scala/org/allenai/pdffigures2/TextExtractor.scala
index 75fd76d..3872b77 100644
--- a/src/main/scala/org/allenai/pdffigures2/TextExtractor.scala
+++ b/src/main/scala/org/allenai/pdffigures2/TextExtractor.scala
@@ -1,12 +1,14 @@
package org.allenai.pdffigures2
import org.apache.pdfbox.cos.COSBase
-import org.apache.pdfbox.pdmodel.common.PDRectangle
-import org.apache.pdfbox.pdmodel.{ PDDocument, PDPage }
-import org.apache.pdfbox.text.{ PDFTextStripper, TextPosition }
+import org.apache.pdfbox.pdmodel.PDDocument
+import org.apache.pdfbox.pdmodel.PDPage
+import org.apache.pdfbox.text.PDFTextStripper
+import org.apache.pdfbox.text.TextPosition
import java.io.Writer
-import scala.collection.{ immutable, mutable }
+import scala.collection.immutable
+import scala.collection.mutable
object TextExtractor {
@@ -140,7 +142,7 @@ private class TextExtractor extends PDFTextStripper with Logging {
// PDFBox can occasionally wildly overestimate the height of text, so if things look really
// wrong we clip the text to a sensible amount
val height = if (pos.getHeight > TextExtractor.MinHeightToClipText) {
- TextExtractor.HeightToCLipTextTo
+ TextExtractor.HeightToCLipTextTo.toFloat
} else {
pos.getHeight
}
diff --git a/src/main/scala/org/allenai/pdffigures2/VisualLogger.scala b/src/main/scala/org/allenai/pdffigures2/VisualLogger.scala
index d742cb6..730ea19 100644
--- a/src/main/scala/org/allenai/pdffigures2/VisualLogger.scala
+++ b/src/main/scala/org/allenai/pdffigures2/VisualLogger.scala
@@ -3,12 +3,12 @@ package org.allenai.pdffigures2
import org.allenai.pdffigures2.SectionedTextBuilder.DocumentSection
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.rendering.PDFRenderer
-import java.awt.event.{ ActionEvent, KeyEvent }
+
import java.awt._
+import java.awt.event.ActionEvent
+import java.awt.event.KeyEvent
import java.awt.image.BufferedImage
-import javax.imageio.ImageIO
-import java.io.File
-import javax.swing._
+import javax.swing.{Box => _, _}
case class Annotations(
boxes: Seq[Box],
@@ -56,7 +56,7 @@ class VisualLogger(
val SectionsKey = "Sections"
// Ordered in how they will be shown to the user
- val ReservedKeys = Seq(
+ val ReservedKeys: Seq[String] = Seq(
GraphicsClusterKey,
TextAndGraphicsExtractionKey,
CaptionLocationKey,
@@ -89,7 +89,7 @@ class VisualLogger(
if (pagesToShow.nonEmpty) {
val renderer = new PDFRenderer(doc)
val visualizationPerPage = pagesToShow.map { pageNum =>
- val pageImg = renderer.renderImageWithDPI(pageNum, dpi)
+ val pageImg = renderer.renderImageWithDPI(pageNum, dpi.toFloat)
val imagesToShow = keysToShow.map { key =>
val annotations = logs(key).getOrElse(pageNum, Seq())
val img = cloneImage(pageImg)
@@ -99,7 +99,7 @@ class VisualLogger(
val dash = if (annotation.dashed) Array[Float](2) else null
g.setStroke(
new BasicStroke(
- annotation.thickness,
+ annotation.thickness.toFloat,
BasicStroke.CAP_BUTT,
BasicStroke.JOIN_BEVEL,
0.0f,
@@ -177,13 +177,13 @@ class VisualLogger(
// So our frame can be closed by hot key, on OSX this means at least Cmd-W works
val closeKey = KeyStroke.getKeyStroke(
KeyEvent.VK_W,
- Toolkit.getDefaultToolkit.getMenuShortcutKeyMask
+ Toolkit.getDefaultToolkit.getMenuShortcutKeyMaskEx
)
panel.getInputMap.put(closeKey, "closeWindow")
panel.getActionMap.put(
"closeWindow",
new AbstractAction("Close Window") {
- override def actionPerformed(e: ActionEvent) {
+ override def actionPerformed(e: ActionEvent): Unit = {
frame.setVisible(false)
frame.dispose()
}
diff --git a/src/test/scala/org/allenai/pdffigures2/TestExtractionFilters.scala b/src/test/scala/org/allenai/pdffigures2/TestExtractionFilters.scala
index d79da7d..fce800d 100644
--- a/src/test/scala/org/allenai/pdffigures2/TestExtractionFilters.scala
+++ b/src/test/scala/org/allenai/pdffigures2/TestExtractionFilters.scala
@@ -1,6 +1,7 @@
package org.allenai.pdffigures2
-import org.apache.pdfbox.pdmodel.PDDocument
+import org.apache.pdfbox.Loader
+import org.apache.pdfbox.io.RandomAccessReadBuffer
import org.scalatest.funsuite.AnyFunSuite
/** These tests verify that figure extraction filters are successfully catching and removing bad
@@ -26,9 +27,11 @@ class TestExtractionFilters extends AnyFunSuite {
* These extractions should be filtered out for being too close to the page boundary.
*/
test("Page boundary filter should filter out bad extractions") {
- val pdf = PDDocument.load(
- getClass.getClassLoader.getResourceAsStream(
- "test-pdfs/f63cb20759fab2514802c3ef2a743c76bf9dc9f1.pdf"
+ val pdf = Loader.loadPDF(
+ new RandomAccessReadBuffer(
+ getClass.getClassLoader.getResourceAsStream(
+ "test-pdfs/f63cb20759fab2514802c3ef2a743c76bf9dc9f1.pdf"
+ )
)
)
val figures = extractor.getFigures(pdf)
@@ -41,9 +44,11 @@ class TestExtractionFilters extends AnyFunSuite {
* This extraction should be filtered out for splitting a figure.
*/
test("Graphics split filter should filter out bad extractions") {
- val pdf = PDDocument.load(
- getClass.getClassLoader.getResourceAsStream(
- "test-pdfs/3a9202f9f176d3377516e3da0866cc19148c033b.pdf"
+ val pdf = Loader.loadPDF(
+ new RandomAccessReadBuffer(
+ getClass.getClassLoader.getResourceAsStream(
+ "test-pdfs/3a9202f9f176d3377516e3da0866cc19148c033b.pdf"
+ )
)
)
val figures = extractor.getFigures(pdf, pages = Some(Seq(6)))
@@ -54,24 +59,26 @@ class TestExtractionFilters extends AnyFunSuite {
* This ensures that when figures are empty, it's not because figure extraction is broken.
*/
test("Figures should all be extracted") {
- val pdf = PDDocument.load(
- getClass.getClassLoader.getResourceAsStream(
- "test-pdfs/498bb0efad6ec15dd09d941fb309aa18d6df9f5f.pdf"
+ val pdf = Loader.loadPDF(
+ new RandomAccessReadBuffer(
+ getClass.getClassLoader.getResourceAsStream(
+ "test-pdfs/498bb0efad6ec15dd09d941fb309aa18d6df9f5f.pdf"
+ )
)
)
val figures = extractor.getFigures(pdf).toList
assert(figures.length === 2)
- assert(figures(1).figType === FigureType.Table)
- assert(figures(1).name === "1")
- assert(figures(1).page === 4)
- assert(
- figures(1).caption === "Table 1: Over a set of ten relations, TEXTRUNNER achieved a 33% lower error rate than KNOWITALL, while finding approximately as many correct extractions."
- )
- assert(figures(0).figType === FigureType.Figure)
+ assert(figures(0).figType === FigureType.Table)
assert(figures(0).name === "1")
assert(figures(0).page === 4)
assert(
- figures(0).caption === "Figure 1: Overview of the tuples extracted from 9 million Web page corpus. 7.8 million well-formed tuples are found having probability ≥ 0.8. Of those, TEXTRUNNER finds 1 million concrete tuples with arguments grounded in particular real-world entities, 88.1% of which are correct, and 6.8 million tuples reflecting abstract assertions, 79.2% of which are correct."
+ figures(0).caption === "Table 1: Over a set of ten relations, TEXTRUNNER achieved a 33% lower error rate than KNOWITALL, while finding approximately as many correct extractions."
+ )
+ assert(figures(1).figType === FigureType.Figure)
+ assert(figures(1).name === "1")
+ assert(figures(1).page === 4)
+ assert(
+ figures(1).caption === "Figure 1: Overview of the tuples extracted from 9 million Web page corpus. 7.8 million well-formed tuples are found having probability ≥ 0.8. Of those, TEXTRUNNER finds 1 million concrete tuples with arguments grounded in particular real-world entities, 88.1% of which are correct, and 6.8 million tuples reflecting abstract assertions, 79.2% of which are correct."
)
}
}