diff --git a/.gitignore b/.gitignore index b8c7cbb..60a07e8 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,8 @@ evaluation/datasets/**/*.jpeg evaluation/datasets/**/*.jpg evaluation/datasets/**/*.pgm -*.pkl \ No newline at end of file +*.pkl + +/.bsp + +*.jar diff --git a/.scalafix.conf b/.scalafix.conf new file mode 100644 index 0000000..853bc21 --- /dev/null +++ b/.scalafix.conf @@ -0,0 +1,12 @@ +rules = [ + ExplicitResultTypes, # Inserts type annotations for inferred public members. + NoAutoTupling, # Inserts explicit tuples for adapted argument lists for compatibility with -Yno-adapted-args. + OrganizeImports, # Organizes import statements. + RemoveUnused, # Removes unused imports and terms reported by the compiler under -Wunused. + DisableSyntax, # Reports an error for disabled features such as var or XML literals. + LeakingImplicitClassVal, # Adds 'private' to val parameters of implicit value classes. + NoValInForComprehension, # Removes deprecated val inside for-comprehension binders. + ProcedureSyntax, # Replaces deprecated Scala 2.x procedure syntax with explicit ': Unit ='. + RedundantSyntax # Removes redundant syntax such as `final` modifiers on an object. 
+] + diff --git a/build.sbt b/build.sbt index 222fecc..bdb65f1 100644 --- a/build.sbt +++ b/build.sbt @@ -1,14 +1,19 @@ -lazy val scala211 = "2.11.12" -lazy val scala212 = "2.12.16" -lazy val scala213 = "2.13.8" // Not supported yet (collections changes required in common) -lazy val supportedScalaVersions = List(scala212, scala211) +lazy val scala212 = "2.12.20" // Not supported: sources now use 2.13+ collection APIs (lazyZip, parallel CollectionConverters) +lazy val scala213 = "2.13.15" +lazy val scala35 = "3.5.2" +lazy val supportedScalaVersions = List(scala213, scala35) Global / onChangedBuildSource := ReloadOnSourceChanges ThisBuild / organization := "org.allenai" ThisBuild / description := "Scala library to extract figures, tables, and captions from scholarly documents" -ThisBuild / scalaVersion := scala212 +ThisBuild / scalaVersion := scala35 ThisBuild / version := "0.1.0" +ThisBuild / semanticdbEnabled := true + +scalacOptions ++= Seq( + if (scalaBinaryVersion.value == "3") "-Wunused:all" else "-Wunused", // `all` is the Scala 3 spelling; bare -Wunused enables every check on 2.13 +) lazy val projectSettings = Seq( name := "pdffigures2", @@ -25,13 +30,14 @@ lazy val projectSettings = Seq( bintrayOrganization := Some("allenai"), bintrayRepository := "maven", libraryDependencies ++= Seq( + "org.scala-lang.modules" %% "scala-parallel-collections" % "1.0.4", "io.spray" %% "spray-json" % "1.3.6", "com.github.scopt" %% "scopt" % "4.1.0", - "ch.qos.logback" % "logback-classic" % "1.2.11", - "org.slf4j" % "jcl-over-slf4j" % "1.7.36", - "org.apache.pdfbox" % "pdfbox" % "2.0.26", - "org.apache.pdfbox" % "fontbox" % "2.0.26", - "com.typesafe" % "config" % "1.4.2", + "ch.qos.logback" % "logback-classic" % "1.5.6", + "org.slf4j" % "jcl-over-slf4j" % "2.0.13", + "org.apache.pdfbox" % "pdfbox" % "3.0.2", + "org.apache.pdfbox" % "fontbox" % "3.0.2", + "com.typesafe" % "config" % "1.4.3", // So PDFBox can parse more image formats // These are disabled by default, because they are not licensed flexibly enough.
@@ -40,11 +46,11 @@ lazy val projectSettings = Seq( // "com.levigo.jbig2" % "levigo-jbig2-imageio" % "2.0", // For handling jbig2 images // So PDFBox can parse security enabled but still readable PDFs - "org.bouncycastle" % "bcprov-jdk18on" % "1.71", - "org.bouncycastle" % "bcmail-jdk18on" % "1.71", - "org.bouncycastle" % "bcpkix-jdk18on" % "1.71", + "org.bouncycastle" % "bcprov-jdk18on" % "1.78.1", + "org.bouncycastle" % "bcmail-jdk18on" % "1.78.1", + "org.bouncycastle" % "bcpkix-jdk18on" % "1.78.1", - "org.scalatest" %% "scalatest" % "3.2.13" % Test + "org.scalatest" %% "scalatest" % "3.2.19" % Test ), pomExtra := @@ -61,11 +67,13 @@ lazy val root = (project in file(".")) .settings(projectSettings) Compile / run / mainClass := Some("org.allenai.pdffigures2.FigureExtractorBatchCli") +Compile / mainClass := Some("org.allenai.pdffigures2.FigureExtractorBatchCli") assembly / mainClass := Some("org.allenai.pdffigures2.FigureExtractorBatchCli") assembly / assemblyOutputPath := file("pdffigures2.jar") assembly / assemblyMergeStrategy := { case x if x.endsWith("module-info.class") => MergeStrategy.discard + case PathList("META-INF", "versions", "9", "OSGI-INF", "MANIFEST.MF") => MergeStrategy.first case PathList("org", "apache", "commons", xs @ _*) => MergeStrategy.first case x => val oldStrategy = (assembly / assemblyMergeStrategy).value diff --git a/project/build.properties b/project/build.properties index 22af262..db1723b 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1 +1 @@ -sbt.version=1.7.1 +sbt.version=1.10.5 diff --git a/project/plugins.sbt b/project/plugins.sbt index dbe12f8..f97c72e 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -3,3 +3,6 @@ addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "1.2.0") addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.6") addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.6.1") + +addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.13.0") + diff --git 
a/src/main/scala/org/allenai/pdffigures2/CaptionDetector.scala b/src/main/scala/org/allenai/pdffigures2/CaptionDetector.scala index 28c29f9..0f71aa1 100644 --- a/src/main/scala/org/allenai/pdffigures2/CaptionDetector.scala +++ b/src/main/scala/org/allenai/pdffigures2/CaptionDetector.scala @@ -1,6 +1,7 @@ package org.allenai.pdffigures2 import org.apache.pdfbox.pdmodel.font.PDFont + import FigureType._ case class CaptionStart( @@ -14,12 +15,12 @@ case class CaptionStart( paragraphStart: Boolean, lineEnd: Boolean ) { - val figId = (figType, name) - val colonMatch = numberSyntax == ":" - val periodMatch = numberSyntax == "." - val allCapsFig = header.startsWith("FIG") - val allCapsTable = header == "TABLE" - val figAbbreviated = header == "Fig." + val figId: (FigureType.FigureType, String) = (figType, name) + val colonMatch: Boolean = numberSyntax == ":" + val periodMatch: Boolean = numberSyntax == "." + val allCapsFig: Boolean = header.startsWith("FIG") + val allCapsTable: Boolean = header == "TABLE" + val figAbbreviated: Boolean = header == "Fig." 
} object CaptionDetector extends Logging { @@ -54,7 +55,7 @@ object CaptionDetector extends Logging { standardFont: PDFont, types: Set[FigureType] ) extends CandidateFilter { - val name = s"Non Standard Font: ${types.toList}" + val name: String = s"Non Standard Font: ${types.toList}" def accept(cc: CaptionStart): Boolean = !types.contains(cc.figType) || cc.line.words.head.positions.head.getFont != standardFont @@ -91,7 +92,7 @@ object CaptionDetector extends Logging { } private case class LeftAlignedOnly(figureOnly: Boolean) extends CandidateFilter { - val name = "Left Aligned" + (if (figureOnly) " Figures" else "") + val name: String = "Left Aligned" + (if (figureOnly) " Figures" else "") def accept(cc: CaptionStart): Boolean = { figureOnly && cc.figType == FigureType.Table || (if (cc.nextLine.isDefined) { Math.abs( @@ -255,7 +256,7 @@ object CaptionDetector extends Logging { } if (!removedAny) { logger.debug( - s"Filtered for paragraph starts, " + + "Filtered for paragraph starts, " + s"${groupedById.values.map(_.size).sum} remaining" ) } diff --git a/src/main/scala/org/allenai/pdffigures2/DocumentLayout.scala b/src/main/scala/org/allenai/pdffigures2/DocumentLayout.scala index 29eb2d5..afca258 100644 --- a/src/main/scala/org/allenai/pdffigures2/DocumentLayout.scala +++ b/src/main/scala/org/allenai/pdffigures2/DocumentLayout.scala @@ -2,7 +2,8 @@ package org.allenai.pdffigures2 import org.apache.pdfbox.pdmodel.font.PDFont -import scala.collection.{ immutable, mutable } +import scala.collection.immutable +import scala.collection.mutable /** Store some statistics about the document as a whole */ case class DocumentLayout( diff --git a/src/main/scala/org/allenai/pdffigures2/FigureDetector.scala b/src/main/scala/org/allenai/pdffigures2/FigureDetector.scala index c41292a..e99f191 100644 --- a/src/main/scala/org/allenai/pdffigures2/FigureDetector.scala +++ b/src/main/scala/org/allenai/pdffigures2/FigureDetector.scala @@ -1,5 +1,7 @@ package org.allenai.pdffigures2 
+import scala.annotation.unused + object FigureDetector { private val MinProposalHeight = 15 @@ -232,7 +234,7 @@ object FigureDetector { */ // TODO it would be nice to be able to do this for downwards proposals as well private def clipUpwardRegion( - caption: Box, + @unused caption: Box, region: Box, graphics: Seq[Box], otherText: Seq[Paragraph] @@ -277,7 +279,7 @@ object FigureDetector { private def scoreProposal( proposal: Proposal, graphics: Seq[Box], - otherText: Seq[Box], + @unused otherText: Seq[Box], otherProposals: Seq[Proposal], bounds: Box ): Option[Double] = { @@ -536,7 +538,7 @@ object FigureDetector { def locatedFigures( page: PageWithBodyText, layout: DocumentLayout, - log: Option[VisualLogger] + @unused log: Option[VisualLogger] ): PageWithFigures = { val proposals = buildProposals(page, layout) val proposalsWithCaptions = page.captions.zip(proposals) @@ -560,7 +562,7 @@ object FigureDetector { } else { val bestConfiguration = cartesianProduct(validProposals.toList).view.zipWithIndex .map { - case (proposalsToUse, index) => + case (proposalsToUse, _) => var props = splitProposals(proposalsToUse, allContent).toList var scored = List[Proposal]() var scores = List[Option[Double]]() diff --git a/src/main/scala/org/allenai/pdffigures2/FigureExtractor.scala b/src/main/scala/org/allenai/pdffigures2/FigureExtractor.scala index 1c20f75..e8cead7 100644 --- a/src/main/scala/org/allenai/pdffigures2/FigureExtractor.scala +++ b/src/main/scala/org/allenai/pdffigures2/FigureExtractor.scala @@ -1,13 +1,12 @@ package org.allenai.pdffigures2 -import org.allenai.pdffigures2.FigureExtractor.{ - Document, - DocumentContent, - DocumentWithRasterizedFigures -} -import org.allenai.pdffigures2.SectionedTextBuilder.{ DocumentSection, PdfText } - -import com.typesafe.config.ConfigFactory +import org.allenai.pdffigures2.FigureExtractor.Document +import org.allenai.pdffigures2.FigureExtractor.DocumentContent +import 
org.allenai.pdffigures2.FigureExtractor.DocumentWithRasterizedFigures +import org.allenai.pdffigures2.SectionedTextBuilder.DocumentSection +import org.allenai.pdffigures2.SectionedTextBuilder.PdfText +import org.apache.pdfbox.Loader +import org.apache.pdfbox.io.RandomAccessReadBuffer import org.apache.pdfbox.pdmodel.PDDocument import java.io.InputStream @@ -209,9 +208,9 @@ object FigureExtractor { pagesWithFigures: Seq[PageWithFigures], pagesWithoutFigures: Seq[PageWithClassifiedText] ) { - val pages = (pagesWithFigures ++ pagesWithoutFigures).sortBy(_.pageNumber) - def figures = pagesWithFigures.flatMap(_.figures) - def failedCaptions = pagesWithFigures.flatMap(_.failedCaptions) + val pages: Seq[ClassifiedPage] = (pagesWithFigures ++ pagesWithoutFigures).sortBy(_.pageNumber) + def figures: Seq[Figure] = pagesWithFigures.flatMap(_.figures) + def failedCaptions: Seq[Caption] = pagesWithFigures.flatMap(_.failedCaptions) require(pages.head.pageNumber == 0, "Must start with page number 0") require( pages @@ -239,9 +238,9 @@ object FigureExtractor { private val figureExtractor = new FigureExtractor(true, true, true, true, true) def fromInputStream(is: InputStream): Document = - fromPDDocument(PDDocument.load(is)) + fromPDDocument(Loader.loadPDF(new RandomAccessReadBuffer(is))) - def fromPDDocument(pdDocument: PDDocument) = + def fromPDDocument(pdDocument: PDDocument): Document = figureExtractor.getFiguresWithText(pdDocument) } diff --git a/src/main/scala/org/allenai/pdffigures2/FigureExtractorBatchCli.scala b/src/main/scala/org/allenai/pdffigures2/FigureExtractorBatchCli.scala index 2e36b99..a86e453 100644 --- a/src/main/scala/org/allenai/pdffigures2/FigureExtractorBatchCli.scala +++ b/src/main/scala/org/allenai/pdffigures2/FigureExtractorBatchCli.scala @@ -1,14 +1,20 @@ package org.allenai.pdffigures2 -import java.io.File -import java.util.concurrent.atomic.AtomicInteger - -import ch.qos.logback.classic.{ Level, Logger } +import ch.qos.logback.classic.Level +import 
ch.qos.logback.classic.Logger import org.allenai.pdffigures2.FigureExtractor.DocumentWithSavedFigures import org.allenai.pdffigures2.JsonProtocol._ +import org.apache.pdfbox.Loader import org.apache.pdfbox.pdmodel.PDDocument import org.slf4j.LoggerFactory +import scopt.OptionParser +import spray.json.RootJsonFormat +import java.io.File +import java.util.concurrent.ForkJoinPool +import java.util.concurrent.atomic.AtomicInteger +import scala.annotation.unused +import scala.collection.parallel.CollectionConverters._ import scala.collection.parallel.ForkJoinTaskSupport /** CLI tools to parse a batch of PDFs, and then save the figures, table, captions @@ -23,8 +29,8 @@ object FigureExtractorBatchCli extends Logging { timeInMillis: Long ) case class ProcessingError(filename: String, msg: Option[String], className: String) - implicit val processingStatisticsFormat = jsonFormat4(ProcessingStatistics.apply) - implicit val processingErrorFormat = jsonFormat3(ProcessingError.apply) + implicit val processingStatisticsFormat: RootJsonFormat[ProcessingStatistics] = jsonFormat4(ProcessingStatistics.apply) + implicit val processingErrorFormat: RootJsonFormat[ProcessingError] = jsonFormat3(ProcessingError.apply) case class CliConfigBatch( inputFiles: Seq[File] = Seq(), @@ -40,9 +46,9 @@ object FigureExtractorBatchCli extends Logging { figureFormat: String = "png" ) - val Parser = new scopt.OptionParser[CliConfigBatch]("figure-extractor-batch") { - head("figure-extractor-batch") - arg[Seq[String]]("") required () action { (i, c) => + val Parser: OptionParser[CliConfigBatch] = new scopt.OptionParser[CliConfigBatch]("figure-extractor-batch") { + this.head("figure-extractor-batch") + this.arg[Seq[String]]("").required().action({ (i, c) => val inputFiles = if (i.size == 1) { val file = new File(i.head) @@ -55,50 +61,59 @@ object FigureExtractorBatchCli extends Logging { i.map(f => new File(f)).toList } c.copy(inputFiles = inputFiles) - } text "input PDF(s) or directory containing PDFs" 
- opt[Int]('i', "dpi") action { (dpi, c) => + }).text("input PDF(s) or directory containing PDFs") + + this.opt[Int]('i', "dpi").action({ (dpi, c) => c.copy(dpi = dpi) - } text - "DPI to save the figures in (default 150)" validate { dpi => + }).text("DPI to save the figures in (default 150)").validate({ dpi => if (dpi > 0) success else failure("DPI must > 0") - } - opt[String]('s', "save-stats") action { (s, c) => + }) + + this.opt[String]('s', "save-stats").action({ (s, c) => c.copy(saveStats = Some(s)) - } validate { s => + }).validate({ s => val f = new File(s) - if (!f.exists() || f.canWrite && !f.isDirectory) { + if (!f.exists() || (f.canWrite && !f.isDirectory)) { success } else { failure(s"Can't write to file $s") } - } text "Save the errors and timing information to the given file in JSON fromat" - opt[Int]('t', "threads") action { (t, c) => + }).text("Save the errors and timing information to the given file in JSON format") + + this.opt[Int]('t', "threads").action({ (t, c) => c.copy(threads = t) - } validate { t => + }).validate({ t => if (t >= 0) success else failure("Threads must be >= 0") - } text "Number of threads to use, 0 means using Scala's default" - opt[Unit]('e', "ignore-error") action { (_, c) => + }).text("Number of threads to use, 0 means using Scala's default") + + this.opt[Unit]('e', "ignore-error").action({ (_, c) => c.copy(ignoreErrors = true) - } text "Don't stop on errors, errors will be logged and also saved in `save-stats` if set" - opt[Unit]('q', "quiet") action { (_, c) => + }).text("Don't stop on errors, errors will be logged and also saved in `save-stats` if set") + + this.opt[Unit]('q', "quiet").action({ (_, c) => c.copy(debugLogging = false) - } text "Switches logging to INFO level" - opt[String]('d', "figure-data-prefix") action { (o, c) => + }).text("Switches logging to INFO level") + + this.opt[String]('d', "figure-data-prefix").action({ (o, c) => c.copy(figureDataPrefix = Some(o)) - } text "Save JSON figure data to '.json'"
- opt[Unit]('c', "save-regionless-captions") action { (_, c) => + }).text("Save JSON figure data to '.json'") + + this.opt[Unit]('c', "save-regionless-captions").action({ (_, c) => c.copy(saveRegionlessCaptions = true) - } text "Include captions for which no figure regions were found in the JSON data" - opt[String]('g', "full-text-prefix") action { (f, c) => + }).text("Include captions for which no figure regions were found in the JSON data") + + this.opt[String]('g', "full-text-prefix").action({ (f, c) => c.copy(fullTextPrefix = Some(f)) - } text "Save the document and figures into '.json" - opt[String]('m', "figure-prefix") action { (f, c) => + }).text("Save the document and figures into '.json") + + this.opt[String]('m', "figure-prefix").action({ (f, c) => c.copy(figureImagePrefix = Some(f)) - } text "Save figures as --.png. `id` " + - "will be 1 unless multiple figures are found with the same `Name` in `input_filename`" - opt[String]('f', "figure-format") action { (f, c) => + }).text("Save figures as --.png. 
`id` " + + "will be 1 unless multiple figures are found with the same `Name` in `input_filename`") + + this.opt[String]('f', "figure-format").action({ (f, c) => c.copy(figureFormat = f) - } text "Format to save figures (default png)" validate { x => + }).text("Format to save figures (default png)").validate({ x => if (FigureRenderer.AllowedFormats.contains(x)) { success } else { @@ -107,20 +122,21 @@ object FigureExtractorBatchCli extends Logging { s"formats: ${FigureRenderer.AllowedFormats.mkString(",")}" ) } - } - checkConfig { c => + }) + + this.checkConfig({ c => val badFiles = c.inputFiles.find(f => !f.exists() || f.isDirectory || !f.getName.endsWith(".pdf")) if (badFiles.isDefined) { failure(s"Input file ${badFiles.get.getName} is not a PDF file") } else if (c.saveRegionlessCaptions && c.fullTextPrefix.isDefined) { - failure(s"Can't set both save-regionless-captions and full-text") + failure("Can't set both save-regionless-captions and full-text") } else if (c.fullTextPrefix.isDefined && c.figureDataPrefix.isDefined) { - failure(s"Can't set both full-text and figure-data-prefix") + failure("Can't set both full-text and figure-data-prefix") } else { success } - } + }) } def getFilenames( @@ -145,7 +161,7 @@ object FigureExtractorBatchCli extends Logging { format: String, dpi: Int, figures: Seq[RasterizedFigure], - doc: PDDocument + @unused doc: PDDocument ): Seq[SavedFigure] = { val filenames = getFilenames(prefix, docName, format, figures.map(_.figure)) FigureRenderer.saveRasterizedFigures(filenames.zip(figures), format, dpi) @@ -159,7 +175,7 @@ object FigureExtractorBatchCli extends Logging { var doc: PDDocument = null val figureExtractor = FigureExtractor() try { - doc = PDDocument.load(inputFile) + doc = Loader.loadPDF(inputFile) val useCairo = FigureRenderer.CairoFormat.contains(config.figureFormat) val inputName = inputFile.getName val truncatedName = inputName.substring(0, inputName.lastIndexOf('.')) @@ -306,7 +322,7 @@ object FigureExtractorBatchCli 
extends Logging { val parFiles = config.inputFiles.par if (config.threads != 0) { parFiles.tasksupport = new ForkJoinTaskSupport( - new scala.concurrent.forkjoin.ForkJoinPool(config.threads) + new ForkJoinPool(config.threads) ) } val onPdf = new AtomicInteger(0) @@ -332,7 +348,7 @@ object FigureExtractorBatchCli extends Logging { case _ => None } if (errors.isEmpty) { - logger.info(s"No errors") + logger.info("No errors") } else { val errorString = errors .map { diff --git a/src/main/scala/org/allenai/pdffigures2/FigureExtractorVisualizationCli.scala b/src/main/scala/org/allenai/pdffigures2/FigureExtractorVisualizationCli.scala index 45a9885..4c373a8 100644 --- a/src/main/scala/org/allenai/pdffigures2/FigureExtractorVisualizationCli.scala +++ b/src/main/scala/org/allenai/pdffigures2/FigureExtractorVisualizationCli.scala @@ -1,8 +1,6 @@ package org.allenai.pdffigures2 - -import org.apache.pdfbox.io.MemoryUsageSetting - -import org.apache.pdfbox.pdmodel.PDDocument +import org.apache.pdfbox.Loader +import scopt.OptionParser import java.io.File @@ -22,50 +20,60 @@ object FigureExtractorVisualizationCli extends Logging { showCleanedFigureRegions: Boolean = false ) - val Parser = new scopt.OptionParser[CliConfig]("figure-extractor-visualize") { - head("figure-extractor-visualize") - arg[String]("") required () action { (i, c) => + val Parser: OptionParser[CliConfig] = new scopt.OptionParser[CliConfig]("figure-extractor-visualize") { + this.head("figure-extractor-visualize") + + this.arg[String]("").required().action({ (i, c) => c.copy(input = Some(new File(i))) - } validate { x => + }).validate({ x => val f = new File(x) if (!f.exists || f.isDirectory || !x.endsWith(".pdf")) { failure(s"File $x is not a PDF file") } else { success } - } text "input PDF file" - opt[Unit]('s', "show-steps") action { (_, c) => + }).text("input PDF file") + + this.opt[Unit]('s', "show-steps").action({ (_, c) => c.copy(showAllSteps = true) - } text "Show all intermediate steps" - 
opt[Unit]('g', "show-graphic-clustering") action { (_, c) => + }).text("Show all intermediate steps") + + this.opt[Unit]('g', "show-graphic-clustering").action({ (_, c) => c.copy(showGraphicsClustering = true) - } text "Show graphical elements found and how they were clustered" - opt[Unit]('x', "show-cleaned-figure-regions") action { (_, c) => + }).text("Show graphical elements found and how they were clustered") + + this.opt[Unit]('x', "show-cleaned-figure-regions").action({ (_, c) => c.copy(showCleanedFigureRegions = true) - } text "Shows figure regions after being post-processed using the" + - " rasterized PDF at the given DPI" - opt[Unit]('e', "show-extractions") action { (_, c) => + }).text("Shows figure regions after being post-processed using the" + + " rasterized PDF at the given DPI") + + this.opt[Unit]('e', "show-extractions").action({ (_, c) => c.copy(showExtractions = true) - } text "Show the bounding boxes of the text and graphics that were extracted" - opt[Unit]('r', "show-regions") action { (_, c) => + }).text("Show the bounding boxes of the text and graphics that were extracted") + + this.opt[Unit]('r', "show-regions").action({ (_, c) => c.copy(showRegions = true) - } text "Show the different regions the PDF was broken into" - opt[Unit]('c', "show-captions") action { (_, c) => + }).text("Show the different regions the PDF was broken into") + + this.opt[Unit]('c', "show-captions").action({ (_, c) => c.copy(showCaptions = true) - } text "Show the location of the captions" - opt[Unit]('t', "show-sections") action { (_, c) => + }).text("Show the location of the captions") + + this.opt[Unit]('t', "show-sections").action({ (_, c) => c.copy(showSections = true) - } text "Show the location of sections and paragraphs" - opt[Int]('d', "display-dpi") action { (dpi, c) => + }).text("Show the location of sections and paragraphs") + + this.opt[Int]('d', "display-dpi").action({ (dpi, c) => c.copy(displayDpi = dpi) - } validate { dpi => + }).validate({ dpi => if 
(dpi > 0) success else failure("DPI must > 0") - } text "DPI to display figures at (default 55)" - opt[Seq[Int]]('p', "pages") action { (pages, c) => + }).text("DPI to display figures at (default 55)") + + this.opt[Seq[Int]]('p', "pages").action({ (pages, c) => c.copy(pages = Some(pages)) - } text "Pages to extract from (defaults to all), 1 is the first page" validate { x => + }).text("Pages to extract from (defaults to all), 1 is the first page").validate({ x => if (x.exists(_ <= 0)) failure("A page was <= 0") else success - } + }) } def run(config: CliConfig): Unit = { @@ -83,7 +91,7 @@ object FigureExtractorVisualizationCli extends Logging { config.showSections, config.showCleanedFigureRegions ) - val doc = PDDocument.load(inputFile) + val doc = Loader.loadPDF(inputFile) logger.info(s"Loading ${inputFile.getName}") logger.info(s"Extracting figures from ${inputFile.getName}") @@ -113,10 +121,10 @@ object FigureExtractorVisualizationCli extends Logging { FigureExtractor().getFigures(doc, pages, Some(vLogger)) } } - logger.info(s"Displaying figures") + logger.info("Displaying figures") vLogger.displayVisualLog(doc, config.displayDpi) doc.close() - logger.info(s"Finished") + logger.info("Finished") } def main(args: Array[String]): Unit = { diff --git a/src/main/scala/org/allenai/pdffigures2/FigureRenderer.scala b/src/main/scala/org/allenai/pdffigures2/FigureRenderer.scala index b5b178f..2935ac0 100644 --- a/src/main/scala/org/allenai/pdffigures2/FigureRenderer.scala +++ b/src/main/scala/org/allenai/pdffigures2/FigureRenderer.scala @@ -6,12 +6,13 @@ import spray.json._ import java.awt.image.BufferedImage import java.io._ import javax.imageio.ImageIO +import scala.annotation.unused /** Methods rendering figures as images and saving those images to disk */ object FigureRenderer { - val CairoFormat = Set("ps", "eps", "pdf", "svg") - val AllowedFormats = CairoFormat ++ ImageIO.getWriterFormatNames + val CairoFormat: Set[String] = Set("ps", "eps", "pdf", "svg") + val 
AllowedFormats: Set[String] = CairoFormat ++ ImageIO.getWriterFormatNames /** Maximum pixels to expand rasterized figure when cleaning */ private val MaxExpand = 20 @@ -99,7 +100,7 @@ object FigureRenderer { page.figures.map(_.captionBoundary)).map(_.scale(scale)) var figureRegions = page.figures.map(_.regionBoundary).map(_.scale(scale)) val renderer = new InterruptiblePDFRenderer(doc) - val pageImg = renderer.renderImageWithDPI(page.pageNumber, dpi) + val pageImg = renderer.renderImageWithDPI(page.pageNumber, dpi.toFloat) val rasterized = page.figures.zipWithIndex.map { case (fig, figureNumber) => val otherFigureRegions = @@ -140,7 +141,7 @@ object FigureRenderer { def saveRasterizedFigures( figuresAndFilenames: Seq[(String, RasterizedFigure)], format: String, - dpi: Int + @unused dpi: Int ): Seq[SavedFigure] = { require(ImageIO.getWriterFormatNames.contains(format), s"Can't save to format $format") figuresAndFilenames.map { @@ -152,37 +153,60 @@ object FigureRenderer { /** Save figures to disk in a vector graphic format by shelling out to pdftocairo */ def saveFiguresAsImagesCairo( - doc: PDDocument, - figuresAndFilenames: Seq[(String, Figure)], - format: String, - dpi: Int - ): Iterable[SavedFigure] = { + doc: PDDocument, + figuresAndFilenames: Seq[(String, Figure)], + format: String, + dpi: Int + ): Iterable[SavedFigure] = { require(CairoFormat.contains(format), s"Cairo can't render to format $format") val groupedByPage = figuresAndFilenames.groupBy(_._2.page) - groupedByPage.flatMap { - case (pageNum, pageFigures) => - val pageDoc = new PDDocument() // Save some IO by just sending cairo the relevant page - pageDoc.addPage(doc.getPage(pageNum)) - val savedFigures = pageFigures.map { - case (filename, fig) => - if (Thread.interrupted()) throw new InterruptedException() - val box = fig.regionBoundary - val x = Math.round(box.x1) - PadUnexpandedImage - val y = Math.round(box.y1) - PadUnexpandedImage - val w = Math.round(box.width) + PadUnexpandedImage * 2 - val h
= Math.round(box.height) + PadUnexpandedImage * 2 - val cmdStr = s"pdftocairo -$format -r $dpi " + - s"-x $x -y $y -H $h -W $w -paperw $w -paperh $h - $filename" - val cmd = Runtime.getRuntime.exec(cmdStr) - val outStream = cmd.getOutputStream - pageDoc.save(outStream) // Stream the doc to cairo - if (cmd.waitFor() != 0) { - throw new IOException("Error using cairo to save a figure") - } - SavedFigure(fig, filename, dpi) + groupedByPage.flatMap { case (pageNum, pageFigures) => + val pageDoc = new PDDocument() // Save some IO by just sending cairo the relevant page + try { + pageDoc.addPage(doc.getPage(pageNum)) + pageFigures.map { case (filename, fig) => + if (Thread.interrupted()) throw new InterruptedException() + val box = fig.regionBoundary + val x = Math.round(box.x1) - PadUnexpandedImage + val y = Math.round(box.y1) - PadUnexpandedImage + val w = Math.round(box.width) + PadUnexpandedImage * 2 + val h = Math.round(box.height) + PadUnexpandedImage * 2 + + // Pass an explicit argument vector so `filename` is never split on whitespace + val processBuilder = new ProcessBuilder( + "pdftocairo", + s"-$format", + "-r", dpi.toString, + "-x", x.toString, + "-y", y.toString, + "-H", h.toString, + "-W", w.toString, + "-paperw", w.toString, + "-paperh", h.toString, + "-", // Read from stdin + filename + ) + val process = processBuilder.start() + try { + // Stream the page to cairo; stdin is closed even if saving fails so the child sees EOF + val outStream = process.getOutputStream + try pageDoc.save(outStream) + finally outStream.close() + + // Drain stderr before waiting so a full pipe cannot stall pdftocairo + val errorOutput = scala.io.Source.fromInputStream(process.getErrorStream).mkString + if (process.waitFor() != 0) { + throw new IOException(s"Error using cairo to save figure: $errorOutput") + } + } finally { + process.destroy() // No-op once pdftocairo has exited; terminates it after a failure or interrupt + } + + SavedFigure(fig, filename, dpi) } - pageDoc.close() - savedFigures
+ } finally { + pageDoc.close() // Release the single-page copy even when a figure fails + } } } diff --git a/src/main/scala/org/allenai/pdffigures2/FindGraphicsRaster.scala b/src/main/scala/org/allenai/pdffigures2/FindGraphicsRaster.scala index 3b96672..7b1b631 100644 --- a/src/main/scala/org/allenai/pdffigures2/FindGraphicsRaster.scala +++ b/src/main/scala/org/allenai/pdffigures2/FindGraphicsRaster.scala @@ -1,10 +1,10 @@ package org.allenai.pdffigures2 import org.apache.pdfbox.pdmodel.PDDocument -import org.apache.pdfbox.rendering.{ ImageType, PDFRenderer } +import org.apache.pdfbox.rendering.ImageType +import org.apache.pdfbox.rendering.PDFRenderer import java.awt.image.BufferedImage - import scala.collection.mutable /** Finds the bounding boxes of graphical elements in a PDF by rasterizing the PDF and @@ -22,7 +22,7 @@ object FindGraphicsRaster { def findCCBoundingBoxes(doc: PDDocument, page: Int, remove: Iterable[Box]): List[Box] = { val renderer = new PDFRenderer(doc) - val img = renderer.renderImageWithDPI(page, DPI, ImageType.GRAY) + val img = renderer.renderImageWithDPI(page, DPI.toFloat, ImageType.GRAY) findCCBoundingBoxes(img, remove, Threshold, DPI / 72) } diff --git a/src/main/scala/org/allenai/pdffigures2/FormattingTextExtractor.scala b/src/main/scala/org/allenai/pdffigures2/FormattingTextExtractor.scala index 996fe17..3390268 100644 --- a/src/main/scala/org/allenai/pdffigures2/FormattingTextExtractor.scala +++ b/src/main/scala/org/allenai/pdffigures2/FormattingTextExtractor.scala @@ -51,7 +51,7 @@ object FormattingTextExtractor extends Logging { if (nonEmptyCandidates.size >= minConsistentHeaders) { // Check for identical text val groupedByText = nonEmptyCandidates.map(x => x.text).groupBy(x => x) - val (mostCommonText, count) = groupedByText.mapValues(_.size).maxBy(_._2) + val (mostCommonText, count) = groupedByText.view.mapValues(_.size).maxBy(_._2) if (count >= minConsistentHeaders) { candidates.map { case np: Some[Paragraph] if np.get.text == mostCommonText => np @@ -221,7 +221,7 @@ object FormattingTextExtractor extends
Logging { .find(_._2.nonEmpty) val abstractPageNum = if (documentAbstract.isDefined) Some(documentAbstract.get._1) else None - val textWithoutHeaders = (textPages, headers, pageNumbers).zipped.map { + val textWithoutHeaders = textPages.lazyZip(headers).lazyZip(pageNumbers).map { case (textPage, header, pageNumber) => if (abstractPageNum.isDefined && abstractPageNum.get > textPage.pageNumber) { logger.debug(s"Marking page ${textPage.pageNumber} as a cover page") diff --git a/src/main/scala/org/allenai/pdffigures2/GraphicBBDetector.scala b/src/main/scala/org/allenai/pdffigures2/GraphicBBDetector.scala index a285d27..51b3073 100644 --- a/src/main/scala/org/allenai/pdffigures2/GraphicBBDetector.scala +++ b/src/main/scala/org/allenai/pdffigures2/GraphicBBDetector.scala @@ -3,12 +3,13 @@ package org.allenai.pdffigures2 import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine import org.apache.pdfbox.contentstream.operator.Operator import org.apache.pdfbox.contentstream.operator.OperatorProcessor -import org.apache.pdfbox.cos.{ COSBase, COSName } +import org.apache.pdfbox.cos.COSBase +import org.apache.pdfbox.cos.COSName import org.apache.pdfbox.pdmodel.PDPage import org.apache.pdfbox.pdmodel.graphics.color.PDColor -import org.apache.pdfbox.util.Matrix import org.apache.pdfbox.pdmodel.graphics.image.PDImage import org.apache.pdfbox.pdmodel.graphics.state.PDSoftMask +import org.apache.pdfbox.util.Matrix import java.awt.Rectangle import java.awt.geom._ @@ -51,9 +52,9 @@ object GraphicBBDetector { class GraphicBBDetector(page: PDPage, ignoreWhite: Boolean) extends PDFGraphicsStreamEngine(page) { var clipWindingRule: Int = -1 var linePath: GeneralPath = new GeneralPath - var bounds = List[Rectangle]() + var bounds: List[Rectangle] = List[Rectangle]() - class NullOp(val name: String) extends OperatorProcessor { + class NullOp(val name: String) extends OperatorProcessor(this) { override def process(operator: Operator, operands: java.util.List[COSBase]): Unit = {} def 
getName: String = name } @@ -90,7 +91,7 @@ class GraphicBBDetector(page: PDPage, ignoreWhite: Boolean) extends PDFGraphicsS super.processOperator(operator, operands) } - override def appendRectangle(p0: Point2D, p1: Point2D, p2: Point2D, p3: Point2D) { + override def appendRectangle(p0: Point2D, p1: Point2D, p2: Point2D, p3: Point2D): Unit = { linePath.moveTo(p0.getX.toFloat, p0.getY.toFloat) linePath.lineTo(p1.getX.toFloat, p1.getY.toFloat) linePath.lineTo(p2.getX.toFloat, p2.getY.toFloat) @@ -111,37 +112,37 @@ class GraphicBBDetector(page: PDPage, ignoreWhite: Boolean) extends PDFGraphicsS } } - override def strokePath() { + override def strokePath(): Unit = { addLinePath(true, false) linePath.reset() } - override def fillPath(windingRule: Int) { + override def fillPath(windingRule: Int): Unit = { linePath.setWindingRule(windingRule) addLinePath(false, true) linePath.reset() } - override def fillAndStrokePath(windingRule: Int) { + override def fillAndStrokePath(windingRule: Int): Unit = { linePath.setWindingRule(windingRule) addLinePath(true, true) linePath.reset() } - override def clip(windingRule: Int) = clipWindingRule = windingRule + override def clip(windingRule: Int): Unit = clipWindingRule = windingRule - override def moveTo(x: Float, y: Float) = linePath.moveTo(x, y) + override def moveTo(x: Float, y: Float): Unit = linePath.moveTo(x, y) - override def lineTo(x: Float, y: Float) = linePath.lineTo(x, y) + override def lineTo(x: Float, y: Float): Unit = linePath.lineTo(x, y) - override def curveTo(x1: Float, y1: Float, x2: Float, y2: Float, x3: Float, y3: Float) = + override def curveTo(x1: Float, y1: Float, x2: Float, y2: Float, x3: Float, y3: Float): Unit = linePath.curveTo(x1, y1, x2, y2, x3, y3) override def getCurrentPoint: Point2D = linePath.getCurrentPoint - override def closePath() = linePath.closePath() + override def closePath(): Unit = linePath.closePath() - override def endPath() { + override def endPath(): Unit = { if (clipWindingRule != -1) { 
linePath.setWindingRule(clipWindingRule) getGraphicsState.intersectClippingPath(linePath) @@ -150,7 +151,7 @@ class GraphicBBDetector(page: PDPage, ignoreWhite: Boolean) extends PDFGraphicsS linePath.reset() } - override def drawImage(pdImage: PDImage) { + override def drawImage(pdImage: PDImage): Unit = { val clipBounds = getGraphicsState.getCurrentClippingPath.getBounds if (clipBounds.getHeight * clipBounds.getWidth > 0) { val ctm: Matrix = getGraphicsState.getCurrentTransformationMatrix @@ -177,7 +178,7 @@ class GraphicBBDetector(page: PDPage, ignoreWhite: Boolean) extends PDFGraphicsS } } - override def shadingFill(shadingName: COSName) { + override def shadingFill(shadingName: COSName): Unit = { val newBound = getGraphicsState.getCurrentClippingPath.getBounds if (newBound.getWidth > 0 && newBound.getHeight > 0) { bounds = newBound :: bounds diff --git a/src/main/scala/org/allenai/pdffigures2/GraphicsExtractor.scala b/src/main/scala/org/allenai/pdffigures2/GraphicsExtractor.scala index 4ace54a..a69c68b 100644 --- a/src/main/scala/org/allenai/pdffigures2/GraphicsExtractor.scala +++ b/src/main/scala/org/allenai/pdffigures2/GraphicsExtractor.scala @@ -1,7 +1,6 @@ package org.allenai.pdffigures2 import org.allenai.pdffigures2.FigureExtractor.OcredPdfException - import org.apache.pdfbox.pdmodel.PDDocument object GraphicsExtractor extends Logging { diff --git a/src/main/scala/org/allenai/pdffigures2/InterruptiblePDFRenderer.scala b/src/main/scala/org/allenai/pdffigures2/InterruptiblePDFRenderer.scala index 20cf1c1..a84860e 100644 --- a/src/main/scala/org/allenai/pdffigures2/InterruptiblePDFRenderer.scala +++ b/src/main/scala/org/allenai/pdffigures2/InterruptiblePDFRenderer.scala @@ -1,9 +1,11 @@ package org.allenai.pdffigures2 +import org.apache.pdfbox.contentstream.operator.Operator import org.apache.pdfbox.cos.COSBase import org.apache.pdfbox.pdmodel.PDDocument -import org.apache.pdfbox.rendering.{ PDFRenderer, PageDrawer, PageDrawerParameters } -import 
org.apache.pdfbox.contentstream.operator.Operator +import org.apache.pdfbox.rendering.PDFRenderer +import org.apache.pdfbox.rendering.PageDrawer +import org.apache.pdfbox.rendering.PageDrawerParameters class InterruptiblePDFRenderer(doc: PDDocument) extends PDFRenderer(doc) { diff --git a/src/main/scala/org/allenai/pdffigures2/JsonProtocol.scala b/src/main/scala/org/allenai/pdffigures2/JsonProtocol.scala index 2f2be94..0762a62 100644 --- a/src/main/scala/org/allenai/pdffigures2/JsonProtocol.scala +++ b/src/main/scala/org/allenai/pdffigures2/JsonProtocol.scala @@ -1,34 +1,34 @@ package org.allenai.pdffigures2 -import org.allenai.pdffigures2.FigureExtractor.{ Document, DocumentWithSavedFigures } -import org.allenai.pdffigures2.SectionedTextBuilder.{ DocumentSection, PdfText } - +import org.allenai.pdffigures2.FigureExtractor.Document +import org.allenai.pdffigures2.FigureExtractor.DocumentWithSavedFigures +import org.allenai.pdffigures2.SectionedTextBuilder.DocumentSection +import org.allenai.pdffigures2.SectionedTextBuilder.PdfText import spray.json._ -// From https://github.com/spray/spray-json/issues/200 -// to support enum -> json conversion -class EnumJsonConverter[T <: scala.Enumeration](enu: T) extends RootJsonFormat[T#Value] { - override def write(obj: T#Value): JsValue = JsString(obj.toString) +class FigureTypeJsonConverter extends RootJsonFormat[FigureType.Value] { + override def write(obj: FigureType.Value): JsValue = JsString(obj.toString) - override def read(json: JsValue): T#Value = { + override def read(json: JsValue): FigureType.Value = { json match { - case JsString(txt) => enu.withName(txt) - case somethingElse => throw DeserializationException(s"Expected a value from enum $enu instead of $somethingElse") + case JsString(txt) => FigureType.withName(txt) + case somethingElse => throw DeserializationException(s"Expected a value from enum FigureType instead of $somethingElse") } } } + trait JsonProtocol extends DefaultJsonProtocol { // JSON formats so 
we can write Figures/Captions/Documents to disk - implicit val enumConverter = new EnumJsonConverter(FigureType) - implicit val boxFormat = jsonFormat4(Box.apply) - implicit val captionFormat = jsonFormat5(Caption.apply) - implicit val figureFormat = jsonFormat7(Figure.apply) - implicit val savedFigureFormat = jsonFormat9(SavedFigure.apply) - implicit val documentTextFormat = jsonFormat3(PdfText.apply) - implicit val documentSectionFormat = jsonFormat2(DocumentSection.apply) - implicit val documentFormat = jsonFormat3(Document.apply) - implicit val documentWithFiguresFormat = jsonFormat3(DocumentWithSavedFigures.apply) + implicit val enumConverter: RootJsonFormat[FigureType.Value] = new FigureTypeJsonConverter() + implicit val boxFormat: RootJsonFormat[Box] = jsonFormat4(Box.apply) + implicit val captionFormat: RootJsonFormat[Caption] = jsonFormat5(Caption.apply) + implicit val figureFormat: RootJsonFormat[Figure] = jsonFormat7(Figure.apply) + implicit val savedFigureFormat: RootJsonFormat[SavedFigure] = jsonFormat9(SavedFigure.apply) + implicit val documentTextFormat: RootJsonFormat[PdfText] = jsonFormat3(PdfText.apply) + implicit val documentSectionFormat: RootJsonFormat[DocumentSection] = jsonFormat2(DocumentSection.apply) + implicit val documentFormat: RootJsonFormat[Document] = jsonFormat3(Document.apply) + implicit val documentWithFiguresFormat: RootJsonFormat[DocumentWithSavedFigures] = jsonFormat3(DocumentWithSavedFigures.apply) } object JsonProtocol extends JsonProtocol diff --git a/src/main/scala/org/allenai/pdffigures2/Logging.scala b/src/main/scala/org/allenai/pdffigures2/Logging.scala index 2fe008d..89dc046 100644 --- a/src/main/scala/org/allenai/pdffigures2/Logging.scala +++ b/src/main/scala/org/allenai/pdffigures2/Logging.scala @@ -7,7 +7,9 @@ import ch.qos.logback.classic.encoder.PatternLayoutEncoder import ch.qos.logback.classic.html.HTMLLayout import ch.qos.logback.classic.spi.ILoggingEvent import ch.qos.logback.core._ -import 
ch.qos.logback.core.encoder.{ Encoder, LayoutWrappingEncoder } +import ch.qos.logback.core.encoder.Encoder +import ch.qos.logback.core.encoder.LayoutWrappingEncoder +import org.slf4j.Logger import org.slf4j.LoggerFactory /** This trait is meant to be mixed into a class to provide logging and logging configuration. @@ -17,7 +19,7 @@ import org.slf4j.LoggerFactory * not constructed unless the message will be logged. */ trait Logging { - val internalLogger = LoggerFactory.getLogger(this.getClass) + val internalLogger: Logger = LoggerFactory.getLogger(this.getClass) object logger { // scalastyle:ignore diff --git a/src/main/scala/org/allenai/pdffigures2/PageStructure.scala b/src/main/scala/org/allenai/pdffigures2/PageStructure.scala index 8c4ae2a..cbb129a 100644 --- a/src/main/scala/org/allenai/pdffigures2/PageStructure.scala +++ b/src/main/scala/org/allenai/pdffigures2/PageStructure.scala @@ -70,8 +70,8 @@ case class PageWithBodyText( override def paragraphs: Seq[Paragraph] = (bodyText ++ otherText).sorted def nonFigureText: Seq[Paragraph] = bodyText ++ captions.map(_.paragraph) def nonFigureContent: Seq[Box] = nonFigureText.map(_.boundary) ++ nonFigureGraphics - def possibleFigureContent = graphics ++ otherText.map(_.boundary) - def allContent = possibleFigureContent ++ nonFigureContent + def possibleFigureContent: Seq[Box] = graphics ++ otherText.map(_.boundary) + def allContent: Seq[Box] = possibleFigureContent ++ nonFigureContent } case class PageWithFigures( diff --git a/src/main/scala/org/allenai/pdffigures2/Paragraph.scala b/src/main/scala/org/allenai/pdffigures2/Paragraph.scala index 7010f13..7cc55e0 100644 --- a/src/main/scala/org/allenai/pdffigures2/Paragraph.scala +++ b/src/main/scala/org/allenai/pdffigures2/Paragraph.scala @@ -3,6 +3,7 @@ package org.allenai.pdffigures2 import org.apache.pdfbox.text.TextPosition import java.text.Normalizer +import scala.util.matching.Regex /** Span of text denoted by the starting and ending line number, inclusive */ case 
class TextSpan(start: Int, end: Int) extends Ordered[TextSpan] { @@ -12,7 +13,7 @@ case class TextSpan(start: Int, end: Int) extends Ordered[TextSpan] { } object Paragraph { - val unprintableRegex = """[\p{Cc}\p{Cf}\p{Co}\p{Cn}]""".r + val unprintableRegex: Regex = """[\p{Cc}\p{Cf}\p{Co}\p{Cn}]""".r def apply(lines: List[Line]): Paragraph = Paragraph(lines, Box.container(lines.map(_.boundary))) diff --git a/src/main/scala/org/allenai/pdffigures2/SectionTitleExtractor.scala b/src/main/scala/org/allenai/pdffigures2/SectionTitleExtractor.scala index a253c87..4aba88f 100644 --- a/src/main/scala/org/allenai/pdffigures2/SectionTitleExtractor.scala +++ b/src/main/scala/org/allenai/pdffigures2/SectionTitleExtractor.scala @@ -181,7 +181,7 @@ object SectionTitleExtractor extends Logging { def build(line: Line): SectionTitle = { val fountCounts = line.words.flatMap(w => w.positions.map(_.getFont)).groupBy(identity) - val mostCommonFont = fountCounts.mapValues(_.size).maxBy(_._2)._1 + val mostCommonFont = fountCounts.view.mapValues(_.size).maxBy(_._2)._1 val fontSizes = line.words.flatMap(w => w.positions.map(_.getFontSizeInPt)) val medianFontSize = fontSizes.sorted.drop(fontSizes.size / 2).head SectionTitle(List(line), line.boundary, isPrefixed(line), mostCommonFont, medianFontSize) @@ -235,7 +235,7 @@ object SectionTitleExtractor extends Logging { ): Seq[PageWithClassifiedText] = { val (strippedTextPages, sectionHeaders) = stripSectionTitlesFromSortedParagraphs(pages.map(_.paragraphs), layout).unzip - (pages, strippedTextPages, sectionHeaders).zipped.map { + pages.lazyZip(strippedTextPages).lazyZip(sectionHeaders).map { case (page, strippedText, pageSectionHeaders) => PageWithClassifiedText( page.pageNumber, diff --git a/src/main/scala/org/allenai/pdffigures2/SectionedTextBuilder.scala b/src/main/scala/org/allenai/pdffigures2/SectionedTextBuilder.scala index 67c579f..8eff012 100644 --- a/src/main/scala/org/allenai/pdffigures2/SectionedTextBuilder.scala +++ 
b/src/main/scala/org/allenai/pdffigures2/SectionedTextBuilder.scala @@ -40,9 +40,9 @@ object SectionedTextBuilder { * @param paragraphs section text broken up into paragraphs */ case class DocumentSection(title: Option[PdfText], paragraphs: Seq[PdfText]) { - def titleText = title.map(_.text) - def paragraphsText = paragraphs.map(_.text) - def bodyText = paragraphsText.mkString("\n") + def titleText: Option[String] = title.map(_.text) + def paragraphsText: Seq[String] = paragraphs.map(_.text) + def bodyText: String = paragraphsText.mkString("\n") } @tailrec diff --git a/src/main/scala/org/allenai/pdffigures2/TextExtractor.scala b/src/main/scala/org/allenai/pdffigures2/TextExtractor.scala index 75fd76d..3872b77 100644 --- a/src/main/scala/org/allenai/pdffigures2/TextExtractor.scala +++ b/src/main/scala/org/allenai/pdffigures2/TextExtractor.scala @@ -1,12 +1,14 @@ package org.allenai.pdffigures2 import org.apache.pdfbox.cos.COSBase -import org.apache.pdfbox.pdmodel.common.PDRectangle -import org.apache.pdfbox.pdmodel.{ PDDocument, PDPage } -import org.apache.pdfbox.text.{ PDFTextStripper, TextPosition } +import org.apache.pdfbox.pdmodel.PDDocument +import org.apache.pdfbox.pdmodel.PDPage +import org.apache.pdfbox.text.PDFTextStripper +import org.apache.pdfbox.text.TextPosition import java.io.Writer -import scala.collection.{ immutable, mutable } +import scala.collection.immutable +import scala.collection.mutable object TextExtractor { @@ -140,7 +142,7 @@ private class TextExtractor extends PDFTextStripper with Logging { // PDFBox can occasionally wildly overestimate the height of text, so if things look really // wrong we clip the text to a sensible amount val height = if (pos.getHeight > TextExtractor.MinHeightToClipText) { - TextExtractor.HeightToCLipTextTo + TextExtractor.HeightToCLipTextTo.toFloat } else { pos.getHeight } diff --git a/src/main/scala/org/allenai/pdffigures2/VisualLogger.scala b/src/main/scala/org/allenai/pdffigures2/VisualLogger.scala index 
d742cb6..730ea19 100644 --- a/src/main/scala/org/allenai/pdffigures2/VisualLogger.scala +++ b/src/main/scala/org/allenai/pdffigures2/VisualLogger.scala @@ -3,12 +3,12 @@ package org.allenai.pdffigures2 import org.allenai.pdffigures2.SectionedTextBuilder.DocumentSection import org.apache.pdfbox.pdmodel.PDDocument import org.apache.pdfbox.rendering.PDFRenderer -import java.awt.event.{ ActionEvent, KeyEvent } + import java.awt._ +import java.awt.event.ActionEvent +import java.awt.event.KeyEvent import java.awt.image.BufferedImage -import javax.imageio.ImageIO -import java.io.File -import javax.swing._ +import javax.swing.{Box => _, _} case class Annotations( boxes: Seq[Box], @@ -56,7 +56,7 @@ class VisualLogger( val SectionsKey = "Sections" // Ordered in how they will be shown to the user - val ReservedKeys = Seq( + val ReservedKeys: Seq[String] = Seq( GraphicsClusterKey, TextAndGraphicsExtractionKey, CaptionLocationKey, @@ -89,7 +89,7 @@ class VisualLogger( if (pagesToShow.nonEmpty) { val renderer = new PDFRenderer(doc) val visualizationPerPage = pagesToShow.map { pageNum => - val pageImg = renderer.renderImageWithDPI(pageNum, dpi) + val pageImg = renderer.renderImageWithDPI(pageNum, dpi.toFloat) val imagesToShow = keysToShow.map { key => val annotations = logs(key).getOrElse(pageNum, Seq()) val img = cloneImage(pageImg) @@ -99,7 +99,7 @@ class VisualLogger( val dash = if (annotation.dashed) Array[Float](2) else null g.setStroke( new BasicStroke( - annotation.thickness, + annotation.thickness.toFloat, BasicStroke.CAP_BUTT, BasicStroke.JOIN_BEVEL, 0.0f, @@ -177,13 +177,13 @@ class VisualLogger( // So our frame can be closed by hot key, on OSX this means at least Cmd-W works val closeKey = KeyStroke.getKeyStroke( KeyEvent.VK_W, - Toolkit.getDefaultToolkit.getMenuShortcutKeyMask + Toolkit.getDefaultToolkit.getMenuShortcutKeyMaskEx ) panel.getInputMap.put(closeKey, "closeWindow") panel.getActionMap.put( "closeWindow", new AbstractAction("Close Window") { - override def 
actionPerformed(e: ActionEvent) { + override def actionPerformed(e: ActionEvent): Unit = { frame.setVisible(false) frame.dispose() } diff --git a/src/test/scala/org/allenai/pdffigures2/TestExtractionFilters.scala b/src/test/scala/org/allenai/pdffigures2/TestExtractionFilters.scala index d79da7d..fce800d 100644 --- a/src/test/scala/org/allenai/pdffigures2/TestExtractionFilters.scala +++ b/src/test/scala/org/allenai/pdffigures2/TestExtractionFilters.scala @@ -1,6 +1,7 @@ package org.allenai.pdffigures2 -import org.apache.pdfbox.pdmodel.PDDocument +import org.apache.pdfbox.Loader +import org.apache.pdfbox.io.RandomAccessReadBuffer import org.scalatest.funsuite.AnyFunSuite /** These tests verify that figure extraction filters are successfully catching and removing bad @@ -26,9 +27,11 @@ class TestExtractionFilters extends AnyFunSuite { * These extractions should be filtered out for being too close to the page boundary. */ test("Page boundary filter should filter out bad extractions") { - val pdf = PDDocument.load( - getClass.getClassLoader.getResourceAsStream( - "test-pdfs/f63cb20759fab2514802c3ef2a743c76bf9dc9f1.pdf" + val pdf = Loader.loadPDF( + new RandomAccessReadBuffer( + getClass.getClassLoader.getResourceAsStream( + "test-pdfs/f63cb20759fab2514802c3ef2a743c76bf9dc9f1.pdf" + ) ) ) val figures = extractor.getFigures(pdf) @@ -41,9 +44,11 @@ class TestExtractionFilters extends AnyFunSuite { * This extraction should be filtered out for splitting a figure. 
*/ test("Graphics split filter should filter out bad extractions") { - val pdf = PDDocument.load( - getClass.getClassLoader.getResourceAsStream( - "test-pdfs/3a9202f9f176d3377516e3da0866cc19148c033b.pdf" + val pdf = Loader.loadPDF( + new RandomAccessReadBuffer( + getClass.getClassLoader.getResourceAsStream( + "test-pdfs/3a9202f9f176d3377516e3da0866cc19148c033b.pdf" + ) ) ) val figures = extractor.getFigures(pdf, pages = Some(Seq(6))) @@ -54,24 +59,26 @@ class TestExtractionFilters extends AnyFunSuite { * This ensures that when figures are empty, it's not because figure extraction is broken. */ test("Figures should all be extracted") { - val pdf = PDDocument.load( - getClass.getClassLoader.getResourceAsStream( - "test-pdfs/498bb0efad6ec15dd09d941fb309aa18d6df9f5f.pdf" + val pdf = Loader.loadPDF( + new RandomAccessReadBuffer( + getClass.getClassLoader.getResourceAsStream( + "test-pdfs/498bb0efad6ec15dd09d941fb309aa18d6df9f5f.pdf" + ) ) ) val figures = extractor.getFigures(pdf).toList assert(figures.length === 2) - assert(figures(1).figType === FigureType.Table) - assert(figures(1).name === "1") - assert(figures(1).page === 4) - assert( - figures(1).caption === "Table 1: Over a set of ten relations, TEXTRUNNER achieved a 33% lower error rate than KNOWITALL, while finding approximately as many correct extractions." - ) - assert(figures(0).figType === FigureType.Figure) + assert(figures(0).figType === FigureType.Table) assert(figures(0).name === "1") assert(figures(0).page === 4) assert( - figures(0).caption === "Figure 1: Overview of the tuples extracted from 9 million Web page corpus. 7.8 million well-formed tuples are found having probability ≥ 0.8. Of those, TEXTRUNNER finds 1 million concrete tuples with arguments grounded in particular real-world entities, 88.1% of which are correct, and 6.8 million tuples reflecting abstract assertions, 79.2% of which are correct." 
+ figures(0).caption === "Table 1: Over a set of ten relations, TEXTRUNNER achieved a 33% lower error rate than KNOWITALL, while finding approximately as many correct extractions." + ) + assert(figures(1).figType === FigureType.Figure) + assert(figures(1).name === "1") + assert(figures(1).page === 4) + assert( + figures(1).caption === "Figure 1: Overview of the tuples extracted from 9 million Web page corpus. 7.8 million well-formed tuples are found having probability ≥ 0.8. Of those, TEXTRUNNER finds 1 million concrete tuples with arguments grounded in particular real-world entities, 88.1% of which are correct, and 6.8 million tuples reflecting abstract assertions, 79.2% of which are correct." ) } }