allenai · jennifgcrl · Nov 13, 2024 · Nov 13, 2024 · Nov 13, 2024 · Nov 13, 2024
diff --git a/.gitignore b/.gitignore
@@ -9,4 +9,8 @@ evaluation/datasets/**/*.jpeg
 evaluation/datasets/**/*.jpg
 evaluation/datasets/**/*.pgm
 
-*.pkl
+*.pkl
+
+/.bsp
+
+*.jar
diff --git a/.scalafix.conf b/.scalafix.conf
@@ -0,0 +1,12 @@
+rules = [
+  ExplicitResultTypes,       # Inserts type annotations for inferred public members.
+  NoAutoTupling,             # Inserts explicit tuples for adapted argument lists for compatibility with -Yno-adapted-args.
+  OrganizeImports,           # Organizes import statements.
+  RemoveUnused,              # Removes unused imports and terms reported by the compiler under -Wunused.
+  DisableSyntax,             # Reports an error for disabled features such as var or XML literals.
+  LeakingImplicitClassVal,   # Adds 'private' to val parameters of implicit value classes.
+  NoValInForComprehension,   # Removes deprecated val inside for-comprehension binders.
+  ProcedureSyntax,           # Replaces deprecated Scala 2.x procedure syntax with explicit ': Unit ='.
+  RedundantSyntax            # Removes redundant syntax such as `final` modifiers on an object.
+]
+
diff --git a/build.sbt b/build.sbt
@@ -1,14 +1,19 @@
-lazy val scala211 = "2.11.12"
-lazy val scala212 = "2.12.16"
-lazy val scala213 = "2.13.8" // Not supported yet (collections changes required in common)
-lazy val supportedScalaVersions = List(scala212, scala211)
+lazy val scala212 = "2.12.20"
+lazy val scala213 = "2.13.15"
+lazy val scala35 = "3.5.2"
+lazy val supportedScalaVersions = List(scala212, scala213, scala35)
 
 Global / onChangedBuildSource := ReloadOnSourceChanges
 
 ThisBuild / organization := "org.allenai"
 ThisBuild / description  := "Scala library to extract figures, tables, and captions from scholarly documents"
-ThisBuild / scalaVersion := scala212
+ThisBuild / scalaVersion := scala35
 ThisBuild / version      := "0.1.0"
+ThisBuild / semanticdbEnabled := true
+
+scalacOptions ++= Seq(
+  "-Wunused:all",
+)
 
 lazy val projectSettings = Seq(
   name := "pdffigures2",
@@ -25,13 +30,14 @@ lazy val projectSettings = Seq(
   bintrayOrganization := Some("allenai"),
   bintrayRepository := "maven",
   libraryDependencies ++= Seq(
+    "org.scala-lang.modules" %% "scala-parallel-collections" % "1.0.4",
     "io.spray" %% "spray-json" % "1.3.6",
     "com.github.scopt" %% "scopt" % "4.1.0",
-    "ch.qos.logback" % "logback-classic" % "1.2.11",
-    "org.slf4j" % "jcl-over-slf4j" % "1.7.36",
-    "org.apache.pdfbox" % "pdfbox" % "2.0.26",
-    "org.apache.pdfbox" % "fontbox" % "2.0.26",
-    "com.typesafe" % "config" % "1.4.2",
+    "ch.qos.logback" % "logback-classic" % "1.5.6",
+    "org.slf4j" % "jcl-over-slf4j" % "2.0.13",
+    "org.apache.pdfbox" % "pdfbox" % "3.0.2",
+    "org.apache.pdfbox" % "fontbox" % "3.0.2",
+    "com.typesafe" % "config" % "1.4.3",
 
     // So PDFBox can parse more image formats
     // These are disabled by default, because they are not licensed flexibly enough.
@@ -40,11 +46,11 @@ lazy val projectSettings = Seq(
 //    "com.levigo.jbig2" % "levigo-jbig2-imageio" % "2.0", // For handling jbig2 images
 
     // So PDFBox can parse security enabled but still readable PDFs
-    "org.bouncycastle" % "bcprov-jdk18on" % "1.71",
-    "org.bouncycastle" % "bcmail-jdk18on" % "1.71",
-    "org.bouncycastle" % "bcpkix-jdk18on" % "1.71",
+    "org.bouncycastle" % "bcprov-jdk18on" % "1.78.1",
+    "org.bouncycastle" % "bcmail-jdk18on" % "1.78.1",
+    "org.bouncycastle" % "bcpkix-jdk18on" % "1.78.1",
 
-    "org.scalatest" %% "scalatest" % "3.2.13" % Test
+    "org.scalatest" %% "scalatest" % "3.2.19" % Test
   ),
 
   pomExtra :=
@@ -61,11 +67,13 @@ lazy val root = (project in file("."))
     .settings(projectSettings)
 
 Compile / run / mainClass := Some("org.allenai.pdffigures2.FigureExtractorBatchCli")
+Compile / mainClass := Some("org.allenai.pdffigures2.FigureExtractorBatchCli")
 assembly / mainClass := Some("org.allenai.pdffigures2.FigureExtractorBatchCli")
 assembly / assemblyOutputPath := file("pdffigures2.jar")
 
 assembly / assemblyMergeStrategy := {
   case x if x.endsWith("module-info.class") => MergeStrategy.discard
+  case PathList("META-INF", "versions", "9", "OSGI-INF", "MANIFEST.MF") => MergeStrategy.first
   case PathList("org", "apache", "commons", xs @ _*) => MergeStrategy.first
   case x =>
     val oldStrategy = (assembly / assemblyMergeStrategy).value

diff --git a/project/build.properties b/project/build.properties
@@ -1 +1 @@
-sbt.version=1.7.1
+sbt.version=1.10.5
diff --git a/project/plugins.sbt b/project/plugins.sbt
@@ -3,3 +3,6 @@ addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "1.2.0")
 addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.6")
 
 addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.6.1")
+
+addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.13.0")
+
diff --git a/src/main/scala/org/allenai/pdffigures2/CaptionDetector.scala b/src/main/scala/org/allenai/pdffigures2/CaptionDetector.scala
@@ -1,6 +1,7 @@
 package org.allenai.pdffigures2
 
 import org.apache.pdfbox.pdmodel.font.PDFont
+
 import FigureType._
 
 case class CaptionStart(
@@ -14,12 +15,12 @@ case class CaptionStart(
   paragraphStart: Boolean,
   lineEnd: Boolean
 ) {
-  val figId = (figType, name)
-  val colonMatch = numberSyntax == ":"
-  val periodMatch = numberSyntax == "."
-  val allCapsFig = header.startsWith("FIG")
-  val allCapsTable = header == "TABLE"
-  val figAbbreviated = header == "Fig."
+  val figId: (FigureType.FigureType, String) = (figType, name)
+  val colonMatch: Boolean = numberSyntax == ":"
+  val periodMatch: Boolean = numberSyntax == "."
+  val allCapsFig: Boolean = header.startsWith("FIG")
+  val allCapsTable: Boolean = header == "TABLE"
+  val figAbbreviated: Boolean = header == "Fig."
 }
 
 object CaptionDetector extends Logging {
@@ -54,7 +55,7 @@ object CaptionDetector extends Logging {
     standardFont: PDFont,
     types: Set[FigureType]
   ) extends CandidateFilter {
-    val name = s"Non Standard Font: ${types.toList}"
+    val name: String = s"Non Standard Font: ${types.toList}"
     def accept(cc: CaptionStart): Boolean =
       !types.contains(cc.figType) ||
         cc.line.words.head.positions.head.getFont != standardFont
@@ -91,7 +92,7 @@ object CaptionDetector extends Logging {
   }
 
   private case class LeftAlignedOnly(figureOnly: Boolean) extends CandidateFilter {
-    val name = "Left Aligned" + (if (figureOnly) " Figures" else "")
+    val name: String = "Left Aligned" + (if (figureOnly) " Figures" else "")
     def accept(cc: CaptionStart): Boolean = {
       figureOnly && cc.figType == FigureType.Table || (if (cc.nextLine.isDefined) {
                                                          Math.abs(
@@ -255,7 +256,7 @@ object CaptionDetector extends Logging {
         }
         if (!removedAny) {
           logger.debug(
-            s"Filtered for paragraph starts, " +
+            "Filtered for paragraph starts, " +
               s"${groupedById.values.map(_.size).sum} remaining"
           )
         }

diff --git a/src/main/scala/org/allenai/pdffigures2/DocumentLayout.scala b/src/main/scala/org/allenai/pdffigures2/DocumentLayout.scala
@@ -2,7 +2,8 @@ package org.allenai.pdffigures2
 
 import org.apache.pdfbox.pdmodel.font.PDFont
 
-import scala.collection.{ immutable, mutable }
+import scala.collection.immutable
+import scala.collection.mutable
 
 /** Store some statistics about the document as a whole */
 case class DocumentLayout(

diff --git a/src/main/scala/org/allenai/pdffigures2/FigureDetector.scala b/src/main/scala/org/allenai/pdffigures2/FigureDetector.scala
@@ -1,5 +1,7 @@
 package org.allenai.pdffigures2
 
+import scala.annotation.unused
+
 object FigureDetector {
 
   private val MinProposalHeight = 15
@@ -232,7 +234,7 @@ object FigureDetector {
     */
   // TODO it would be nice to be able to do this for downwards proposals as well
   private def clipUpwardRegion(
-    caption: Box,
+    @unused caption: Box,
     region: Box,
     graphics: Seq[Box],
     otherText: Seq[Paragraph]
@@ -277,7 +279,7 @@ object FigureDetector {
   private def scoreProposal(
     proposal: Proposal,
     graphics: Seq[Box],
-    otherText: Seq[Box],
+    @unused otherText: Seq[Box],
     otherProposals: Seq[Proposal],
     bounds: Box
   ): Option[Double] = {
@@ -536,7 +538,7 @@ object FigureDetector {
   def locatedFigures(
     page: PageWithBodyText,
     layout: DocumentLayout,
-    log: Option[VisualLogger]
+    @unused log: Option[VisualLogger]
   ): PageWithFigures = {
     val proposals = buildProposals(page, layout)
     val proposalsWithCaptions = page.captions.zip(proposals)
@@ -560,7 +562,7 @@ object FigureDetector {
     } else {
       val bestConfiguration = cartesianProduct(validProposals.toList).view.zipWithIndex
         .map {
-          case (proposalsToUse, index) =>
+          case (proposalsToUse, _) =>
             var props = splitProposals(proposalsToUse, allContent).toList
             var scored = List[Proposal]()
             var scores = List[Option[Double]]()

diff --git a/src/main/scala/org/allenai/pdffigures2/FigureExtractor.scala b/src/main/scala/org/allenai/pdffigures2/FigureExtractor.scala
@@ -1,13 +1,12 @@
 package org.allenai.pdffigures2
 
-import org.allenai.pdffigures2.FigureExtractor.{
-  Document,
-  DocumentContent,
-  DocumentWithRasterizedFigures
-}
-import org.allenai.pdffigures2.SectionedTextBuilder.{ DocumentSection, PdfText }
-
-import com.typesafe.config.ConfigFactory
+import org.allenai.pdffigures2.FigureExtractor.Document
+import org.allenai.pdffigures2.FigureExtractor.DocumentContent
+import org.allenai.pdffigures2.FigureExtractor.DocumentWithRasterizedFigures
+import org.allenai.pdffigures2.SectionedTextBuilder.DocumentSection
+import org.allenai.pdffigures2.SectionedTextBuilder.PdfText
+import org.apache.pdfbox.Loader
+import org.apache.pdfbox.io.RandomAccessReadBuffer
 import org.apache.pdfbox.pdmodel.PDDocument
 
 import java.io.InputStream
@@ -209,9 +208,9 @@ object FigureExtractor {
     pagesWithFigures: Seq[PageWithFigures],
     pagesWithoutFigures: Seq[PageWithClassifiedText]
   ) {
-    val pages = (pagesWithFigures ++ pagesWithoutFigures).sortBy(_.pageNumber)
-    def figures = pagesWithFigures.flatMap(_.figures)
-    def failedCaptions = pagesWithFigures.flatMap(_.failedCaptions)
+    val pages: Seq[ClassifiedPage] = (pagesWithFigures ++ pagesWithoutFigures).sortBy(_.pageNumber)
+    def figures: Seq[Figure] = pagesWithFigures.flatMap(_.figures)
+    def failedCaptions: Seq[Caption] = pagesWithFigures.flatMap(_.failedCaptions)
     require(pages.head.pageNumber == 0, "Must start with page number 0")
     require(
       pages
@@ -239,9 +238,9 @@ object FigureExtractor {
     private val figureExtractor = new FigureExtractor(true, true, true, true, true)
 
     def fromInputStream(is: InputStream): Document =
-      fromPDDocument(PDDocument.load(is))
+      fromPDDocument(Loader.loadPDF(new RandomAccessReadBuffer(is)))
 
-    def fromPDDocument(pdDocument: PDDocument) =
+    def fromPDDocument(pdDocument: PDDocument): Document =
       figureExtractor.getFiguresWithText(pdDocument)
   }
diff --git a/.gitignore b/.gitignore
@@ -9,4 +9,8 @@ evaluation/datasets/**/*.jpeg
 evaluation/datasets/**/*.jpg
 evaluation/datasets/**/*.pgm
 
-*.pkl
+*.pkl
+
+/.bsp
+
+*.jar
diff --git a/project/plugins.sbt b/project/plugins.sbt
@@ -3,3 +3,6 @@ addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "1.2.0")
 addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.6")
 
 addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.6.1")
+
+addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.13.0")
+