From c14309a2c065459d7e3ce03f11170e65364577c0 Mon Sep 17 00:00:00 2001 From: Gus Hahn-Powell Date: Thu, 10 Feb 2022 01:00:55 -0700 Subject: [PATCH 1/4] Paginate using Lucene Doc ID No need to use an OdinsonScoreDoc when a Doc ID will suffice --- .../ai/lum/odinson/ExtractorEngine.scala | 29 +++++-------------- .../index/IncrementalOdinsonIndex.scala | 4 +-- .../index/OdinsonCollectorManager.scala | 2 +- .../odinson/lucene/index/OdinsonIndex.scala | 2 +- .../lucene/search/OdinsonCollector.scala | 14 --------- .../lucene/search/OdinsonIndexSearcher.scala | 12 ++++---- .../scala/ai/lum/odinson/extra/Shell.scala | 15 ++++++---- 7 files changed, 28 insertions(+), 50 deletions(-) diff --git a/core/src/main/scala/ai/lum/odinson/ExtractorEngine.scala b/core/src/main/scala/ai/lum/odinson/ExtractorEngine.scala index 2a6b503e..37a6cb47 100644 --- a/core/src/main/scala/ai/lum/odinson/ExtractorEngine.scala +++ b/core/src/main/scala/ai/lum/odinson/ExtractorEngine.scala @@ -136,20 +136,7 @@ class ExtractorEngine private ( * @return */ def query(odinsonQuery: OdinsonQuery, n: Int, disableMatchSelector: Boolean): OdinResults = { - query(odinsonQuery, n, null, disableMatchSelector) - } - - /** Executes an OdinsonQuery and returns an OdinResult - * with the next `n` matched lucene documents and their corresponding matches, - * starting after the last lucene document in the OdinResults `after`. - * - * @param odinsonQuery - * @param n number of desired lucene documents - * @param after an OdinResults with a set of lucene documents - * @return - */ - def query(odinsonQuery: OdinsonQuery, n: Int, after: OdinResults): OdinResults = { - query(odinsonQuery, n, after.scoreDocs.last) + query(odinsonQuery, n, -1, disableMatchSelector) } /** Executes an OdinsonQuery and returns an OdinResult @@ -158,10 +145,10 @@ class ExtractorEngine private ( * * @param odinsonQuery * @param n number of desired lucene documents - * @param after the last lucene document to ignore + * @param after the ID of the last lucene document to ignore * @return */ - def query(odinsonQuery: OdinsonQuery, n: Int, after: OdinsonScoreDoc): OdinResults = { + def query(odinsonQuery: OdinsonQuery, n: Int, after: Int): OdinResults = { query(odinsonQuery, n, after, false) } @@ -185,7 +172,7 @@ class ExtractorEngine private ( def query( odinsonQuery: OdinsonQuery, n: Int, - after: OdinsonScoreDoc, + after: Int, disableMatchSelector: Boolean ): OdinResults = { val odinResults = @@ -203,7 +190,7 @@ class ExtractorEngine private ( } private def odinSearch( - after: OdinsonScoreDoc, + after: Int, query: OdinsonQuery, numHits: Int, disableMatchSelector: Boolean @@ -211,8 +198,8 @@ class ExtractorEngine private ( val limit = math.max(1, index.maxDoc()) require( - after == null || after.doc < limit, - s"after.doc exceeds the number of documents in the reader: after.doc=${after.doc} limit=${limit}" + after < limit, + s"after exceeds the number of documents in the reader: after=${after} limit=${limit}" ) val cappedNumHits = math.min(numHits, limit) index.search(after, query, cappedNumHits, disableMatchSelector) @@ -350,7 +337,7 @@ class ExtractorEngine private ( disableMatchSelector: Boolean, mruIdGetter: MostRecentlyUsed[Int, LazyIdGetter] ): Iterator[Mention] = { - val odinResults = query(extractor.query, numSentences, null, disableMatchSelector) + val odinResults = query(extractor.query, numSentences, -1, disableMatchSelector) new MentionsIterator( extractor.label, Some(extractor.name), diff --git a/core/src/main/scala/ai/lum/odinson/lucene/index/IncrementalOdinsonIndex.scala b/core/src/main/scala/ai/lum/odinson/lucene/index/IncrementalOdinsonIndex.scala index 6302f550..ae1fa5d1 100644 --- a/core/src/main/scala/ai/lum/odinson/lucene/index/IncrementalOdinsonIndex.scala +++ b/core/src/main/scala/ai/lum/odinson/lucene/index/IncrementalOdinsonIndex.scala @@ -265,13 +265,13 @@ class IncrementalOdinsonIndex( } override def search( - scoreDoc: OdinsonScoreDoc, + luceneDocId: Int, query: OdinsonQuery, cappedHits: Int, disableMatchSelector: Boolean ): OdinResults = { val manager = - new OdinsonCollectorManager(scoreDoc, cappedHits, computeTotalHits, disableMatchSelector) + new OdinsonCollectorManager(luceneDocId, cappedHits, computeTotalHits, disableMatchSelector) this.search(query, manager) } diff --git a/core/src/main/scala/ai/lum/odinson/lucene/index/OdinsonCollectorManager.scala b/core/src/main/scala/ai/lum/odinson/lucene/index/OdinsonCollectorManager.scala index 860e726b..d5cf36c2 100644 --- a/core/src/main/scala/ai/lum/odinson/lucene/index/OdinsonCollectorManager.scala +++ b/core/src/main/scala/ai/lum/odinson/lucene/index/OdinsonCollectorManager.scala @@ -8,7 +8,7 @@ import java.util.Collection import scala.collection.JavaConverters._ class OdinsonCollectorManager( - after: OdinsonScoreDoc, + after: Int, cappedNumHits: Int, computeTotalHits: Boolean, disableMatchSelector: Boolean diff --git a/core/src/main/scala/ai/lum/odinson/lucene/index/OdinsonIndex.scala b/core/src/main/scala/ai/lum/odinson/lucene/index/OdinsonIndex.scala index 51c260a1..7094c4cb 100644 --- a/core/src/main/scala/ai/lum/odinson/lucene/index/OdinsonIndex.scala +++ b/core/src/main/scala/ai/lum/odinson/lucene/index/OdinsonIndex.scala @@ -116,7 +116,7 @@ trait OdinsonIndex { ): ResultType def search( - scoreDoc: OdinsonScoreDoc, + luceneDocId: Int, query: OdinsonQuery, cappedHits: Int, disableMatchSelector: Boolean diff --git a/core/src/main/scala/ai/lum/odinson/lucene/search/OdinsonCollector.scala b/core/src/main/scala/ai/lum/odinson/lucene/search/OdinsonCollector.scala index 4dc377cc..b6c29cdb 100644 --- a/core/src/main/scala/ai/lum/odinson/lucene/search/OdinsonCollector.scala +++ b/core/src/main/scala/ai/lum/odinson/lucene/search/OdinsonCollector.scala @@ -23,20 +23,6 @@ class OdinsonCollector( this(numHits, -1, computeTotalHits, disableMatchSelector) } - def this( - numHits: Int, - afterDoc: OdinsonScoreDoc, - computeTotalHits: Boolean, - disableMatchSelector: Boolean - ) = { - this( - numHits, - if (afterDoc == null) -1 else afterDoc.doc, - computeTotalHits, - disableMatchSelector - ) - } - private var totalHits: Int = 0 private var collectedHits: Int = 0 diff --git a/core/src/main/scala/ai/lum/odinson/lucene/search/OdinsonIndexSearcher.scala b/core/src/main/scala/ai/lum/odinson/lucene/search/OdinsonIndexSearcher.scala index f2653517..02a68fc2 100644 --- a/core/src/main/scala/ai/lum/odinson/lucene/search/OdinsonIndexSearcher.scala +++ b/core/src/main/scala/ai/lum/odinson/lucene/search/OdinsonIndexSearcher.scala @@ -34,15 +34,15 @@ class OdinsonIndexSearcher( } def odinSearch(query: OdinsonQuery, n: Int): OdinResults = { - odinSearch(null, query, n) + odinSearch(-1, query, n) } - def odinSearch(after: OdinsonScoreDoc, query: OdinsonQuery, numHits: Int): OdinResults = { + def odinSearch(after: Int, query: OdinsonQuery, numHits: Int): OdinResults = { odinSearch(after, query, numHits, false) } class StandardCollectorManager( - after: OdinsonScoreDoc, + after: Int, cappedNumHits: Int, disableMatchSelector: Boolean ) extends CollectorManager[OdinsonCollector, OdinResults] { @@ -60,15 +60,15 @@ class OdinsonIndexSearcher( } def odinSearch( - after: OdinsonScoreDoc, + after: Int, query: OdinsonQuery, numHits: Int, disableMatchSelector: Boolean ): OdinResults = { val limit = math.max(1, readerContext.reader().maxDoc()) require( - after == null || after.doc < limit, - s"after.doc exceeds the number of documents in the reader: after.doc=${after.doc} limit=${limit}" + after < limit, + s"after exceeds the number of documents in the reader: after=${after} limit=${limit}" ) val cappedNumHits = math.min(numHits, limit) val manager = new StandardCollectorManager(after, cappedNumHits, disableMatchSelector) diff --git a/extra/src/main/scala/ai/lum/odinson/extra/Shell.scala b/extra/src/main/scala/ai/lum/odinson/extra/Shell.scala index b3a2324a..000db453 100644 --- a/extra/src/main/scala/ai/lum/odinson/extra/Shell.scala +++ b/extra/src/main/scala/ai/lum/odinson/extra/Shell.scala @@ -78,7 +78,7 @@ object Shell extends App { val matchTextToParse = """^:mkDoc\s+(.*)""".r var query: String = null - var after: OdinsonScoreDoc = null + var after: Int = -1 var shownHits: Int = 0 var totalHits: Int = 0 @@ -171,13 +171,18 @@ object Shell extends App { println(json) } + def getLastDocId(scoreDocs: Array[OdinsonScoreDoc]): Int = scoreDocs.lastOption match { + case Some(sd) => sd.doc + case _ => -1 + } + /** searches for pattern and prints the first n matches */ def search(n: Int): Unit = { val start = System.currentTimeMillis() val q = extractorEngine.compiler.mkQuery(query) val results = extractorEngine.query(q, n) val duration = (System.currentTimeMillis() - start) / 1000f - after = results.scoreDocs.lastOption.getOrElse(null) + after = getLastDocId(results.scoreDocs) totalHits = results.totalHits shownHits = math.min(n, totalHits) printResultsPage(results, 1, totalHits, duration) @@ -185,7 +190,7 @@ object Shell extends App { /** prints the next n matches */ def printMore(n: Int): Unit = { - if (after == null) { + if (after == -1) { println("there is no active query") return } @@ -197,8 +202,8 @@ object Shell extends App { val q = extractorEngine.compiler.mkQuery(query) val results = extractorEngine.query(q, n, after) val duration = (System.currentTimeMillis() - start) / 1000f - after = results.scoreDocs.lastOption.getOrElse(null) - if (after == null) { + after = getLastDocId(results.scoreDocs) + if (after == -1) { println("no more results") return } From a89c10a3458b7155ad42f47029bd3d7594527af7 Mon Sep 17 00:00:00 2001 From: Gus Hahn-Powell Date: Thu, 10 Feb 2022 01:06:40 -0700 Subject: [PATCH 2/4] Correct test to use Lucene Doc ID-based pagination --- .../src/test/scala/ai/lum/odinson/events/TestEvents.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/core/src/test/scala/ai/lum/odinson/events/TestEvents.scala b/core/src/test/scala/ai/lum/odinson/events/TestEvents.scala index ebcd773e..146c1fcc 100644 --- a/core/src/test/scala/ai/lum/odinson/events/TestEvents.scala +++ b/core/src/test/scala/ai/lum/odinson/events/TestEvents.scala @@ -1,7 +1,7 @@ package ai.lum.odinson.events import ai.lum.odinson.lucene.OdinResults -import ai.lum.odinson.lucene.search.{ OdinsonQuery, OdinsonScoreDoc } +import ai.lum.odinson.lucene.search.OdinsonQuery import ai.lum.odinson.test.utils.OdinsonTest import ai.lum.odinson.utils.exceptions.OdinsonException import ai.lum.odinson.{ EventMatch, MentionsIterator } @@ -197,7 +197,7 @@ class TestEvents extends OdinsonTest { labelOpt: Option[String] = None, nameOpt: Option[String] = None, n: Int, - after: OdinsonScoreDoc, + after: Int, disableMatchSelector: Boolean ): OdinResults = { val odinResults = ee.query(odinsonQuery, n, after, disableMatchSelector) @@ -214,14 +214,14 @@ class TestEvents extends OdinsonTest { labelOpt = Some("NP"), nameOpt = None, 1, - after = null, + after = -1, disableMatchSelector = false ) results1.totalHits should equal(1) results1.scoreDocs.head.matches should have size 2 // This query only needs to read from the state. - val results2 = ee.query(q2, 1, after = null, disableMatchSelector = false) + val results2 = ee.query(q2, 1, after = -1, disableMatchSelector = false) results2.totalHits should equal(1) results2.scoreDocs.head.matches should have size 1 From 274027881c2d0ba97f14045ad1e7a8898dcd20fb Mon Sep 17 00:00:00 2001 From: Gus Hahn-Powell Date: Thu, 10 Feb 2022 01:21:16 -0700 Subject: [PATCH 3/4] Formatting --- extra/src/main/scala/ai/lum/odinson/extra/Shell.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extra/src/main/scala/ai/lum/odinson/extra/Shell.scala b/extra/src/main/scala/ai/lum/odinson/extra/Shell.scala index 000db453..9524f111 100644 --- a/extra/src/main/scala/ai/lum/odinson/extra/Shell.scala +++ b/extra/src/main/scala/ai/lum/odinson/extra/Shell.scala @@ -173,7 +173,7 @@ object Shell extends App { def getLastDocId(scoreDocs: Array[OdinsonScoreDoc]): Int = scoreDocs.lastOption match { case Some(sd) => sd.doc - case _ => -1 + case _ => -1 } /** searches for pattern and prints the first n matches */ From c222232d52335c7d58695569a8ec18e3da6a76ae Mon Sep 17 00:00:00 2001 From: Gus Hahn-Powell Date: Thu, 10 Feb 2022 01:21:35 -0700 Subject: [PATCH 4/4] Use a non-RC version of SBT (resolves #369) --- project/build.properties | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project/build.properties b/project/build.properties index 9cb72522..3161d214 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1 +1 @@ -sbt.version=1.6.0-RC1 +sbt.version=1.6.1