2727import java .nio .file .Path ;
2828import java .util .ArrayList ;
2929import java .util .Collection ;
30- import java .util .HashMap ;
3130import java .util .HashSet ;
3231import java .util .List ;
3332import java .util .Map ;
3433import java .util .Set ;
34+ import java .util .concurrent .ConcurrentHashMap ;
3535import java .util .logging .Level ;
3636import java .util .logging .Logger ;
3737
4141import org .apache .lucene .index .IndexReader ;
4242import org .apache .lucene .index .IndexableField ;
4343import org .apache .lucene .index .MultiBits ;
44- import org .apache .lucene .index .MultiTerms ;
4544import org .apache .lucene .index .SegmentInfos ;
46- import org .apache .lucene .index .Terms ;
47- import org .apache .lucene .index .TermsEnum ;
4845import org .apache .lucene .store .Directory ;
4946import org .apache .lucene .store .FSDirectory ;
5047import org .apache .lucene .store .LockFactory ;
5148import org .apache .lucene .store .NativeFSLockFactory ;
5249import org .apache .lucene .store .NoLockFactory ;
5350import org .apache .lucene .util .Bits ;
54- import org .apache .lucene .util .BytesRef ;
5551import org .apache .lucene .util .Version ;
5652import org .jetbrains .annotations .NotNull ;
53+ import org .jetbrains .annotations .Nullable ;
5754import org .opengrok .indexer .configuration .Configuration ;
5855import org .opengrok .indexer .logger .LoggerFactory ;
5956import org .opengrok .indexer .search .QueryBuilder ;
@@ -109,6 +106,12 @@ public static class IndexDocumentException extends Exception {
109106
110107 private final Map <String , Integer > fileMap ;
111108
/**
 * Creates an exception that carries only a message; the {@code fileMap}
 * field is set to {@code null} for instances built this way.
 *
 * @param s exception message
 */
public IndexDocumentException(String s) {
    super(s);
    this.fileMap = null;
}
114+
112115 public IndexDocumentException (String s , Map <String , Integer > fileMap ) {
113116 super (s );
114117
@@ -117,7 +120,7 @@ public IndexDocumentException(String s, Map<String, Integer> fileMap) {
117120
118121 @ Override
119122 public String toString () {
120- return getMessage () + ": " + fileMap ;
123+ return getMessage () + ": " + ( fileMap == null ? "" : fileMap ) ;
121124 }
122125 }
123126
@@ -214,7 +217,7 @@ public static void checkDir(Path indexPath, IndexCheckMode mode, String projectN
214217 }
215218
216219 if (mode .ordinal () >= IndexCheckMode .DOCUMENTS .ordinal ()) {
217- checkDuplicateDocuments (indexPath , projectName );
220+ checkDuplicateDocuments (indexPath );
218221 }
219222 }
220223
@@ -257,67 +260,71 @@ public static Set<String> getDeletedUids(Path indexPath) throws IOException {
257260
258261 /**
259262 * @param indexPath path to index
260- * @param projectName project name, can be empty
261- * @return list of live document paths
263+ * @return list of live document paths (some of them can be duplicate if the index is corrupted)
264+ * or {@code null} if live documents cannot be retrieved.
262265 * @throws IOException on I/O error
263266 */
264- public static List <String > getLiveDocumentPaths (Path indexPath , String projectName ) throws IOException {
267+ @ Nullable
268+ public static List <String > getLiveDocumentPaths (Path indexPath ) throws IOException {
265269 try (IndexReader indexReader = getIndexReader (indexPath )) {
266- Terms terms = MultiTerms .getTerms (indexReader , QueryBuilder .U );
267- TermsEnum uidIter = terms .iterator ();
268- String dir = "/" + projectName ;
269- String startUid = Util .path2uid (dir , "" );
270- uidIter .seekCeil (new BytesRef (startUid ));
271- final BytesRef emptyBR = new BytesRef ("" );
272- // paths of live (i.e. not deleted) documents. Must be a list so that duplicate documents can be checked.
273270 List <String > livePaths = new ArrayList <>();
274- Set <String > deletedUids = getDeletedUids (indexPath );
275271
276- while (uidIter != null && uidIter .term () != null && uidIter .term ().compareTo (emptyBR ) != 0 ) {
277- String termValue = uidIter .term ().utf8ToString ();
278- String termPath = Util .uid2url (termValue );
272+ Bits liveDocs = MultiBits .getLiveDocs (indexReader );
273+ if (liveDocs == null ) { // the index has no deletions
274+ return null ;
275+ }
279276
280- if (deletedUids .contains (termValue )) {
281- BytesRef next = uidIter .next ();
282- if (next == null ) {
283- uidIter = null ;
284- }
277+ for (int i = 0 ; i < indexReader .maxDoc (); i ++) {
278+ Document doc = indexReader .storedFields ().document (i );
279+
280+ if (!liveDocs .get (i )) {
285281 continue ;
286282 }
287283
288- livePaths . add ( termPath );
289-
290- BytesRef next = uidIter . next ();
291- if ( next == null ) {
292- uidIter = null ;
284+ // This should avoid the special LOC documents.
285+ IndexableField field = doc . getField ( QueryBuilder . U );
286+ if ( field != null ) {
287+ String uid = field . stringValue ();
288+ livePaths . add ( Util . uid2url ( uid )) ;
293289 }
294290 }
295291
296292 return livePaths ;
297293 }
298294 }
299295
300- private static void checkDuplicateDocuments (Path indexPath , String projectName )
296+ private static void checkDuplicateDocuments (Path indexPath )
301297 throws IOException , IndexDocumentException {
302298
303299 LOGGER .log (Level .FINE , "Checking duplicate documents in ''{0}''" , indexPath );
304300 Statistics stat = new Statistics ();
305- List <String > livePaths = getLiveDocumentPaths (indexPath , projectName );
301+ List <String > livePaths = getLiveDocumentPaths (indexPath );
302+ if (livePaths == null ) {
303+ throw new IndexDocumentException (String .format ("cannot determine live paths for '%s'" , indexPath ));
304+ }
306305 HashSet <String > pathSet = new HashSet <>(livePaths );
307- HashMap <String , Integer > fileMap = new HashMap <>();
306+ Map <String , Integer > fileMap = new ConcurrentHashMap <>();
308307 if (pathSet .size () != livePaths .size ()) {
309308 LOGGER .log (Level .FINE ,
310309 "index in ''{0}'' has document path set ({1}) vs document list ({2}) discrepancy" ,
311310 new Object []{indexPath , pathSet .size (), livePaths .size ()});
312311 for (String path : livePaths ) {
313312 if (pathSet .contains (path )) {
314- LOGGER .log (Level .FINER , "duplicate path: ''{0}''" , path );
315313 fileMap .putIfAbsent (path , 0 );
316314 fileMap .put (path , fileMap .get (path ) + 1 );
317315 }
318316 }
317+ }
319318
319+ // Traverse the file map and leave only duplicate entries.
320+ for (String path : fileMap .keySet ()) {
321+ if (fileMap .get (path ) > 1 ) {
322+ LOGGER .log (Level .FINER , "duplicate path: ''{0}''" , path );
323+ } else {
324+ fileMap .remove (path );
325+ }
320326 }
327+
321328 stat .report (LOGGER , Level .FINE , String .format ("duplicate check in '%s' done" , indexPath ));
322329 if (!fileMap .isEmpty ()) {
323330 throw new IndexDocumentException (String .format ("index in '%s' contains duplicate live documents" ,
0 commit comments