1818 */
1919
2020/*
21- * Copyright (c) 2018, 2023 , Oracle and/or its affiliates. All rights reserved.
21+ * Copyright (c) 2018, 2024 , Oracle and/or its affiliates. All rights reserved.
2222 * Portions Copyright (c) 2018, Chris Fraire <cfraire@me.com>.
2323 */
2424package org .opengrok .indexer .index ;
3838import java .util .Map ;
3939import java .util .Objects ;
4040import java .util .Set ;
41+ import java .util .TreeSet ;
4142import java .util .concurrent .ConcurrentHashMap ;
4243import java .util .concurrent .ExecutionException ;
4344import java .util .concurrent .ExecutorService ;
6364import org .apache .lucene .util .Bits ;
6465import org .apache .lucene .util .Version ;
6566import org .jetbrains .annotations .NotNull ;
66- import org .jetbrains .annotations .Nullable ;
6767import org .jetbrains .annotations .VisibleForTesting ;
6868import org .opengrok .indexer .analysis .Definitions ;
6969import org .opengrok .indexer .configuration .Configuration ;
@@ -232,7 +232,9 @@ public void check(IndexCheckMode mode) throws IOException, IndexCheckException {
232232 /**
233233 * Perform specified check on given index directory. All exceptions except {@code IOException} are swallowed
234234 * and result in return value of 1.
235+ * @param sourcePath source root path
235236 * @param indexPath directory with index
237+ * @param mode index check mode
236238 * @throws IOException on I/O error
237239 * @throws IndexCheckException if the index failed given check
238240 */
@@ -270,7 +272,7 @@ void checkDir(Path sourcePath, Path indexPath, IndexCheckMode mode)
270272 checkVersion (sourcePath , indexPath );
271273 break ;
272274 case DOCUMENTS :
273- checkDuplicateDocuments (sourcePath , indexPath );
275+ checkDocuments (sourcePath , indexPath );
274276 break ;
275277 case DEFINITIONS :
276278 checkDefinitions (sourcePath , indexPath );
@@ -410,14 +412,14 @@ private void checkDefinitions(Path sourcePath, Path indexPath) throws IOExceptio
410412 errors ++;
411413 }
412414 } catch (Exception e ) {
413- LOGGER .log (Level .WARNING , "failure when checking definitions" , e );
415+ LOGGER .log (Level .WARNING , String . format ( "failure when checking definitions for '%s'" , indexPath ) , e );
414416 final Throwable cause = e .getCause ();
415417 if (cause instanceof IOException ) {
416418 ioException = (IOException ) cause ;
417419 }
418420 }
419421 }
420- statistics .report (LOGGER , Level .FINE , String .format ("checked %d files" , paths .size ()));
422+ statistics .report (LOGGER , Level .FINE , String .format ("checked %d files for '%s' " , paths .size (), indexPath ));
421423
422424 // If there were multiple cases of IOException, they were logged above.
423425 // Propagate the last one so that upper layers can properly decide on how to treat the index check.
@@ -426,17 +428,17 @@ private void checkDefinitions(Path sourcePath, Path indexPath) throws IOExceptio
426428 }
427429
428430 if (errors > 0 ) {
429- throw new IndexDocumentException (String .format ("document check failed for (%d documents out of %d)" ,
430- errors , paths .size ()), indexPath );
431+ throw new IndexDocumentException (String .format ("definitions check failed for '%s' (%d documents out of %d)" ,
432+ indexPath , errors , paths .size ()), sourcePath );
431433 }
432434 }
433435
434436 /**
435- * @param sourcePath path to the source
436- * @param indexPath path to the index directory
437- * @throws IOException on I/O error
437+ * @param sourcePath source path
438+ * @param indexPath path to the index directory
439+ * @throws IOException on I/O error
438440 * @throws IndexVersionException if the version stored in the document does not match the version
439- * used by the running program
441+ * used by the running program
440442 */
441443 private void checkVersion (Path sourcePath , Path indexPath ) throws IOException , IndexVersionException {
442444 LockFactory lockFactory = NativeFSLockFactory .INSTANCE ;
@@ -456,7 +458,7 @@ private void checkVersion(Path sourcePath, Path indexPath) throws IOException, I
456458 new Object []{indexPath , segVersion , Version .LATEST .major });
457459 if (segVersion != Version .LATEST .major ) {
458460 throw new IndexVersionException (
459- String .format ("Index for '%s' has index version discrepancy" , sourcePath ), sourcePath ,
461+ String .format ("Index in '%s' has index version discrepancy" , indexPath ), sourcePath ,
460462 Version .LATEST .major , segVersion );
461463 }
462464 }
@@ -506,72 +508,88 @@ static Set<String> getDeletedUids(Path indexPath) throws IOException {
506508 * (never {@code null}; the list is empty if no live document paths are found).
507509 * @throws IOException on I/O error
508510 */
509- @ Nullable
510511 @ VisibleForTesting
511- static List <String > getLiveDocumentPaths (Path indexPath ) throws IOException {
512+ static List <Path > getLiveDocumentPaths (Path indexPath ) throws IOException {
512513 try (IndexReader indexReader = getIndexReader (indexPath )) {
513- List <String > livePaths = new ArrayList <>();
514+ List <Path > livePaths = new ArrayList <>();
514515
515516 Bits liveDocs = MultiBits .getLiveDocs (indexReader );
516- if (liveDocs == null ) { // the index has no deletions
517- return null ;
518- }
519517
520518 for (int i = 0 ; i < indexReader .maxDoc (); i ++) {
521519 Document doc = indexReader .storedFields ().document (i );
522520
523- if (!liveDocs .get (i )) {
521+ // liveDocs is null if the index has no deletions.
522+ if (liveDocs != null && !liveDocs .get (i )) {
524523 continue ;
525524 }
526525
527526 // This should avoid the special LOC documents.
528527 IndexableField field = doc .getField (QueryBuilder .U );
529528 if (field != null ) {
530529 String uid = field .stringValue ();
531- livePaths .add (Util .uid2url (uid ));
530+ livePaths .add (Path . of ( Util .uid2url (uid ) ));
532531 }
533532 }
534533
535534 return livePaths ;
536535 }
537536 }
538537
539- private static void checkDuplicateDocuments (Path sourcePath , Path indexPath ) throws IOException , IndexDocumentException {
538+ /**
539+ * Check live (not deleted) documents in the index whether they have the following properties.
540+ * <ul>
541+ * <li>they have corresponding file under source root</li>
542+ * <li>there is exactly one document with the same path</li>
543+ * </ul>
544+ * @param sourcePath source root path
545+ * @param indexPath index path
546+ * @throws IOException on I/O error
547+ * @throws IndexDocumentException if the index failed the check
548+ */
549+ private void checkDocuments (Path sourcePath , Path indexPath ) throws IOException , IndexDocumentException {
540550
541- LOGGER .log (Level .FINE , "Checking duplicate documents in ''{0}''" , indexPath );
542551 Statistics stat = new Statistics ();
543- List <String > livePaths = getLiveDocumentPaths (indexPath );
544- if (livePaths == null ) {
545- throw new IndexDocumentException (String .format ("cannot determine live paths for '%s'" , indexPath ),
546- indexPath );
552+ List <Path > livePaths = getLiveDocumentPaths (indexPath );
553+
554+ LOGGER .log (Level .FINE , "checking documents in ''{0}}'' have corresponding file under source root ''{1}''" ,
555+ new Object []{indexPath , sourcePath });
556+ Set <Path > missingPaths = new TreeSet <>();
557+ for (Path relativePath : livePaths ) {
558+ Path absolutePath = Path .of (configuration .getSourceRoot (), relativePath .toString ());
559+ if (!Files .exists (absolutePath )) {
560+ LOGGER .log (Level .FINER , "path ''{0}'' does not exist" , absolutePath );
561+ missingPaths .add (absolutePath );
562+ }
547563 }
548- HashSet <String > pathSet = new HashSet <>(livePaths );
549- Map <String , Integer > fileMap = new ConcurrentHashMap <>();
564+
565+ LOGGER .log (Level .FINE , "Checking duplicate documents in ''{0}''" , indexPath );
566+ HashSet <Path > pathSet = new HashSet <>(livePaths );
567+ Map <Path , Integer > duplicatePathMap = new ConcurrentHashMap <>();
550568 if (pathSet .size () != livePaths .size ()) {
551569 LOGGER .log (Level .FINE ,
552570 "index in ''{0}'' has document path set ({1}) vs document list ({2}) discrepancy" ,
553571 new Object []{indexPath , pathSet .size (), livePaths .size ()});
554- for (String path : livePaths ) {
572+ for (Path path : livePaths ) {
555573 if (pathSet .contains (path )) {
556- fileMap .putIfAbsent (path , 0 );
557- fileMap .put (path , fileMap .get (path ) + 1 );
574+ duplicatePathMap .putIfAbsent (path , 0 );
575+ duplicatePathMap .put (path , duplicatePathMap .get (path ) + 1 );
558576 }
559577 }
560578 }
561579
562580 // Traverse the file map and leave only duplicate entries.
563- for (String path : fileMap .keySet ()) {
564- if (fileMap .get (path ) > 1 ) {
581+ for (Path path : duplicatePathMap .keySet ()) {
582+ if (duplicatePathMap .get (path ) > 1 ) {
565583 LOGGER .log (Level .FINER , "duplicate path: ''{0}''" , path );
566584 } else {
567- fileMap .remove (path );
585+ duplicatePathMap .remove (path );
568586 }
569587 }
570588
571- stat .report (LOGGER , Level .FINE , String .format ("duplicate check in '%s' done" , indexPath ));
572- if (!fileMap .isEmpty ()) {
573- throw new IndexDocumentException (String .format ("index for '%s' contains duplicate live documents " ,
574- sourcePath ), sourcePath , fileMap );
589+ stat .report (LOGGER , Level .FINE , String .format ("document check in '%s' done" , indexPath ));
590+ if (!duplicatePathMap . isEmpty () || ! missingPaths .isEmpty ()) {
591+ throw new IndexDocumentException (String .format ("index '%s' failed document check " ,
592+ indexPath ), sourcePath , duplicatePathMap , missingPaths );
575593 }
576594 }
577595}
0 commit comments