4
4
"log"
5
5
"path/filepath"
6
6
"sort"
7
+ "strings"
7
8
"sync"
8
9
"unicode/utf8"
9
10
@@ -48,6 +49,10 @@ const (
48
49
// RenameAnalysisSetSizeLimit is the maximum number of added + removed files for
49
50
// RenameAnalysisMaxCandidates to be active; the bigger numbers set it to 1.
50
51
RenameAnalysisSetSizeLimit = 1000
52
+
53
+ // RenameAnalysisByteDiffSizeThreshold is the maximum size of each of the compared parts
54
+ // to be diff-ed on byte level.
55
+ RenameAnalysisByteDiffSizeThreshold = 100000
51
56
)
52
57
53
58
// Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
@@ -367,12 +372,12 @@ func (ra *RenameAnalysis) sizesAreClose(size1 int64, size2 int64) bool {
367
372
}
368
373
369
374
func (ra * RenameAnalysis ) blobsAreClose (blob1 * CachedBlob , blob2 * CachedBlob ) (bool , error ) {
375
+ cleanReturn := false
370
376
defer func () {
371
- if err := recover (); err != nil {
377
+ if ! cleanReturn {
372
378
log .Println ()
373
379
log .Println (blob1 .Hash .String ())
374
380
log .Println (blob2 .Hash .String ())
375
- panic (err )
376
381
}
377
382
}()
378
383
_ , err1 := blob1 .CountLines ()
@@ -382,6 +387,7 @@ func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (b
382
387
bsdifflen := DiffBytes (blob1 .Data , blob2 .Data )
383
388
delta := int ((int64 (bsdifflen ) * 100 ) / internal .Max64 (
384
389
internal .Min64 (blob1 .Size , blob2 .Size ), 1 ))
390
+ cleanReturn = true
385
391
return 100 - delta >= ra .SimilarityThreshold , nil
386
392
}
387
393
src , dst := string (blob1 .Data ), string (blob2 .Data )
@@ -390,72 +396,104 @@ func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (b
390
396
// compute the line-by-line diff, then the char-level diffs of the del-ins blocks
391
397
// yes, this algorithm is greedy and not exact
392
398
dmp := diffmatchpatch .New ()
393
- srcLines , dstLines , lines := dmp .DiffLinesToRunes (src , dst )
394
- diffs := dmp .DiffMainRunes (srcLines , dstLines , false )
399
+ srcLineRunes , dstLineRunes , _ := dmp .DiffLinesToRunes (src , dst )
400
+ // the third returned value, []string, is the mapping from runes to lines
401
+ // we cannot use it because it is approximate and has string collisions
402
+ // that is, the mapping is wrong for huge files
403
+ diffs := dmp .DiffMainRunes (srcLineRunes , dstLineRunes , false )
404
+
405
+ srcPositions := calcLinePositions (src )
406
+ dstPositions := calcLinePositions (dst )
395
407
var common , posSrc , prevPosSrc , posDst int
396
408
possibleDelInsBlock := false
397
409
for _ , edit := range diffs {
398
410
switch edit .Type {
399
411
case diffmatchpatch .DiffDelete :
400
412
possibleDelInsBlock = true
401
413
prevPosSrc = posSrc
402
- for _ , lineno := range edit .Text {
403
- posSrc += len (lines [lineno ])
404
- }
414
+ posSrc += utf8 .RuneCountInString (edit .Text )
405
415
case diffmatchpatch .DiffInsert :
406
- nextPosDst := posDst
407
- for _ , lineno := range edit .Text {
408
- nextPosDst += len (lines [lineno ])
409
- }
416
+ nextPosDst := posDst + utf8 .RuneCountInString (edit .Text )
410
417
if possibleDelInsBlock {
411
418
possibleDelInsBlock = false
412
- localDmp := diffmatchpatch .New ()
413
- localSrc := src [prevPosSrc :posSrc ]
414
- localDst := dst [posDst :nextPosDst ]
415
- localDiffs := localDmp .DiffMainRunes ([]rune (localSrc ), []rune (localDst ), false )
416
- for _ , localEdit := range localDiffs {
417
- if localEdit .Type == diffmatchpatch .DiffEqual {
418
- common += utf8 .RuneCountInString (localEdit .Text )
419
+ if internal .Max (srcPositions [posSrc ]- srcPositions [prevPosSrc ],
420
+ dstPositions [nextPosDst ]- dstPositions [posDst ]) < RenameAnalysisByteDiffSizeThreshold {
421
+ localDmp := diffmatchpatch .New ()
422
+ localSrc := src [srcPositions [prevPosSrc ]:srcPositions [posSrc ]]
423
+ localDst := dst [dstPositions [posDst ]:dstPositions [nextPosDst ]]
424
+ localDiffs := localDmp .DiffMainRunes (
425
+ strToLiteralRunes (localSrc ), strToLiteralRunes (localDst ), false )
426
+ for _ , localEdit := range localDiffs {
427
+ if localEdit .Type == diffmatchpatch .DiffEqual {
428
+ common += utf8 .RuneCountInString (localEdit .Text )
429
+ }
419
430
}
420
431
}
421
432
}
422
433
posDst = nextPosDst
423
434
case diffmatchpatch .DiffEqual :
424
435
possibleDelInsBlock = false
425
- for _ , lineno := range edit .Text {
426
- common += utf8 . RuneCountInString ( lines [ lineno ])
427
- step := len ( lines [ lineno ])
428
- posSrc += step
429
- posDst += step
436
+ step := utf8 . RuneCountInString ( edit .Text )
437
+ // `for i := range edit.Text` does *not* work here:
438
+ // ranging over a string yields byte offsets, not rune indices, so `i` can exceed the number of runes
439
+ for i := 0 ; i < step ; i ++ {
440
+ common += srcPositions [ posSrc + i + 1 ] - srcPositions [ posSrc + i ]
430
441
}
442
+ posSrc += step
443
+ posDst += step
431
444
}
432
445
if possibleDelInsBlock {
433
446
continue
434
447
}
435
448
// supposing that the rest of the lines are the same (they are not - too optimistic),
436
449
// estimate the maximum similarity and exit the loop if it lower than our threshold
437
450
var srcPendingSize , dstPendingSize int
438
- if posSrc < len (src ) {
439
- srcPendingSize = utf8 .RuneCountInString (src [posSrc :])
440
- }
441
- if posDst < len (dst ) {
442
- dstPendingSize = utf8 .RuneCountInString (dst [posDst :])
443
- }
451
+ srcPendingSize = len (src ) - srcPositions [posSrc ]
452
+ dstPendingSize = len (dst ) - dstPositions [posDst ]
444
453
maxCommon := common + internal .Min (srcPendingSize , dstPendingSize )
445
454
similarity := (maxCommon * 100 ) / maxSize
446
455
if similarity < ra .SimilarityThreshold {
456
+ cleanReturn = true
447
457
return false , nil
448
458
}
449
459
similarity = (common * 100 ) / maxSize
450
460
if similarity >= ra .SimilarityThreshold {
461
+ cleanReturn = true
451
462
return true , nil
452
463
}
453
464
}
454
465
// the very last "overly optimistic" estimate was actually precise, so since we are still here
455
466
// the blobs are similar
467
+ cleanReturn = true
456
468
return true , nil
457
469
}
458
470
471
// calcLinePositions returns the byte offset at which every "\n"-separated line of
// text starts, followed by one final element equal to len(text).
//
// For a non-empty text containing k newline bytes the result has k+2 elements:
// element 0 is always 0, element i (1 <= i <= k) is the offset right after the
// i-th newline, and the last element is len(text). Consequently
// positions[i+1]-positions[i] is the byte length of line i including its
// terminating newline, if it has one. A text that ends with "\n" yields a
// trailing zero-length line, so its last two elements are both len(text).
//
// The empty text is special-cased and yields []int{0}.
func calcLinePositions(text string) []int {
	if text == "" {
		return []int{0}
	}
	// One entry per line start (newlines + 1) plus the end-of-text sentinel.
	positions := make([]int, 0, strings.Count(text, "\n")+2)
	positions = append(positions, 0)
	// Walk newline by newline instead of splitting: only the offsets are needed,
	// so there is no reason to materialize a slice of line strings.
	for offset := 0; ; {
		idx := strings.IndexByte(text[offset:], '\n')
		if idx < 0 {
			break
		}
		offset += idx + 1 // +1 steps over the newline itself
		positions = append(positions, offset)
	}
	return append(positions, len(text))
}
488
+
489
// strToLiteralRunes widens every byte of s into its own rune, without any UTF-8
// decoding. The result always has exactly len(s) elements, each in [0, 255].
func strToLiteralRunes(s string) []rune {
	size := len(s)
	widened := make([]rune, size)
	for pos := 0; pos < size; pos++ {
		// Indexing a string yields the raw byte at that offset.
		widened[pos] = rune(s[pos])
	}
	return widened
}
496
+
459
497
type sortableChange struct {
460
498
change * object.Change
461
499
hash plumbing.Hash
0 commit comments