Skip to content

Commit c1002f4

Browse files
authored
Merge pull request #195 from vmarkovtsev/master
Fix renames once again
2 parents 146e39f + f90c235 commit c1002f4

11 files changed

+140
-40
lines changed

internal/core/forks.go

+1
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ const (
8989

9090
// planPrintFunc is used to print the execution plan in prepareRunPlan().
9191
var planPrintFunc = func(args ...interface{}) {
92+
fmt.Fprintln(os.Stderr)
9293
fmt.Fprintln(os.Stderr, args...)
9394
}
9495

internal/core/pipeline.go

-1
Original file line numberDiff line numberDiff line change
@@ -757,7 +757,6 @@ func (pipeline *Pipeline) Run(commits []*object.Commit) (map[LeafPipelineItem]in
757757
continue
758758
}
759759
if pipeline.PrintActions {
760-
fmt.Fprintln(os.Stderr)
761760
printAction(step)
762761
}
763762
if index > 0 && index%100 == 0 && pipeline.HibernationDistance > 0 {

internal/core/pipeline_test.go

+4
Original file line numberDiff line numberDiff line change
@@ -537,9 +537,13 @@ func TestPipelineDumpPlanConfigure(t *testing.T) {
537537
pipeline.Initialize(map[string]interface{}{ConfigPipelineDumpPlan: true})
538538
assert.True(t, pipeline.DumpPlan)
539539
stream := &bytes.Buffer{}
540+
backupPlanPrintFunc := planPrintFunc
540541
planPrintFunc = func(args ...interface{}) {
541542
fmt.Fprintln(stream, args...)
542543
}
544+
defer func() {
545+
planPrintFunc = backupPlanPrintFunc
546+
}()
543547
commits := make([]*object.Commit, 1)
544548
commits[0], _ = test.Repository.CommitObject(plumbing.NewHash(
545549
"af9ddc0db70f09f3f27b4b98e415592a7485171c"))

internal/plumbing/day.go

+11-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package plumbing
22

33
import (
4+
"log"
45
"time"
56

67
"gopkg.in/src-d/go-git.v4"
@@ -13,6 +14,7 @@ import (
1314
// It is a PipelineItem.
1415
type DaysSinceStart struct {
1516
core.NoopMerger
17+
remote string
1618
day0 *time.Time
1719
previousDay int
1820
commits map[int][]plumbing.Hash
@@ -75,6 +77,9 @@ func (days *DaysSinceStart) Initialize(repository *git.Repository) error {
7577
delete(days.commits, key)
7678
}
7779
}
80+
if r, err := repository.Remotes(); err == nil && len(r) > 0 {
81+
days.remote = r[0].Config().URLs[0]
82+
}
7883
return nil
7984
}
8085

@@ -88,9 +93,13 @@ func (days *DaysSinceStart) Consume(deps map[string]interface{}) (map[string]int
8893
index := deps[core.DependencyIndex].(int)
8994
if index == 0 {
9095
// first iteration - initialize the file objects from the tree
91-
*days.day0 = commit.Committer.When
9296
// our precision is 1 day
93-
*days.day0 = days.day0.Truncate(24 * time.Hour)
97+
*days.day0 = commit.Committer.When.Truncate(24 * time.Hour)
98+
if days.day0.Unix() < 631152000 { // 01.01.1990, that was 30 years ago
99+
log.Println()
100+
log.Printf("Warning: suspicious committer timestamp in %s > %s",
101+
days.remote, commit.Hash.String())
102+
}
94103
}
95104
day := int(commit.Committer.When.Sub(*days.day0).Hours() / 24)
96105
if day < days.previousDay {

internal/plumbing/day_test.go

+31
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
11
package plumbing
22

33
import (
4+
"bytes"
5+
"log"
6+
"os"
47
"testing"
8+
"time"
59

610
"github.com/stretchr/testify/assert"
711
"gopkg.in/src-d/go-git.v4/plumbing"
@@ -123,3 +127,30 @@ func TestDaysSinceStartFork(t *testing.T) {
123127
// just for the sake of it
124128
dss1.Merge([]core.PipelineItem{dss2})
125129
}
130+
131+
func TestDaysSinceStartConsumeZero(t *testing.T) {
132+
dss := fixtureDaysSinceStart()
133+
deps := map[string]interface{}{}
134+
commit, _ := test.Repository.CommitObject(plumbing.NewHash(
135+
"cce947b98a050c6d356bc6ba95030254914027b1"))
136+
commit.Committer.When = time.Unix(0, 0)
137+
deps[core.DependencyCommit] = commit
138+
deps[core.DependencyIndex] = 0
139+
// print warning to log
140+
myOutput := &bytes.Buffer{}
141+
log.SetOutput(myOutput)
142+
defer func() {
143+
log.SetOutput(os.Stderr)
144+
}()
145+
res, err := dss.Consume(deps)
146+
assert.Nil(t, err)
147+
assert.Contains(t, myOutput.String(), "Warning")
148+
assert.Contains(t, myOutput.String(), "cce947b98a050c6d356bc6ba95030254914027b1")
149+
assert.Contains(t, myOutput.String(), "hercules")
150+
assert.Contains(t, myOutput.String(), "github.com")
151+
assert.Equal(t, res[DependencyDay].(int), 0)
152+
assert.Equal(t, dss.previousDay, 0)
153+
assert.Equal(t, dss.day0.Year(), 1970)
154+
assert.Equal(t, dss.day0.Minute(), 0)
155+
assert.Equal(t, dss.day0.Second(), 0)
156+
}

internal/plumbing/renames.go

+67-29
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"log"
55
"path/filepath"
66
"sort"
7+
"strings"
78
"sync"
89
"unicode/utf8"
910

@@ -48,6 +49,10 @@ const (
4849
// RenameAnalysisSetSizeLimit is the maximum number of added + removed files for
4950
// RenameAnalysisMaxCandidates to be active; the bigger numbers set it to 1.
5051
RenameAnalysisSetSizeLimit = 1000
52+
53+
// RenameAnalysisByteDiffSizeThreshold is the maximum size of each of the compared parts
54+
// to be diff-ed on byte level.
55+
RenameAnalysisByteDiffSizeThreshold = 100000
5156
)
5257

5358
// Name of this PipelineItem. Uniquely identifies the type, used for mapping keys, etc.
@@ -367,12 +372,12 @@ func (ra *RenameAnalysis) sizesAreClose(size1 int64, size2 int64) bool {
367372
}
368373

369374
func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (bool, error) {
375+
cleanReturn := false
370376
defer func() {
371-
if err := recover(); err != nil {
377+
if !cleanReturn {
372378
log.Println()
373379
log.Println(blob1.Hash.String())
374380
log.Println(blob2.Hash.String())
375-
panic(err)
376381
}
377382
}()
378383
_, err1 := blob1.CountLines()
@@ -382,6 +387,7 @@ func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (b
382387
bsdifflen := DiffBytes(blob1.Data, blob2.Data)
383388
delta := int((int64(bsdifflen) * 100) / internal.Max64(
384389
internal.Min64(blob1.Size, blob2.Size), 1))
390+
cleanReturn = true
385391
return 100-delta >= ra.SimilarityThreshold, nil
386392
}
387393
src, dst := string(blob1.Data), string(blob2.Data)
@@ -390,72 +396,104 @@ func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (b
390396
// compute the line-by-line diff, then the char-level diffs of the del-ins blocks
391397
// yes, this algorithm is greedy and not exact
392398
dmp := diffmatchpatch.New()
393-
srcLines, dstLines, lines := dmp.DiffLinesToRunes(src, dst)
394-
diffs := dmp.DiffMainRunes(srcLines, dstLines, false)
399+
srcLineRunes, dstLineRunes, _ := dmp.DiffLinesToRunes(src, dst)
400+
// the third returned value, []string, is the mapping from runes to lines
401+
// we cannot use it because it is approximate and has string collisions
402+
// that is, the mapping is wrong for huge files
403+
diffs := dmp.DiffMainRunes(srcLineRunes, dstLineRunes, false)
404+
405+
srcPositions := calcLinePositions(src)
406+
dstPositions := calcLinePositions(dst)
395407
var common, posSrc, prevPosSrc, posDst int
396408
possibleDelInsBlock := false
397409
for _, edit := range diffs {
398410
switch edit.Type {
399411
case diffmatchpatch.DiffDelete:
400412
possibleDelInsBlock = true
401413
prevPosSrc = posSrc
402-
for _, lineno := range edit.Text {
403-
posSrc += len(lines[lineno])
404-
}
414+
posSrc += utf8.RuneCountInString(edit.Text)
405415
case diffmatchpatch.DiffInsert:
406-
nextPosDst := posDst
407-
for _, lineno := range edit.Text {
408-
nextPosDst += len(lines[lineno])
409-
}
416+
nextPosDst := posDst + utf8.RuneCountInString(edit.Text)
410417
if possibleDelInsBlock {
411418
possibleDelInsBlock = false
412-
localDmp := diffmatchpatch.New()
413-
localSrc := src[prevPosSrc:posSrc]
414-
localDst := dst[posDst:nextPosDst]
415-
localDiffs := localDmp.DiffMainRunes([]rune(localSrc), []rune(localDst), false)
416-
for _, localEdit := range localDiffs {
417-
if localEdit.Type == diffmatchpatch.DiffEqual {
418-
common += utf8.RuneCountInString(localEdit.Text)
419+
if internal.Max(srcPositions[posSrc]-srcPositions[prevPosSrc],
420+
dstPositions[nextPosDst]-dstPositions[posDst]) < RenameAnalysisByteDiffSizeThreshold {
421+
localDmp := diffmatchpatch.New()
422+
localSrc := src[srcPositions[prevPosSrc]:srcPositions[posSrc]]
423+
localDst := dst[dstPositions[posDst]:dstPositions[nextPosDst]]
424+
localDiffs := localDmp.DiffMainRunes(
425+
strToLiteralRunes(localSrc), strToLiteralRunes(localDst), false)
426+
for _, localEdit := range localDiffs {
427+
if localEdit.Type == diffmatchpatch.DiffEqual {
428+
common += utf8.RuneCountInString(localEdit.Text)
429+
}
419430
}
420431
}
421432
}
422433
posDst = nextPosDst
423434
case diffmatchpatch.DiffEqual:
424435
possibleDelInsBlock = false
425-
for _, lineno := range edit.Text {
426-
common += utf8.RuneCountInString(lines[lineno])
427-
step := len(lines[lineno])
428-
posSrc += step
429-
posDst += step
436+
step := utf8.RuneCountInString(edit.Text)
437+
// for i := range edit.Text does *not* work
438+
// idk why, but `i` appears to be bigger than the number of runes
439+
for i := 0; i < step; i++ {
440+
common += srcPositions[posSrc+i+1] - srcPositions[posSrc+i]
430441
}
442+
posSrc += step
443+
posDst += step
431444
}
432445
if possibleDelInsBlock {
433446
continue
434447
}
435448
// supposing that the rest of the lines are the same (they are not - too optimistic),
436449
// estimate the maximum similarity and exit the loop if it lower than our threshold
437450
var srcPendingSize, dstPendingSize int
438-
if posSrc < len(src) {
439-
srcPendingSize = utf8.RuneCountInString(src[posSrc:])
440-
}
441-
if posDst < len(dst) {
442-
dstPendingSize = utf8.RuneCountInString(dst[posDst:])
443-
}
451+
srcPendingSize = len(src) - srcPositions[posSrc]
452+
dstPendingSize = len(dst) - dstPositions[posDst]
444453
maxCommon := common + internal.Min(srcPendingSize, dstPendingSize)
445454
similarity := (maxCommon * 100) / maxSize
446455
if similarity < ra.SimilarityThreshold {
456+
cleanReturn = true
447457
return false, nil
448458
}
449459
similarity = (common * 100) / maxSize
450460
if similarity >= ra.SimilarityThreshold {
461+
cleanReturn = true
451462
return true, nil
452463
}
453464
}
454465
// the very last "overly optimistic" estimate was actually precise, so since we are still here
455466
// the blobs are similar
467+
cleanReturn = true
456468
return true, nil
457469
}
458470

471+
func calcLinePositions(text string) []int {
472+
if text == "" {
473+
return []int{0}
474+
}
475+
lines := strings.Split(text, "\n")
476+
positions := make([]int, len(lines)+1)
477+
accum := 0
478+
for i, l := range lines {
479+
positions[i] = accum
480+
accum += len(l) + 1 // +1 for \n
481+
}
482+
if len(lines) > 0 && lines[len(lines)-1] != "\n" {
483+
accum--
484+
}
485+
positions[len(lines)] = accum
486+
return positions
487+
}
488+
489+
func strToLiteralRunes(s string) []rune {
490+
lrunes := make([]rune, len(s))
491+
for i, b := range []byte(s) {
492+
lrunes[i] = rune(b)
493+
}
494+
return lrunes
495+
}
496+
459497
type sortableChange struct {
460498
change *object.Change
461499
hash plumbing.Hash

internal/plumbing/renames_test.go

+23-5
Original file line numberDiff line numberDiff line change
@@ -264,20 +264,25 @@ func TestBlobsAreCloseBinary(t *testing.T) {
264264
assert.False(t, result)
265265
}
266266

267-
func TestBlobsAreCloseBug(t *testing.T) {
268-
gzsource, err := os.Open(path.Join("..", "test_data", "rename_bug.xml.gz"))
267+
func loadData(t *testing.T, name string) []byte {
268+
gzsource, err := os.Open(path.Join("..", "test_data", name))
269269
defer gzsource.Close()
270270
if err != nil {
271-
t.Errorf("open ../test_data/rename_bug.xml.gz: %v", err)
271+
t.Errorf("open ../test_data/%s: %v", name, err)
272272
}
273273
gzreader, err := gzip.NewReader(gzsource)
274274
if err != nil {
275-
t.Errorf("gzip ../test_data/rename_bug.xml.gz: %v", err)
275+
t.Errorf("gzip ../test_data/%s: %v", name, err)
276276
}
277277
data, err := ioutil.ReadAll(gzreader)
278278
if err != nil {
279-
t.Errorf("gzip ../test_data/rename_bug.xml.gz: %v", err)
279+
t.Errorf("gzip ../test_data/%s: %v", name, err)
280280
}
281+
return data
282+
}
283+
284+
func TestBlobsAreCloseBug1(t *testing.T) {
285+
data := loadData(t, "rename_bug1.xml.gz")
281286
blob1 := &CachedBlob{Data: data}
282287
blob2 := &CachedBlob{Data: data}
283288
blob1.Size = int64(len(data))
@@ -287,3 +292,16 @@ func TestBlobsAreCloseBug(t *testing.T) {
287292
assert.Nil(t, err)
288293
assert.True(t, result)
289294
}
295+
296+
func TestBlobsAreCloseBug2(t *testing.T) {
297+
data1 := loadData(t, "rename_bug2.base.gz")
298+
data2 := loadData(t, "rename_bug2.head.gz")
299+
blob1 := &CachedBlob{Data: data1}
300+
blob2 := &CachedBlob{Data: data2}
301+
blob1.Size = int64(len(data1))
302+
blob2.Size = int64(len(data2))
303+
ra := fixtureRenameAnalysis()
304+
result, err := ra.blobsAreClose(blob1, blob2)
305+
assert.Nil(t, err)
306+
assert.False(t, result)
307+
}

internal/test/repository.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,14 @@ package test
22

33
import (
44
"io"
5+
"io/ioutil"
56
"os"
7+
"path"
68

7-
git "gopkg.in/src-d/go-git.v4"
9+
"gopkg.in/src-d/go-git.v4"
810
"gopkg.in/src-d/go-git.v4/plumbing"
911
"gopkg.in/src-d/go-git.v4/plumbing/object"
1012
"gopkg.in/src-d/go-git.v4/storage/memory"
11-
"io/ioutil"
12-
"path"
1313
)
1414

1515
// Repository is a boilerplate sample repository (Hercules itself).
File renamed without changes.
125 KB
Binary file not shown.
111 KB
Binary file not shown.

0 commit comments

Comments
 (0)