Skip to content

Commit e03f209

Browse files
authored
Merge pull request #264 from vmarkovtsev/master
Fix merging identities
2 parents ace56f7 + 568345e commit e03f209

File tree

7 files changed

+283
-108
lines changed

7 files changed

+283
-108
lines changed

.travis.yml

+4-5
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ go_import_path: gopkg.in/src-d/hercules.v10
99
go:
1010
- 1.10.x
1111
- 1.11.x
12+
- 1.12.x
1213

1314
services:
1415
- docker
@@ -26,8 +27,6 @@ before_cache:
2627

2728
matrix:
2829
fast_finish: true
29-
allow_failures:
30-
- go: tip
3130

3231
stages:
3332
- test
@@ -60,7 +59,7 @@ install:
6059
- travis_retry make TAGS=tensorflow
6160
script:
6261
- set -e
63-
- if [ $TRAVIS_GO_VERSION == "1.11.x" ]; then test -z "$(gofmt -s -l . | grep -v vendor/)"; fi
62+
- if [ $TRAVIS_GO_VERSION != "1.10.x" ]; then test -z "$(gofmt -s -l . | grep -v vendor/)"; fi
6463
- go vet -tags tensorflow ./...
6564
- golint -set_exit_status $(go list ./... | grep -v /vendor/)
6665
- cd python && flake8 && cd ..
@@ -90,7 +89,7 @@ jobs:
9089
- stage: deploy
9190
os: osx
9291
osx_image: xcode9.3
93-
go: 1.11.x
92+
go: 1.12.x
9493
go_import_path: gopkg.in/src-d/hercules.v10
9594
before_install:
9695
- wget -O protoc.zip https://github.com/google/protobuf/releases/download/v$PROTOC_VERSION/protoc-$PROTOC_VERSION-osx-x86_64.zip
@@ -112,7 +111,7 @@ jobs:
112111
tags: true
113112
- stage: deploy
114113
os: linux
115-
go: 1.11.x
114+
go: 1.12.x
116115
go_import_path: gopkg.in/src-d/hercules.v10
117116
before_install:
118117
- wget -O protoc.zip https://github.com/google/protobuf/releases/download/v$PROTOC_VERSION/protoc-$PROTOC_VERSION-linux-x86_64.zip

internal/plumbing/identity/identity.go

+175-16
Original file line numberDiff line numberDiff line change
@@ -265,33 +265,192 @@ func (detector *Detector) GeneratePeopleDict(commits []*object.Commit) {
265265
detector.ReversedPeopleDict = reverseDict
266266
}
267267

268-
// MergeReversedDicts joins two identity lists together, excluding duplicates, in-order.
269-
// The returned mapping's values are: final index, index in first, index in second (-1 for a miss).
270-
func (detector Detector) MergeReversedDicts(rd1, rd2 []string) (map[string][3]int, []string) {
271-
people := map[string][3]int{}
268+
// MergedIndex is the result of merging `rd1[First]` and `rd2[Second]`: the index in the final reversed
269+
// dictionary. -1 for `First` or `Second` means that the corresponding string does not exist
270+
// in respectively `rd1` and `rd2`.
271+
// See also:
272+
// * MergeReversedDictsLiteral()
273+
// * MergeReversedDictsIdentities()
274+
type MergedIndex struct {
275+
Final int
276+
First int
277+
Second int
278+
}
279+
280+
// MergeReversedDictsLiteral joins two string lists together, excluding duplicates, in-order.
281+
// The string comparisons are the usual ones.
282+
// The returned mapping's keys are the unique strings in `rd1 ∪ rd2`, and the values are:
283+
// 1. Index after merging.
284+
// 2. Corresponding index in the first array - `rd1`. -1 means that it does not exist.
285+
// 3. Corresponding index in the second array - `rd2`. -1 means that it does not exist.
286+
func MergeReversedDictsLiteral(rd1, rd2 []string) (map[string]MergedIndex, []string) {
287+
288+
people := map[string]MergedIndex{}
272289
for i, pid := range rd1 {
273-
ptrs := people[pid]
274-
ptrs[0] = len(people)
275-
ptrs[1] = i
276-
ptrs[2] = -1
277-
people[pid] = ptrs
290+
people[pid] = MergedIndex{len(people), i, -1}
278291
}
279292
for i, pid := range rd2 {
280-
ptrs, exists := people[pid]
281-
if !exists {
282-
ptrs[0] = len(people)
283-
ptrs[1] = -1
293+
if ptrs, exists := people[pid]; !exists {
294+
people[pid] = MergedIndex{len(people), -1, i}
295+
} else {
296+
people[pid] = MergedIndex{ptrs.Final, ptrs.First, i}
284297
}
285-
ptrs[2] = i
286-
people[pid] = ptrs
287298
}
288299
mrd := make([]string, len(people))
289300
for name, ptrs := range people {
290-
mrd[ptrs[0]] = name
301+
mrd[ptrs.Final] = name
291302
}
292303
return people, mrd
293304
}
294305

306+
type identityPair struct {
307+
Index1 int
308+
Index2 int
309+
}
310+
311+
// MergeReversedDictsIdentities joins two identity lists together, excluding duplicates.
312+
// The strings are split by "|" and we find the connected components..
313+
// The returned mapping's keys are the unique strings in `rd1 ∪ rd2`, and the values are:
314+
// 1. Index after merging.
315+
// 2. Corresponding index in the first array - `rd1`. -1 means that it does not exist.
316+
// 3. Corresponding index in the second array - `rd2`. -1 means that it does not exist.
317+
func MergeReversedDictsIdentities(rd1, rd2 []string) (map[string]MergedIndex, []string) {
318+
319+
vocabulary := map[string]identityPair{}
320+
vertices1 := make([][]string, len(rd1))
321+
for i, s := range rd1 {
322+
parts := strings.Split(s, "|")
323+
vertices1[i] = parts
324+
for _, p := range parts {
325+
vocabulary[p] = identityPair{i, -1}
326+
}
327+
}
328+
vertices2 := make([][]string, len(rd2))
329+
for i, s := range rd2 {
330+
parts := strings.Split(s, "|")
331+
vertices2[i] = parts
332+
for _, p := range parts {
333+
if ip, exists := vocabulary[p]; !exists {
334+
vocabulary[p] = identityPair{-1, i}
335+
} else {
336+
ip.Index2 = i
337+
vocabulary[p] = ip
338+
}
339+
}
340+
}
341+
342+
// find the connected components by walking the graph
343+
var walks []map[string]bool
344+
visited := map[string]bool{}
345+
346+
walkFromVertex := func(root []string) {
347+
walk := map[string]bool{}
348+
pending := map[string]bool{}
349+
for _, p := range root {
350+
pending[p] = true
351+
}
352+
for len(pending) > 0 {
353+
var element string
354+
for e := range pending {
355+
element = e
356+
delete(pending, e)
357+
break
358+
}
359+
if !walk[element] {
360+
walk[element] = true
361+
ip := vocabulary[element]
362+
if ip.Index1 >= 0 {
363+
for _, p := range vertices1[ip.Index1] {
364+
if !walk[p] {
365+
pending[p] = true
366+
}
367+
}
368+
}
369+
if ip.Index2 >= 0 {
370+
for _, p := range vertices2[ip.Index2] {
371+
if !walk[p] {
372+
pending[p] = true
373+
}
374+
}
375+
}
376+
}
377+
}
378+
for e := range walk {
379+
visited[e] = true
380+
}
381+
walks = append(walks, walk)
382+
}
383+
384+
for i1 := range rd1 {
385+
var skip bool
386+
for _, p := range vertices1[i1] {
387+
if visited[p] {
388+
skip = true
389+
break
390+
}
391+
}
392+
if skip {
393+
continue
394+
}
395+
walkFromVertex(vertices1[i1])
396+
}
397+
for i2 := range rd2 {
398+
var skip bool
399+
for _, p := range vertices2[i2] {
400+
if visited[p] {
401+
skip = true
402+
break
403+
}
404+
}
405+
if skip {
406+
continue
407+
}
408+
walkFromVertex(vertices2[i2])
409+
}
410+
411+
mergedStrings := make([]string, 0, len(walks))
412+
mergedIndex := map[string]MergedIndex{}
413+
// convert each walk from strings to indexes
414+
for walkIndex, walk := range walks {
415+
ids := make([]string, 0, len(walk))
416+
for key := range walk {
417+
ids = append(ids, key)
418+
}
419+
// place emails after names
420+
sort.Slice(ids, func(i, j int) bool {
421+
iid := ids[i]
422+
jid := ids[j]
423+
iHasAt := strings.ContainsRune(iid, '@')
424+
jHasAt := strings.ContainsRune(jid, '@')
425+
if iHasAt == jHasAt {
426+
return iid < jid
427+
}
428+
return jHasAt
429+
})
430+
mergedStrings = append(mergedStrings, strings.Join(ids, "|"))
431+
for _, key := range ids {
432+
ipair := vocabulary[key]
433+
if ipair.Index1 >= 0 {
434+
s1 := rd1[ipair.Index1]
435+
if mi, exists := mergedIndex[s1]; !exists {
436+
mergedIndex[s1] = MergedIndex{walkIndex, ipair.Index1, -1}
437+
} else {
438+
mergedIndex[s1] = MergedIndex{walkIndex, ipair.Index1, mi.Second}
439+
}
440+
}
441+
if ipair.Index2 >= 0 {
442+
s2 := rd2[ipair.Index2]
443+
if mi, exists := mergedIndex[s2]; !exists {
444+
mergedIndex[s2] = MergedIndex{walkIndex, -1, ipair.Index2}
445+
} else {
446+
mergedIndex[s2] = MergedIndex{walkIndex, mi.First, ipair.Index2}
447+
}
448+
}
449+
}
450+
}
451+
return mergedIndex, mergedStrings
452+
}
453+
295454
func init() {
296455
core.Registry.Register(&Detector{})
297456
}

internal/plumbing/identity/identity_test.go

+56-23
Original file line numberDiff line numberDiff line change
@@ -46,11 +46,11 @@ func TestIdentityDetectorMeta(t *testing.T) {
4646
func TestIdentityDetectorConfigure(t *testing.T) {
4747
id := fixtureIdentityDetector()
4848
facts := map[string]interface{}{}
49-
m1 := map[string]int{}
50-
m2 := []string{}
49+
m1 := map[string]int{"one": 0}
50+
m2 := []string{"one"}
5151
facts[FactIdentityDetectorPeopleDict] = m1
5252
facts[FactIdentityDetectorReversedPeopleDict] = m2
53-
id.Configure(facts)
53+
assert.Nil(t, id.Configure(facts))
5454
assert.Equal(t, m1, facts[FactIdentityDetectorPeopleDict])
5555
assert.Equal(t, m2, facts[FactIdentityDetectorReversedPeopleDict])
5656
assert.Equal(t, id.PeopleDict, facts[FactIdentityDetectorPeopleDict])
@@ -66,15 +66,15 @@ Vadim|[email protected]`)
6666
delete(facts, FactIdentityDetectorPeopleDict)
6767
delete(facts, FactIdentityDetectorReversedPeopleDict)
6868
facts[ConfigIdentityDetectorPeopleDictPath] = tmpf.Name()
69-
id.Configure(facts)
69+
assert.Nil(t, id.Configure(facts))
7070
assert.Len(t, id.PeopleDict, 2)
7171
assert.Len(t, id.ReversedPeopleDict, 1)
7272
assert.Equal(t, id.ReversedPeopleDict[0], "Vadim")
7373
delete(facts, FactIdentityDetectorPeopleDict)
7474
delete(facts, FactIdentityDetectorReversedPeopleDict)
7575
id = fixtureIdentityDetector()
7676
id.PeopleDict = nil
77-
id.Configure(facts)
77+
assert.Nil(t, id.Configure(facts))
7878
assert.Equal(t, id.PeopleDict, facts[FactIdentityDetectorPeopleDict])
7979
assert.Equal(t, id.ReversedPeopleDict, facts[FactIdentityDetectorReversedPeopleDict])
8080
assert.Len(t, id.PeopleDict, 4)
@@ -85,7 +85,7 @@ Vadim|[email protected]`)
8585
delete(facts, FactIdentityDetectorReversedPeopleDict)
8686
id = fixtureIdentityDetector()
8787
id.ReversedPeopleDict = nil
88-
id.Configure(facts)
88+
assert.Nil(t, id.Configure(facts))
8989
assert.Equal(t, id.PeopleDict, facts[FactIdentityDetectorPeopleDict])
9090
assert.Equal(t, id.ReversedPeopleDict, facts[FactIdentityDetectorReversedPeopleDict])
9191
assert.Len(t, id.PeopleDict, 4)
@@ -108,7 +108,7 @@ Vadim|[email protected]`)
108108
id = fixtureIdentityDetector()
109109
id.PeopleDict = nil
110110
id.ReversedPeopleDict = nil
111-
id.Configure(facts)
111+
assert.Nil(t, id.Configure(facts))
112112
assert.Equal(t, id.PeopleDict, facts[FactIdentityDetectorPeopleDict])
113113
assert.Equal(t, id.ReversedPeopleDict, facts[FactIdentityDetectorReversedPeopleDict])
114114
assert.True(t, len(id.PeopleDict) >= 3)
@@ -381,26 +381,59 @@ func TestIdentityDetectorGeneratePeopleDictMailmap(t *testing.T) {
381381
"strange guy|vadim markovtsev|[email protected]|[email protected]")
382382
}
383383

384-
func TestIdentityDetectorMergeReversedDicts(t *testing.T) {
385-
pa1 := [...]string{"one", "two"}
386-
pa2 := [...]string{"two", "three"}
387-
people, merged := Detector{}.MergeReversedDicts(pa1[:], pa2[:])
384+
func TestIdentityDetectorMergeReversedDictsLiteral(t *testing.T) {
385+
pa1 := [...]string{"one|one@one", "two|aaa@two"}
386+
pa2 := [...]string{"two|aaa@two", "three|one@one"}
387+
people, merged := MergeReversedDictsLiteral(pa1[:], pa2[:])
388388
assert.Len(t, people, 3)
389389
assert.Len(t, merged, 3)
390-
assert.Equal(t, people["one"], [3]int{0, 0, -1})
391-
assert.Equal(t, people["two"], [3]int{1, 1, 0})
392-
assert.Equal(t, people["three"], [3]int{2, -1, 1})
393-
vm := [...]string{"one", "two", "three"}
394-
assert.Equal(t, merged, vm[:])
395-
pa1 = [...]string{"two", "one"}
396-
people, merged = Detector{}.MergeReversedDicts(pa1[:], pa2[:])
390+
assert.Equal(t, people["one|one@one"], MergedIndex{0, 0, -1})
391+
assert.Equal(t, people["two|aaa@two"], MergedIndex{1, 1, 0})
392+
assert.Equal(t, people["three|one@one"], MergedIndex{2, -1, 1})
393+
assert.Equal(t, merged, []string{"one|one@one", "two|aaa@two", "three|one@one"})
394+
pa1 = [...]string{"two|aaa@two", "one|one@one"}
395+
people, merged = MergeReversedDictsLiteral(pa1[:], pa2[:])
397396
assert.Len(t, people, 3)
398397
assert.Len(t, merged, 3)
399-
assert.Equal(t, people["one"], [3]int{1, 1, -1})
400-
assert.Equal(t, people["two"], [3]int{0, 0, 0})
401-
assert.Equal(t, people["three"], [3]int{2, -1, 1})
402-
vm = [...]string{"two", "one", "three"}
403-
assert.Equal(t, merged, vm[:])
398+
assert.Equal(t, people["one|one@one"], MergedIndex{1, 1, -1})
399+
assert.Equal(t, people["two|aaa@two"], MergedIndex{0, 0, 0})
400+
assert.Equal(t, people["three|one@one"], MergedIndex{2, -1, 1})
401+
assert.Equal(t, merged, []string{"two|aaa@two", "one|one@one", "three|one@one"})
402+
}
403+
404+
func TestIdentityDetectorMergeReversedDictsIdentities(t *testing.T) {
405+
pa1 := [...]string{"one|one@one", "two|aaa@two"}
406+
pa2 := [...]string{"two|aaa@two", "three|one@one"}
407+
people, merged := MergeReversedDictsIdentities(pa1[:], pa2[:])
408+
assert.Len(t, people, 3)
409+
assert.Len(t, merged, 2)
410+
assert.Equal(t, people["one|one@one"], MergedIndex{0, 0, -1})
411+
assert.Equal(t, people["two|aaa@two"], MergedIndex{1, 1, 0})
412+
assert.Equal(t, people["three|one@one"], MergedIndex{0, -1, 1})
413+
assert.Equal(t, merged, []string{"one|three|one@one", "two|aaa@two"})
414+
}
415+
416+
func TestIdentityDetectorMergeReversedDictsIdentitiesStrikeBack(t *testing.T) {
417+
pa1 := [...]string{"one|one@one", "two|aaa@two", "three|three@three"}
418+
pa2 := [...]string{"two|aaa@two", "three|one@one"}
419+
people, merged := MergeReversedDictsIdentities(pa1[:], pa2[:])
420+
assert.Len(t, people, 4)
421+
assert.Len(t, merged, 2)
422+
assert.Equal(t, people["one|one@one"], MergedIndex{0, 0, -1})
423+
assert.Equal(t, people["two|aaa@two"], MergedIndex{1, 1, 0})
424+
assert.Equal(t, people["three|one@one"], MergedIndex{0, -1, 1})
425+
assert.Equal(t, people["three|three@three"], MergedIndex{0, 2, -1})
426+
assert.Equal(t, merged, []string{"one|three|one@one|three@three", "two|aaa@two"})
427+
428+
pa1 = [...]string{"one|one@one", "two|aaa@two", "three|aaa@two"}
429+
people, merged = MergeReversedDictsIdentities(pa1[:], pa2[:])
430+
assert.Len(t, people, 4)
431+
assert.Len(t, merged, 1)
432+
assert.Equal(t, people["one|one@one"], MergedIndex{0, 0, -1})
433+
assert.Equal(t, people["two|aaa@two"], MergedIndex{0, 1, 0})
434+
assert.Equal(t, people["three|one@one"], MergedIndex{0, -1, 1})
435+
assert.Equal(t, people["three|aaa@two"], MergedIndex{0, 2, -1})
436+
assert.Equal(t, merged, []string{"one|three|two|aaa@two|one@one"})
404437
}
405438

406439
func TestIdentityDetectorFork(t *testing.T) {

0 commit comments

Comments
 (0)