Skip to content

Commit 17fb758

Browse files
authored
Merge pull request #171 from vmarkovtsev/master
Add binary renames detection
2 parents 008ced6 + bacd9fc commit 17fb758

File tree

5 files changed

+389
-2
lines changed

5 files changed

+389
-2
lines changed

.travis.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ before_install:
4848
- unzip -d ~/.local protoc.zip && rm protoc.zip
4949
- go get -v golang.org/x/lint/golint
5050
- go get -v github.com/golang/dep/cmd/dep
51-
- (wget https://bootstrap.pypa.io/get-pip.py || wget https://raw.githubusercontent.com/pypa/get-pip/master/get-pip.py) && python3 get-pip.py --user && rm get-pip.py
51+
- (wget -O - https://bootstrap.pypa.io/get-pip.py || wget -O - https://raw.githubusercontent.com/pypa/get-pip/master/get-pip.py) | python3 - --user pip==18.1
5252
- export PATH=~/usr/bin:$GOPATH/bin:$PATH
5353
- make --version
5454
- pip3 --version

Dockerfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ echo\n\
2828
echo " $@"\n\
2929
echo\n\' > /browser && \
3030
chmod +x /browser && \
31-
curl https://bootstrap.pypa.io/get-pip.py | python3 && \
31+
curl https://bootstrap.pypa.io/get-pip.py | python3 - pip==18.1 && \
3232
pip3 install --no-cache-dir --no-build-isolation cython && \
3333
pip3 install --no-cache-dir --no-build-isolation -r /root/src/gopkg.in/src-d/hercules.v6/requirements.txt https://github.com/mind/wheels/releases/download/tf1.7-cpu/tensorflow-1.7.0-cp36-cp36m-linux_x86_64.whl && \
3434
rm -rf /root/* && \

internal/plumbing/bsdiff.go

+302
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,302 @@
1+
package plumbing
2+
3+
// Adapted from https://github.com/kr/binarydist
4+
// Original license:
5+
//
6+
// Copyright 2012 Keith Rarick
7+
//
8+
// Permission is hereby granted, free of charge, to any person
9+
// obtaining a copy of this software and associated documentation
10+
// files (the "Software"), to deal in the Software without
11+
// restriction, including without limitation the rights to use,
12+
// copy, modify, merge, publish, distribute, sublicense, and/or sell
13+
// copies of the Software, and to permit persons to whom the
14+
// Software is furnished to do so, subject to the following
15+
// conditions:
16+
//
17+
// The above copyright notice and this permission notice shall be
18+
// included in all copies or substantial portions of the Software.
19+
//
20+
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21+
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22+
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23+
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24+
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25+
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26+
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27+
// OTHER DEALINGS IN THE SOFTWARE.
28+
29+
import (
30+
"bytes"
31+
)
32+
33+
// swap exchanges the elements of a stored at positions i and j, in place.
func swap(a []int, i, j int) {
	tmp := a[i]
	a[i] = a[j]
	a[j] = tmp
}
34+
35+
// split reorders the slots I[start:start+length] by the key V[I[p]+h] and
// records the resulting groups.
//
// Every run of slots that ends up with an equal key gets V[I[p]] set to the
// index of the last slot of that run; a run of exactly one slot is
// additionally marked by storing -1 in I at that slot.
//
// Ranges shorter than 16 slots are handled by repeated minimum selection;
// longer ranges are partitioned three ways around the key of the middle slot
// and the "less" and "greater" parts are processed recursively.
func split(I, V []int, start, length, h int) {
	end := start + length

	if length < 16 {
		for k := start; k < end; {
			// run counts the slots moved to the front of [k, end) that share
			// the smallest key seen so far.
			run := 1
			lowest := V[I[k]+h]
			for off := 1; k+off < end; off++ {
				cur := V[I[k+off]+h]
				if cur < lowest {
					// A new minimum restarts the run at slot k.
					lowest = cur
					run = 0
				}
				if cur == lowest {
					I[k+off], I[k+run] = I[k+run], I[k+off]
					run++
				}
			}
			last := k + run - 1
			for off := 0; off < run; off++ {
				V[I[k+off]] = last
			}
			if run == 1 {
				I[k] = -1
			}
			k += run
		}
		return
	}

	// Count how many keys are below / equal to the pivot so that the final
	// boundaries of the three regions are known before anything is moved.
	pivot := V[I[start+length/2]+h]
	var below, same int
	for p := start; p < end; p++ {
		switch key := V[I[p]+h]; {
		case key < pivot:
			below++
		case key == pivot:
			same++
		}
	}
	eqBegin := start + below // first slot of the "equal" region
	gtBegin := eqBegin + same // first slot of the "greater" region

	// eqDone / gtDone count the slots already placed at the front of the
	// "equal" and "greater" regions respectively.
	eqDone, gtDone := 0, 0
	for p := start; p < eqBegin; {
		switch key := V[I[p]+h]; {
		case key < pivot:
			p++
		case key == pivot:
			I[p], I[eqBegin+eqDone] = I[eqBegin+eqDone], I[p]
			eqDone++
		default:
			I[p], I[gtBegin+gtDone] = I[gtBegin+gtDone], I[p]
			gtDone++
		}
	}

	// The "less" region is final; sort out what remains between the other two.
	for eqBegin+eqDone < gtBegin {
		if V[I[eqBegin+eqDone]+h] == pivot {
			eqDone++
			continue
		}
		I[eqBegin+eqDone], I[gtBegin+gtDone] = I[gtBegin+gtDone], I[eqBegin+eqDone]
		gtDone++
	}

	if eqBegin > start {
		split(I, V, start, eqBegin-start, h)
	}

	for p := eqBegin; p < gtBegin; p++ {
		V[I[p]] = gtBegin - 1
	}
	if eqBegin == gtBegin-1 {
		I[eqBegin] = -1
	}

	if end > gtBegin {
		split(I, V, gtBegin, end-gtBegin, h)
	}
}
115+
116+
func qsufsort(obuf []byte) []int {
117+
var buckets [256]int
118+
var i, h int
119+
I := make([]int, len(obuf)+1)
120+
V := make([]int, len(obuf)+1)
121+
122+
for _, c := range obuf {
123+
buckets[c]++
124+
}
125+
for i = 1; i < 256; i++ {
126+
buckets[i] += buckets[i-1]
127+
}
128+
copy(buckets[1:], buckets[:])
129+
buckets[0] = 0
130+
131+
for i, c := range obuf {
132+
buckets[c]++
133+
I[buckets[c]] = i
134+
}
135+
136+
I[0] = len(obuf)
137+
for i, c := range obuf {
138+
V[i] = buckets[c]
139+
}
140+
141+
V[len(obuf)] = 0
142+
for i = 1; i < 256; i++ {
143+
if buckets[i] == buckets[i-1]+1 {
144+
I[buckets[i]] = -1
145+
}
146+
}
147+
I[0] = -1
148+
149+
for h = 1; I[0] != -(len(obuf) + 1); h += h {
150+
var n int
151+
for i = 0; i < len(obuf)+1; {
152+
if I[i] < 0 {
153+
n -= I[i]
154+
i -= I[i]
155+
} else {
156+
if n != 0 {
157+
I[i-n] = -n
158+
}
159+
n = V[I[i]] + 1 - i
160+
split(I, V, i, n, h)
161+
i += n
162+
n = 0
163+
}
164+
}
165+
if n != 0 {
166+
I[i-n] = -n
167+
}
168+
}
169+
170+
for i = 0; i < len(obuf)+1; i++ {
171+
I[V[i]] = i
172+
}
173+
return I
174+
}
175+
176+
// matchlen returns the length of the longest common prefix of a and b.
func matchlen(a, b []byte) (i int) {
	limit := len(a)
	if len(b) < limit {
		limit = len(b)
	}
	for ; i < limit; i++ {
		if a[i] != b[i] {
			break
		}
	}
	return i
}
182+
183+
func search(I []int, obuf, nbuf []byte, st, en int) (pos, n int) {
184+
if en-st < 2 {
185+
x := matchlen(obuf[I[st]:], nbuf)
186+
y := matchlen(obuf[I[en]:], nbuf)
187+
188+
if x > y {
189+
return I[st], x
190+
}
191+
return I[en], y
192+
}
193+
194+
x := st + (en-st)/2
195+
if bytes.Compare(obuf[I[x]:], nbuf) < 0 {
196+
return search(I, obuf, nbuf, x, en)
197+
}
198+
return search(I, obuf, nbuf, st, x)
199+
}
200+
201+
// DiffBytes calculates the approximated number of different bytes between two binary buffers.
202+
// We are not interested in the diff script itself. Instead, we track the sizes of `db` and `eb`
203+
// from the original implementation.
204+
func DiffBytes(obuf, nbuf []byte) int {
205+
if len(nbuf) < len(obuf) {
206+
obuf, nbuf = nbuf, obuf
207+
}
208+
var lenf int
209+
I := qsufsort(obuf)
210+
var dblen, eblen int
211+
212+
// Compute the differences, writing ctrl as we go
213+
var scan, pos, length int
214+
var lastscan, lastpos, lastoffset int
215+
for scan < len(nbuf) {
216+
var oldscore int
217+
scan += length
218+
for scsc := scan; scan < len(nbuf); scan++ {
219+
pos, length = search(I, obuf, nbuf[scan:], 0, len(obuf))
220+
221+
for ; scsc < scan+length; scsc++ {
222+
if scsc+lastoffset < len(obuf) &&
223+
obuf[scsc+lastoffset] == nbuf[scsc] {
224+
oldscore++
225+
}
226+
}
227+
228+
if (length == oldscore && length != 0) || length > oldscore+8 {
229+
break
230+
}
231+
232+
if scan+lastoffset < len(obuf) && obuf[scan+lastoffset] == nbuf[scan] {
233+
oldscore--
234+
}
235+
}
236+
237+
if length != oldscore || scan == len(nbuf) {
238+
var s, Sf int
239+
lenf = 0
240+
for i := 0; lastscan+i < scan && lastpos+i < len(obuf); {
241+
if obuf[lastpos+i] == nbuf[lastscan+i] {
242+
s++
243+
}
244+
i++
245+
if s*2-i > Sf*2-lenf {
246+
Sf = s
247+
lenf = i
248+
}
249+
}
250+
251+
lenb := 0
252+
if scan < len(nbuf) {
253+
var s, Sb int
254+
for i := 1; (scan >= lastscan+i) && (pos >= i); i++ {
255+
if obuf[pos-i] == nbuf[scan-i] {
256+
s++
257+
}
258+
if s*2-i > Sb*2-lenb {
259+
Sb = s
260+
lenb = i
261+
}
262+
}
263+
}
264+
265+
if lastscan+lenf > scan-lenb {
266+
overlap := (lastscan + lenf) - (scan - lenb)
267+
s := 0
268+
Ss := 0
269+
lens := 0
270+
for i := 0; i < overlap; i++ {
271+
if nbuf[lastscan+lenf-overlap+i] == obuf[lastpos+lenf-overlap+i] {
272+
s++
273+
}
274+
if nbuf[scan-lenb+i] == obuf[pos-lenb+i] {
275+
s--
276+
}
277+
if s > Ss {
278+
Ss = s
279+
lens = i + 1
280+
}
281+
}
282+
283+
lenf += lens - overlap
284+
lenb -= lens
285+
}
286+
287+
var nonzero int
288+
for i := 0; i < lenf; i++ {
289+
if nbuf[lastscan+i]-obuf[lastpos+i] != 0 {
290+
nonzero++
291+
}
292+
}
293+
294+
dblen += nonzero
295+
eblen += (scan - lenb) - (lastscan + lenf)
296+
lastscan = scan - lenb
297+
lastpos = pos - lenb
298+
lastoffset = pos - scan
299+
}
300+
}
301+
return dblen + eblen
302+
}

internal/plumbing/renames.go

+12
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,15 @@ func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (b
367367
panic(err)
368368
}
369369
}()
370+
_, err1 := blob1.CountLines()
371+
_, err2 := blob2.CountLines()
372+
if err1 == ErrorBinary || err2 == ErrorBinary {
373+
// binary mode
374+
bsdifflen := DiffBytes(blob1.Data, blob2.Data)
375+
delta := int((int64(bsdifflen) * 100) / internal.Max64(
376+
internal.Min64(blob1.Size, blob2.Size), 1))
377+
return 100-delta >= ra.SimilarityThreshold, nil
378+
}
370379
src, dst := string(blob1.Data), string(blob2.Data)
371380
maxSize := internal.Max(1, internal.Max(utf8.RuneCountInString(src), utf8.RuneCountInString(dst)))
372381

@@ -412,6 +421,9 @@ func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (b
412421
posDst += step
413422
}
414423
}
424+
if possibleDelInsBlock {
425+
continue
426+
}
415427
// supposing that the rest of the lines are the same (they are not - too optimistic),
416428
// estimate the maximum similarity and exit the loop if it lower than our threshold
417429
maxCommon := common + internal.Min(

0 commit comments

Comments
 (0)