-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathimohash.go
140 lines (117 loc) · 3.45 KB
/
imohash.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
// Package imohash implements a fast, constant-time hash for files. It is built atop
// murmurhash3 and uses file size and sample data to construct the hash.
//
// For more information, including important caveats on usage, consult https://github.com/kalafut/imohash.
package imohash
import (
"bytes"
"encoding/binary"
"io"
"os"
"github.com/twmb/murmur3"
)
// Size is the length in bytes of an imohash result.
const Size = 16
// SampleThreshold is the default sample threshold, in bytes.
// Files smaller than this will be hashed in their entirety.
const SampleThreshold = 128 * 1024
// SampleSize is the default size, in bytes, of each sample read from the input.
const SampleSize = 16 * 1024
// emptyArray is the zero-valued result returned alongside an error.
var emptyArray = [Size]byte{}
// ImoHash bundles a hasher with the sampling parameters used to compute
// imohash values. Create one with New or NewCustom.
type ImoHash struct {
// hasher is reset at the start of every hashing call and then reused.
hasher murmur3.Hash128
// sampleSize is the number of bytes read for each sample; a value
// below 1 disables sampling so the whole input is hashed.
sampleSize int
// sampleThreshold is the input size, in bytes, below which the whole
// input is hashed instead of being sampled.
sampleThreshold int
}
// New returns a new ImoHash configured with the package defaults,
// SampleSize and SampleThreshold.
func New() ImoHash {
	imo := NewCustom(SampleSize, SampleThreshold)
	return imo
}
// NewCustom returns a new ImoHash that uses the given sample size and
// sample threshold, both in bytes. Passing sampleSize < 1 disables sampling,
// so the entire input is always hashed.
func NewCustom(sampleSize, sampleThreshold int) ImoHash {
	return ImoHash{
		hasher:          murmur3.New128(),
		sampleSize:      sampleSize,
		sampleThreshold: sampleThreshold,
	}
}
// SumFile hashes the file at filename using the default sample parameters.
// It returns the error from opening or reading the file, if any.
func SumFile(filename string) ([Size]byte, error) {
	// New returns a value; keep it in a variable so the pointer-receiver
	// method can be called on it.
	defaults := New()
	return defaults.SumFile(filename)
}
// Sum hashes the byte slice data using the default sample parameters.
func Sum(data []byte) [Size]byte {
	// New returns a value; keep it in a variable so the pointer-receiver
	// method can be called on it.
	defaults := New()
	return defaults.Sum(data)
}
// SumSectionReader hashes the contents of sr using the default sample
// parameters.
func SumSectionReader(sr *io.SectionReader) ([Size]byte, error) {
	// Delegate through the method, like the other package-level wrappers.
	defaults := New()
	return defaults.SumSectionReader(sr)
}
// Sum hashes the byte slice data using the ImoHash parameters.
// It panics if the underlying hashing step reports an error.
func (imo *ImoHash) Sum(data []byte) [Size]byte {
	// Wrap the slice so it can go through the same SectionReader-based
	// core as files do.
	size := int64(len(data))
	section := io.NewSectionReader(bytes.NewReader(data), 0, size)

	digest, err := imo.hashCore(section)
	if err != nil {
		panic(err)
	}
	return digest
}
// SumFile hashes the file at filename using the ImoHash parameters.
// If the file cannot be opened or stat'ed, it returns the zero array and
// the error.
func (imo *ImoHash) SumFile(filename string) ([Size]byte, error) {
	file, openErr := os.Open(filename)
	if openErr != nil {
		return emptyArray, openErr
	}
	defer file.Close()

	// The file size bounds the section handed to the hashing core.
	info, statErr := file.Stat()
	if statErr != nil {
		return emptyArray, statErr
	}

	return imo.hashCore(io.NewSectionReader(file, 0, info.Size()))
}
// SumSectionReader hashes the contents of the SectionReader f using the
// ImoHash parameters.
func (imo *ImoHash) SumSectionReader(f *io.SectionReader) ([Size]byte, error) {
	digest, err := imo.hashCore(f)
	return digest, err
}
// hashCore hashes the contents of the SectionReader f using the ImoHash
// parameters.
//
// The whole section is hashed when sampling is disabled (sampleSize < 1) or
// when the section is shorter than sampleThreshold or than four samples.
// Otherwise three samples of sampleSize bytes are hashed: one at the start,
// one beginning at the midpoint and one ending at the end of the section.
// In both cases the leading bytes of the digest are then overwritten with
// the uvarint encoding of the section size.
//
// f is rewound before hashing, so the result does not depend on how much of
// f the caller has already read; f's offset is left wherever the last read
// finished. On a seek or read error the zero array and the error are
// returned.
func (imo *ImoHash) hashCore(f *io.SectionReader) ([Size]byte, error) {
	var result [Size]byte

	imo.hasher.Reset()

	// Rewind so the hash always covers the section from its beginning, even
	// if the caller has already consumed part of f.
	if _, err := f.Seek(0, io.SeekStart); err != nil {
		return emptyArray, err
	}

	msgLen := f.Size()
	sampleLen := int64(imo.sampleSize) // widen first so 4*sampleLen cannot overflow int

	if imo.sampleSize < 1 ||
		msgLen < int64(imo.sampleThreshold) ||
		msgLen < 4*sampleLen {
		// Small input or sampling disabled: hash everything.
		if _, err := io.Copy(imo.hasher, f); err != nil {
			return emptyArray, err
		}
	} else {
		buffer := make([]byte, imo.sampleSize)

		// Start offsets of the first, middle and last samples.
		offsets := [...]int64{0, msgLen / 2, msgLen - sampleLen}
		for _, off := range offsets {
			if _, err := f.Seek(off, io.SeekStart); err != nil {
				return emptyArray, err
			}
			if _, err := io.ReadFull(f, buffer); err != nil {
				return emptyArray, err
			}
			imo.hasher.Write(buffer) // hash.Hash Writes never return an error
		}
	}

	hash := imo.hasher.Sum(nil)

	// Stamp the size into the front of the digest so inputs of different
	// lengths yield different results.
	binary.PutUvarint(hash, uint64(msgLen))
	copy(result[:], hash)

	return result, nil
}