|
1 | 1 | package com.liveramp.hyperminhash;
|
2 | 2 |
|
3 | 3 | import java.nio.ByteBuffer;
|
4 |
| - |
5 |
| -import util.hash.MetroHash128; |
| 4 | +import java.util.Arrays; |
6 | 5 |
|
7 | 6 | /**
|
8 | 7 | * Implementation of HyperMinHash described in Yu and Weber: https://arxiv.org/pdf/1710.08436.pdf.
|
9 |
| - * This class implements LogLog-Beta described in Qin, Kim, et al. here: https://arxiv.org/pdf/1612.02284.pdf. |
10 |
| - * Loglog-Beta is almost identical in accuracy to HyperLogLog and HyperLogLog++ except it performs better on cardinality |
11 |
| - * estimations for small datasets (n <= 200_000). It's also much simpler to implement. |
| 8 | + * This class implements LogLog-Beta described in Qin, Kim, et al. here: |
| 9 | + * https://arxiv.org/pdf/1612.02284.pdf. Loglog-Beta is almost identical in accuracy to HyperLogLog |
| 10 | + * and HyperLogLog++ except it performs better on cardinality estimations for small datasets (n <= |
| 11 | + * 200_000). It's also much simpler to implement. |
| 12 | + * <p> |
| 13 | + * The log log implementation uses the values of p and beta coefficients tested in the Loglog-beta |
| 14 | + * paper. It's possible to use different values of P but we'd need to recompute the beta |
| 15 | + * coefficients which is a computationally intensive process. So for now, this impl doesn't support |
| 16 | + * using different values of P. This being said the current value of P works with high accuracy for |
| 17 | + * very large cardinalities and small jaccard indices. See the paper for more details. |
12 | 18 | * <p>
|
13 |
| - * The log log implementation uses the values of p and beta coefficients tested in the Loglog-beta paper. It's possible |
14 |
| - * to use different values of P but we'd need to recompute the beta coefficients which is a computationally intensive |
15 |
| - * process. So for now, this impl doesn't support using different values of P. This being said the current value of P |
16 |
| - * works with high accuracy for very large cardinalities and small jaccard indices. See the paper for more details. |
17 | 19 | * <p>
|
| 20 | + * Similarly, we use values of Q and R suggested in the HyperMinHash paper. Those are theoretically |
| 21 | + * changeable, but the current values should provide sufficient accuracy for set cardinalities up to |
| 22 | + * 2^89 (see Hyperminhash paper for reference). |
18 | 23 | * <p>
|
19 |
| - * Similarly, we use values of Q and R suggested in the HyperMinHash paper. Those are theoretically changeable, but the |
20 |
| - * current values should provide sufficient accuracy for set cardinalities up to 2^89 (see Hyperminhash paper for |
21 |
| - * reference). |
| 24 | + * If you want to be able to combine multiple BetaMinHash instances, or compute their intersection, |
| 25 | + * you can use {@link BetaMinHashCombiner}. |
22 | 26 | * <p>
|
23 | 27 | * If you'd like this class to support custom Q or R or P values, please open a github issue.
|
24 | 28 | * <p>
|
25 | 29 | */
|
26 |
| -public class BetaMinHash { |
27 |
| - // HLL Precision parameter |
28 |
| - public final static int P = 14; |
29 |
| - public final static int NUM_REGISTERS = (int)Math.pow(2, P); |
| 30 | +public class BetaMinHash implements IntersectionSketch<BetaMinHash> { |
30 | 31 |
|
| 32 | + // HLL Precision parameter |
| 33 | + public static final int P = 14; |
| 34 | + public static final int NUM_REGISTERS = (int) Math.pow(2, P); |
31 | 35 |
|
32 | 36 | // TODO add actual validation if necessary
|
33 | 37 | // Q + R must always be <= 16 since we're packing values into 16 bit registers
|
34 |
| - public final static int Q = 6; |
35 |
| - public final static int R = 10; |
| 38 | + public static final int Q = 6; |
| 39 | + public static final int R = 10; |
| 40 | + |
| 41 | + private static final int HASH_SEED = 1337; |
| 42 | + static final byte VERSION = 1; |
36 | 43 |
|
37 | 44 | final short[] registers;
|
38 | 45 |
|
39 | 46 | public BetaMinHash() {
|
40 | 47 | registers = new short[NUM_REGISTERS];
|
41 | 48 | }
|
42 | 49 |
|
43 |
| - BetaMinHash(short[] registers) { |
44 |
| - this(); |
45 |
| - System.arraycopy(registers, 0, this.registers, 0, registers.length); |
46 |
| - } |
47 |
| - |
48 |
| - public BetaMinHash(BetaMinHash other) { |
49 |
| - this(other.registers); |
| 50 | + private BetaMinHash(short[] registers) { |
| 51 | + this.registers = registers; |
50 | 52 | }
|
51 | 53 |
|
52 |
| - public void add(byte[] val) { |
53 |
| - MetroHash128 hash = new MetroHash128(1337).apply(ByteBuffer.wrap(val)); |
54 |
| - ByteBuffer buf = ByteBuffer.allocate(16); |
55 |
| - hash.writeBigEndian(buf); |
56 |
| - addHash(buf); |
57 |
| - } |
58 | 54 |
|
59 |
| - /** |
60 |
| - * @param _128BitHash |
61 |
| - */ |
62 |
| - private void addHash(ByteBuffer _128BitHash) { |
63 |
| - if (_128BitHash.array().length != 16) { |
64 |
| - throw new IllegalArgumentException("input hash should be 16 bytes"); |
| 55 | + static BetaMinHash deepCopyFromRegisters(short[] registers) { |
| 56 | + if (registers.length != NUM_REGISTERS) { |
| 57 | + throw new IllegalArgumentException(String.format( |
| 58 | + "Expected exactly %d registers, but there are %d", |
| 59 | + NUM_REGISTERS, |
| 60 | + registers.length)); |
65 | 61 | }
|
66 | 62 |
|
67 |
| - long hashLeftHalf = _128BitHash.getLong(0); |
68 |
| - long hashRightHalf = _128BitHash.getLong(8); |
69 |
| - |
70 |
| - int registerIndex = getLeftmostPBits(hashLeftHalf); |
71 |
| - short rBits = getRightmostRBits(hashLeftHalf); |
| 63 | + final short[] registersCopy = new short[NUM_REGISTERS]; |
| 64 | + System.arraycopy(registers, 0, registers, 0, NUM_REGISTERS); |
72 | 65 |
|
73 |
| - byte leftmostOneBitPosition = getLeftmostOneBitPosition(hashRightHalf); |
74 |
| - |
75 |
| - short packedRegister = packIntoRegister(leftmostOneBitPosition, rBits); |
76 |
| - if (registers[registerIndex] < packedRegister) { |
77 |
| - registers[registerIndex] = packedRegister; |
78 |
| - } |
| 66 | + return wrapRegisters(registersCopy); |
79 | 67 | }
|
80 | 68 |
|
81 |
| - private int getLeftmostPBits(long hash) { |
82 |
| - return (int)(hash >>> (Long.SIZE - P)); |
| 69 | + static BetaMinHash wrapRegisters(short[] registers) { |
| 70 | + return new BetaMinHash(registers); |
83 | 71 | }
|
84 | 72 |
|
85 |
| - /** |
86 |
| - * Finds the position of the leftmost one-bit in the first (2^Q)-1 bits. |
87 |
| - * |
88 |
| - * @param hash |
89 |
| - * @return |
90 |
| - */ |
91 |
| - private byte getLeftmostOneBitPosition(long hash) { |
92 |
| - // To find the position of the leftmost 1-bit in the first (2^Q)-1 bits |
93 |
| - // We zero out all bits to the right of the first (2^Q)-1 bits then add a |
94 |
| - // 1-bit in the 2^Qth position of the bits to search. This way if the bits we're |
95 |
| - // searching are all 0, we take the position of the leftmost 1-bit to be 2^Q |
96 |
| - int _2q = (1 << Q) - 1; |
97 |
| - int shiftAmount = (Long.SIZE - _2q); |
98 |
| - |
99 |
| - // zero all bits to the right of the first (2^Q)-1 bits |
100 |
| - long _2qSearchBits = ((hash >>> shiftAmount) << shiftAmount); |
101 |
| - |
102 |
| - // add a 1-bit in the 2^Qth position |
103 |
| - _2qSearchBits += (1 << (shiftAmount - 1)); |
104 |
| - |
105 |
| - return (byte)(Long.numberOfLeadingZeros(_2qSearchBits) + 1); |
| 73 | + @Override |
| 74 | + public long cardinality() { |
| 75 | + return BetaMinHashCardinalityGetter.cardinality(this); |
106 | 76 | }
|
107 | 77 |
|
108 |
| - private short getRightmostRBits(long hash) { |
109 |
| - return (short)(hash << (Long.SIZE - R) >>> Long.SIZE - R); |
| 78 | + @Override |
| 79 | + public boolean offer(byte[] val) { |
| 80 | + long[] _128BitHash = Murmur3.hash128(val); |
| 81 | + ByteBuffer buf = ByteBuffer.allocate(16); |
| 82 | + buf.putLong(_128BitHash[0]); |
| 83 | + buf.putLong(_128BitHash[1]); |
| 84 | + return addHash(buf); |
110 | 85 | }
|
111 | 86 |
|
112 |
| - /** |
113 |
| - * Creates a new tuple/register value for the LL-Beta by bit-packing the number |
114 |
| - * of leading zeros with the rightmost R bits. |
115 |
| - * |
116 |
| - * @param leftmostOnebitPosition |
117 |
| - * @param rightmostRBits |
118 |
| - * @return |
119 |
| - */ |
120 |
| - private short packIntoRegister(byte leftmostOnebitPosition, short rightmostRBits) { |
121 |
| - // Q is at most 6, which means that with R<=10, we should be able to store these two |
122 |
| - // numbers in the same register |
123 |
| - return (short)((leftmostOnebitPosition << R) | rightmostRBits); |
| 87 | + @Override |
| 88 | + public boolean equals(Object o) { |
| 89 | + if (this == o) { |
| 90 | + return true; |
| 91 | + } |
| 92 | + if (!(o instanceof BetaMinHash)) { |
| 93 | + return false; |
| 94 | + } |
| 95 | + BetaMinHash that = (BetaMinHash) o; |
| 96 | + return Arrays.equals(registers, that.registers); |
124 | 97 | }
|
125 | 98 |
|
126 |
| - public long cardinality() { |
127 |
| - return BetaMinHashCardinalityGetter.cardinality(this); |
| 99 | + @Override |
| 100 | + public int hashCode() { |
| 101 | + return Arrays.hashCode(registers); |
128 | 102 | }
|
129 | 103 |
|
130 |
| - /** |
131 |
| - * @return Merged sketch representing the input sketches |
132 |
| - */ |
133 |
| - public static BetaMinHash merge(BetaMinHash... sketches) { |
134 |
| - return BetaMinHashMergeGetter.merge(sketches); |
| 104 | + @Override |
| 105 | + public BetaMinHash deepCopy() { |
| 106 | + return deepCopyFromRegisters(this.registers); |
135 | 107 | }
|
136 | 108 |
|
137 | 109 | /**
|
138 |
| - * @return Union cardinality estimation |
| 110 | + * @param _128BitHash |
139 | 111 | */
|
140 |
| - public static long union(BetaMinHash... sketches) { |
141 |
| - return merge(sketches).cardinality(); |
142 |
| - } |
| 112 | + private boolean addHash(ByteBuffer _128BitHash) { |
| 113 | + if (_128BitHash.array().length != 16) { |
| 114 | + throw new IllegalArgumentException("input hash should be 16 bytes"); |
| 115 | + } |
143 | 116 |
|
144 |
| - /** |
145 |
| - * @return Intersection cardinality estimation |
146 |
| - */ |
147 |
| - public static long intersection(BetaMinHash... sketches) { |
148 |
| - return BetaMinHashIntersectionGetter.getIntersection(sketches); |
| 117 | + long hashLeftHalf = _128BitHash.getLong(0); |
| 118 | + int registerIndex = (int) BitHelper.getLeftmostBits(hashLeftHalf, P); |
| 119 | + short leftmostOneBitPosition = BitHelper.getLeftmostOneBitPosition(_128BitHash.array(), P, Q); |
| 120 | + /* We take the rightmost bits as what's called h_hat3 in the paper. Note that his differs from |
| 121 | + * the diagram in the paper which draws a parallel to a mantissa in a floating point |
| 122 | + * representation, but still satisfies the criterion of serving as an independent hash function |
| 123 | + * by selecting a set of independent bits from a larger hash. This is slightly simpler to |
| 124 | + * implement. */ |
| 125 | + short rBits = (short) BitHelper.getRightmostBits(_128BitHash.array(), R); |
| 126 | + |
| 127 | + short packedRegister = packIntoRegister(leftmostOneBitPosition, rBits); |
| 128 | + if (registers[registerIndex] < packedRegister) { |
| 129 | + registers[registerIndex] = packedRegister; |
| 130 | + return true; |
| 131 | + } |
| 132 | + |
| 133 | + return false; |
149 | 134 | }
|
150 | 135 |
|
151 | 136 | /**
|
152 |
| - * @return Jaccard index estimation |
| 137 | + * Creates a new tuple/register value for the LL-Beta by bit-packing the number of leading zeros |
| 138 | + * with the rightmost R bits. |
153 | 139 | */
|
154 |
| - public static double similarity(BetaMinHash... sketches) { |
155 |
| - return BetaMinHashSimilarityGetter.similarity(sketches); |
| 140 | + private short packIntoRegister(short leftmostOnebitPosition, short rightmostRBits) { |
| 141 | + // Q is at most 6, which means that with R<=10, we should be able to store these two |
| 142 | + // numbers in the same register |
| 143 | + final int exponent = leftmostOnebitPosition << R; |
| 144 | + final int packedRegister = (exponent | rightmostRBits); |
| 145 | + return (short) packedRegister; |
156 | 146 | }
|
157 | 147 | }
|
0 commit comments