Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Branch free parse digits #1432

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -255,7 +255,7 @@ public enum MySQLCollation {
public static final List<String> SUPPORTED_CHARSET_NAMES = Arrays.stream(values()).map(MySQLCollation::mysqlCharsetName).distinct().collect(Collectors.toList());

private static final Map<String, String> charsetToDefaultCollationMapping = new HashMap<>();
private static final IntObjectMap<Charset> idToJavaCharsetMapping = new IntObjectHashMap<>();
private static final Charset[] idToJavaCharsetMapping = new Charset[256];

static {
charsetToDefaultCollationMapping.put("big5", "big5_chinese_ci");
@@ -303,11 +303,17 @@ public enum MySQLCollation {
for (MySQLCollation collation : MySQLCollation.values()) {
try {
Charset charset = Charset.forName(collation.mappedJavaCharsetName);
idToJavaCharsetMapping.put(collation.collationId, charset);
idToJavaCharsetMapping[collation.collationId] = charset;
} catch (Exception e) {
LOGGER.warn(String.format("Java charset: [%s] is not supported by this platform, data with collation[%s] will be decoded in UTF-8 instead.", collation.mysqlCharsetName, collation.name()));
}
}
// set the remaining missing ones to the default charset
for (int i = 0; i < idToJavaCharsetMapping.length; i++) {
if (idToJavaCharsetMapping[i] == null) {
idToJavaCharsetMapping[i] = StandardCharsets.UTF_8;
}
}
}

public static final MySQLCollation DEFAULT_COLLATION = utf8mb4_general_ci;
@@ -337,12 +343,10 @@ public static MySQLCollation valueOfName(String collationName) throws IllegalArg
* @return the charset
*/
public static Charset getJavaCharsetByCollationId(int collationId) {
Charset charset = idToJavaCharsetMapping.get(collationId);
if (charset == null) {
if (collationId >= idToJavaCharsetMapping.length) {
return StandardCharsets.UTF_8;
} else {
return charset;
}
return idToJavaCharsetMapping[collationId];
}

public static String getDefaultCollationFromCharsetName(String charset) {
Original file line number Diff line number Diff line change
@@ -12,35 +12,185 @@
package io.vertx.sqlclient.impl.codec;

import io.netty.buffer.ByteBuf;
import io.netty.buffer.Unpooled;

/**
* This is based on this algorithm: https://lemire.me/blog/2022/01/21/swar-explained-parsing-eight-digits/
* Which can be explained as follows:
* <pre>
*
* Given 8 ASCII digits as: b1b2b3b4b5b6b7b8
*
* eg: "12345678"
*
* which byte[] := { 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38 }
* "1" "2" "3" "4" "5" "6" "7" "8"
*
* reading is in Little Endian form will read the lower indexes first placing them as lower addresses
* as can be seen in the hex representation, where "1" is now in the rightmost position.
*
* hex:
* 0x38_37_36_35_34_33_32_31-
* 0x30_30_30_30_30_30_30_30=
* 0x80_07_06_05_04_03_02_01
*
* digits = (digits * 10) + (digits >> 8);
*
* 0x50_46_3c_32_28_1e_14_0a + ~ digits * 10 +
* 0x00_80_70_60_50_40_30_21 = ~ digits >> 8 =
* 0x50_4e_43_38_2d_22_17_0c
* now ^ ^ ^ ^
* | | | |
* g4=10*b7+b8 | | |
* g3=10*b5+b6 | |
* g2=10*b3+b4 |
* g1=10*b1+b2
*
* These are the relevant results we care about, while the others are useless
* and subsequent masks will take care to ignore them.
*
* Now the aggregation parts:
*
* digits & U64_MASK := 0x00_00_00_38_00_00_00_0c
*
* This will isolate 10*b3+b4 and 10*b7+b8, trying to correctly compute:
*
* 1000000*(10*b1+b2) + 100*(10*b5+b6), somehow.
*
* The mask used to perform the multiplication is
*
* U64_FIRST_THIRD := (1000000L << 32) + 100 := 0x00_0f_42_40_00_00_00_64
*
* 0x00_00_00_38_00_00_00_0c *
* 0x00_0f_42_40_00_00_00_64 =
* 0x00_b7_30_e0_00_00_04_b0
*
*
* which 0x00_b7_30_e0 part (let's ignore the second half 00_00_04_b0, which is 1200)
*
* is, in decimal:
*
* 12005600 (!!!) === 1000000*(10*1 + 2) + 100*(10*5 + 6) = 1*10000000 + 2*1000000 + 5*1000 + 6*100
*
* For the second part
*
* ie 10000*(10*b3 + b4) + 10*b7 + b8
*
* we first isolate the 2 paris (g2 and g4) with (digits >> 16) & U64_MASK (which move them to the right by 2 bytes), getting
*
* 0x00_00_4e_00_00_00_22 *
* 0x00_27_10_00_00_00_01 =
* 0x05_30_6e_00_00_00_22
*
* which, once again, has it leftmost part
*
* 0x05_30_6e === 340078 (!!!!) === 10000*(10*3 + 4) + 10*7 + 8 = 3*100000 + 4*10000 + 7*10 + 8
*
*
* shifting both left by 32 and adding them, the total digit is done.
* </pre>
*/
public class CommonCodec {
/**
* Decode the specified {@code buff} formatted as a decimal string starting at the readable index
* with the specified {@code length} to a long.
*
* @param index the hex string index
* @param len the hex string length
* @param buff the byte buff to read from
* @return the decoded value as a long
*/

// https://lemire.me/blog/2022/01/21/swar-explained-parsing-eight-digits/
private static final long U64_MASK = 0x000000FF000000FFL;
private static final long U64_FIRST_THIRD = (1000000L << 32) + 100;
private static final long U64_SECOND_FOURTH = (10000L << 32) + 1;

private static long parseEigthDigitsLE(long digits) {
digits -= 0x3030303030303030L;
digits = (digits * 10) + (digits >> 8);
return (((digits & U64_MASK) * U64_FIRST_THIRD) + (((digits >> 16) & U64_MASK) * U64_SECOND_FOURTH)) >> 32;
}

private static final int U32_MASK = 0x00FF00FF;
private static final int U32_FIRST_SECOND = (100 << 16) + 1;

private static int parseFourDigitsLE(int digits) {
digits -= 0x30303030;
digits = (digits * 10) + (digits >> 8);
return ((digits & U32_MASK) * U32_FIRST_SECOND) >> 16;
}

private static short parseTwoDigitsLE(short digits) {
digits -= 0x3030;
return (short) ((digits & 0xFF) * 10 + ((digits >> 8) & 0xFF));
}

private static byte parseOneDigit(byte digit) {
return (byte) (digit - 0x30);
}

public static void main(String[] args) {
ByteBuf buff = Unpooled.buffer();
buff.writeCharSequence("-123", java.nio.charset.StandardCharsets.UTF_8);
System.out.println(decodeDecStringToLong(0, buff.readableBytes(), buff));
}

public static int decodeDecStringToInt(int index, int len, ByteBuf buff) {
// 10 + sign = 32bit
return 0;
}

public static int decodeDecStringToShort(int index, int len, ByteBuf buff) {
// 5 + sign = 16bit
return 0;
}

public static long decodeDecStringToLong(int index, int len, ByteBuf buff) {
long value = 0;
if (len > 0) {
int to = index + len;
boolean neg = false;
if (buff.getByte(index) == '-') {
neg = true;
index++;
}
while (index < to) {
byte ch = buff.getByte(index++);
byte nibble = (byte) (ch - '0');
value = value * 10 + nibble;
byte firstByte = buff.getByte(index);
final boolean negative = firstByte == '-';
// handling these fast-path to avoid using get<something>LE which is not free
if (len <= 2) {
if (len == 1) {
if (negative) {
throw new IllegalArgumentException("Invalid negative number: missing digits");
}
return parseOneDigit(firstByte);
}
if (neg) {
value = -value;
assert len == 2;
if (negative) {
return -parseOneDigit(buff.getByte(index + 1));
}
return parseOneDigit(firstByte) * 10 + parseOneDigit(buff.getByte(index + 1));
}
if (negative) {
index++;
len--;
}
long lessThanEight = len % 8;
if (lessThanEight > 0) {
return lessThanEightDigitsUnrolled(negative, index, len, buff);
}
throw new UnsupportedOperationException("Not implemented yet");
}

private static int lessThanEightDigitsUnrolled(boolean negative, int index, int len, ByteBuf buff) {
assert len > 0 && len < 8;
int digits = 0;
int multiplier = 1;
// len >= 4
if ((len & Integer.BYTES) != 0) {
digits = parseFourDigitsLE(buff.getIntLE(index));
index += Integer.BYTES;
multiplier = 100;
}
// len >= 2
if ((len & Short.BYTES) != 0) {
digits = digits * multiplier + parseTwoDigitsLE(buff.getShortLE(index));
index += Short.BYTES;
multiplier = 10;
}
// len >= 1
if ((len & Byte.BYTES) != 0) {
digits = digits * multiplier + parseOneDigit(buff.getByte(index));
}
return value;
return negative ? -digits : digits;
}

public static byte decodeDecStringToByte(int index, int len, ByteBuf buff) {
// 3 + sign = 8bit
return 0;
}

}