-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbasehan.c
138 lines (114 loc) · 3.03 KB
/
basehan.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
/**
* Copyright (C) 2017 Yuzhong Wen <[email protected]>
*
* basehan: An algorithm that maps bytes to Chinese characters.
* What we use here is from U+4E00 ~ U+8E00, which maps to CJK character space.
*
* Distributed under terms of the MIT license.
*/
/*
* Algorithm in detail:
* 1. Take the input eight bytes at a time (a short final group is zero-padded)
* 2. Map each 14-bit slice of the group into the 4E00-8E00 space, similar to what base64 does
* 3. Encode the transformed values as 3-byte UTF-8 sequences.
*
* CJK Symbols and Punctuation: U+3000 ~ U+303F
* Those characters will be chosen randomly as paddings
*/
#include <limits.h>
#include <string.h>
#include <unistd.h>
/*
 * Write the 3-byte UTF-8 form of the 16-bit value `in` to out[0..2].
 *
 * The lead byte holds the top 4 bits behind the 1110xxxx marker and each of
 * the two continuation bytes holds 6 bits behind the 10xxxxxx marker.
 * No range check is performed on `in`.
 */
static inline void to_utf8_3(unsigned short in, unsigned char *out)
{
    const unsigned int cp = in;

    out[2] = (unsigned char) (0x80 | (cp & 0x3F));
    out[1] = (unsigned char) (0x80 | ((cp >> 6) & 0x3F));
    out[0] = (unsigned char) (0xE0 | ((cp >> 12) & 0x0F));
}
/*
 * Encode one 8-byte group at `in` as 5 hanzi (15 UTF-8 bytes) at `out`.
 *
 * in[0..6] are treated as a 56-bit little-endian integer that is cut into
 * four 14-bit slices, lowest slice first; each slice is offset by 0x4E00
 * and written as a 3-byte UTF-8 sequence.  in[7] gets a hanzi of its own
 * (0x4E00 + byte value) at out[12..14].
 *
 * The integer is assembled byte by byte instead of dereferencing `in`
 * through an `unsigned long long *`: the cast breaks the effective-type
 * (strict aliasing) rules, may trap on unaligned input, and makes the
 * encoded text depend on host byte order.  On little-endian hosts the
 * result is identical to the pointer-cast read.
 */
static inline void encode_process(const unsigned char *in, unsigned char *out)
{
    unsigned long long bits = 0;
    int i;

    /* Only the first seven bytes feed the 4 x 14 = 56 bit accumulator. */
    for (i = 0; i < 7; i++) {
        bits |= ((unsigned long long) in[i]) << (8 * i);
    }

    for (i = 0; i < 4; i++) {
        unsigned short cp =
            (unsigned short) (0x4E00 + ((bits >> (i * 14)) & 0x3FFF));

        to_utf8_3(cp, out + i * 3);
    }

    /* The last byte goes to one hanzi: 5 hanzi in total, 15 bytes out. */
    to_utf8_3((unsigned short) (0x4E00 + in[7]), out + 12);
}
/*
 * Rebuild a 16-bit value from the 3-byte UTF-8 sequence at in[0..2] and
 * return its offset from 0x4E00.
 *
 * The UTF-8 marker bits are masked off without being validated, and the
 * subtraction wraps around in `unsigned short` when the decoded value is
 * below 0x4E00.
 */
static inline unsigned short from_utf8_3(const unsigned char *in)
{
    const unsigned int lead = in[0] & 0x0Fu;
    const unsigned int mid = in[1] & 0x3Fu;
    const unsigned int tail = in[2] & 0x3Fu;
    const unsigned int cp = (lead << 12) | (mid << 6) | tail;

    return (unsigned short) (cp - 0x4E00u);
}
/*
 * Decode 5 hanzi (15 UTF-8 bytes) at `in` into one 8-byte group at `out`.
 *
 * The first four hanzi each contribute 14 bits, lowest slice first, to a
 * 56-bit little-endian integer that becomes out[0..6]; the low 8 bits of
 * the fifth hanzi become out[7].
 *
 * The value is accumulated in a local and stored byte by byte.  Writing
 * through an `unsigned long long *` cast of `out` breaks the effective-type
 * (strict aliasing) rules, may trap on unaligned buffers and ties the byte
 * layout to host endianness; OR-ing into `*out` additionally required the
 * destination to be zeroed beforehand.  All 8 output bytes are now fully
 * overwritten.  On little-endian hosts with a zeroed destination the result
 * is identical to the previous behaviour.
 */
static inline void decode_process(const unsigned char *in, unsigned char *out)
{
    unsigned long long bits = 0;
    unsigned short s;
    int i;

    /* First 4 hanzi: 14 payload bits each. */
    for (i = 0; i < 4; i++) {
        s = from_utf8_3(in + i * 3);
        bits |= ((unsigned long long) (s & 0x3FFF)) << (i * 14);
    }

    /* Last hanzi: a single payload byte, placed in the top 8 bits. */
    s = from_utf8_3(in + 12);
    bits |= ((unsigned long long) (s & 0xFF)) << 56;

    for (i = 0; i < 8; i++) {
        out[i] = (unsigned char) ((bits >> (8 * i)) & 0xFF);
    }
}
/**
 * From normal bytes to CJK space
 *
 * Encodes `in_len` bytes from `in` as hanzi in UTF-8.  The input is consumed
 * in groups of 8 bytes, each producing 15 output bytes; a short final group
 * is padded with zero bytes before being encoded, so the original length is
 * not recoverable from the output alone.
 *
 * @param in       input bytes
 * @param in_len   number of input bytes, must be >= 1
 * @param out      output buffer; all `out_len` bytes are zeroed before
 *                 encoding
 * @param out_len  capacity of `out`; must be at least
 *                 15 * ceil(in_len / 8)
 * @return number of bytes written to `out`, or -1 if an argument is
 *         invalid, the output buffer is too small, or the result length
 *         does not fit in an int
 */
int cjk_mapping_encode(const char *in, ssize_t in_len, char *out, ssize_t out_len)
{
    const unsigned char *src = (const unsigned char *) in;
    unsigned char *dst = (unsigned char *) out;
    unsigned char padding[8] = {0};
    ssize_t tail;
    ssize_t groups;
    ssize_t off;
    int r_len = 0;

    if (in == NULL || out == NULL || in_len < 1) {
        return -1;
    }

    /* Count the partial trailing group too: it also emits 15 bytes. */
    tail = in_len % 8;
    groups = in_len / 8 + (tail != 0 ? 1 : 0);

    /* The byte count is returned as int; refuse inputs that would overflow. */
    if (groups > INT_MAX / 15) {
        return -1;
    }
    if (out_len < groups * 15) {
        /* Not enough output buffer */
        return -1;
    }

    memset(out, 0, (size_t) out_len);

    for (off = 0; off < in_len - tail; off += 8) {
        encode_process(src + off, dst + r_len);
        r_len += 15;
    }

    /* Zero-pad the leftover bytes to a full group and encode that. */
    if (tail != 0) {
        memcpy(padding, src + off, (size_t) tail);
        encode_process(padding, dst + r_len);
        r_len += 15;
    }

    return r_len;
}
/**
 * From CJK space to normal bytes
 *
 * Decodes hanzi text back to bytes.  The input is consumed in groups of
 * 15 bytes (5 hanzi of 3 UTF-8 bytes each), each producing 8 output bytes.
 * The returned length is always a multiple of 8 and includes any zero
 * bytes that were used to pad the final group when encoding.  The UTF-8
 * sequences themselves are not validated.
 *
 * @param in       encoded input
 * @param in_len   number of input bytes; must be a positive multiple of 15
 * @param out      output buffer; all `out_len` bytes are zeroed before
 *                 decoding
 * @param out_len  capacity of `out`; must be at least 8 * (in_len / 15)
 * @return number of bytes written to `out`, or -1 if an argument is
 *         invalid, the input is not a whole number of groups, the output
 *         buffer is too small, or the result length does not fit in an int
 */
int cjk_mapping_decode(const char *in, ssize_t in_len, char *out, ssize_t out_len)
{
    const unsigned char *src = (const unsigned char *) in;
    unsigned char *dst = (unsigned char *) out;
    ssize_t groups;
    ssize_t off;
    int r_len = 0;

    if (in == NULL || out == NULL || in_len < 1) {
        return -1;
    }

    /*
     * A trailing partial group would make the loop read past the end of
     * `in` and write past the checked output size, so reject it up front.
     */
    if (in_len % 15 != 0) {
        return -1;
    }

    groups = in_len / 15;

    /* The byte count is returned as int; refuse inputs that would overflow. */
    if (groups > INT_MAX / 8) {
        return -1;
    }
    if (out_len < groups * 8) {
        /* Not enough output buffer */
        return -1;
    }

    /* Start from a cleared buffer; slack after the decoded bytes stays zero. */
    memset(out, 0, (size_t) out_len);

    for (off = 0; off < in_len; off += 15) {
        decode_process(src + off, dst + r_len);
        r_len += 8;
    }

    return r_len;
}