/* Number of elements initially reserved for a conversion output buffer. */
#define INITIAL_BUFF_SIZE 1024

/* Evaluates to bit `pos` (0 = least significant) of `byte`, as 0 or 1. */
#define GET_BIT(byte, pos) (((byte) >> (pos)) & 1)

/*
 * Evaluates to an int with the low `length` bits set, e.g. BITMASK(6) == 0x3F.
 * The argument is parenthesized so that expressions whose operators bind
 * more loosely than `<<` (such as `a | b` or `c ? x : y`) expand correctly.
 * `length` must be non-negative and smaller than the bit width of int.
 */
#define BITMASK(length) ((1 << (length)) - 1)
/*
 * Decodes UTF-8 text into a heap-allocated array of ucs4_t that is
 * terminated with a 0 element.
 *
 * utf8:   input bytes.
 * length: upper bound on how many bytes of utf8 are examined.
 *
 * NOTE(review): this listing does not show every statement of the function
 * (for example the declarations of i and pucs4, and whatever follows each
 * bounds check); the comments below describe only the visible statements.
 * NOTE(review): the results of malloc/realloc are used without a visible
 * NULL check, and realloc overwrites the only pointer to the old buffer.
 * NOTE(review): utf8 is plain char; where char is signed, bytes >= 0x80 are
 * negative when shifted by GET_BIT and `>>` — consider unsigned char.
 */
26 ucs4_t* utf8_to_ucs4(
const char* utf8,
size_t length) {
/* Advance i to the first NUL byte, or to length if none is found earlier. */
31 for (i = 0; i < length && utf8[i] !=
'\0'; i++) {}
/* Output buffer starts with room for INITIAL_BUFF_SIZE elements. */
33 size_t freesize = INITIAL_BUFF_SIZE;
34 ucs4_t* ucs4 = (ucs4_t*)malloc(
sizeof(ucs4_t) * freesize);
36 for (i = 0; i < length; i++) {
/* Bytes of the decoded value, byte[0] being the least significant. */
37 ucs4_t byte[4] = { 0 };
/* Bit 7 clear: single byte, the value is its low 7 bits. */
38 if (GET_BIT(utf8[i], 7) == 0) {
41 byte[0] = utf8[i] & BITMASK(7);
42 }
/*
 * Bit 5 clear: handled as a two-byte sequence. Bit 6 is not tested, and
 * the following byte is not checked for the 10xxxxxx form.
 */
else if (GET_BIT(utf8[i], 5) == 0) {
/* True when the second byte would lie at or beyond length. */
45 if (i + 1 >= length) {
/* Low 6 bits of byte 2 plus the low 2 bits of byte 1 form byte[0]. */
48 byte[0] = (utf8[i + 1] & BITMASK(6)) +
49 ((utf8[i] & BITMASK(2)) << 6);
/* Bits 2..4 of byte 1 form byte[1]. */
50 byte[1] = (utf8[i] >> 2) & BITMASK(3);
52 }
/* Bit 4 clear: handled as a three-byte sequence. */
else if (GET_BIT(utf8[i], 4) == 0) {
/* True when the third byte would lie at or beyond length. */
55 if (i + 2 >= length) {
58 byte[0] = (utf8[i + 2] & BITMASK(6)) +
59 ((utf8[i + 1] & BITMASK(2)) << 6);
60 byte[1] = ((utf8[i + 1] >> 2) & BITMASK(4))
61 + ((utf8[i] & BITMASK(4)) << 4);
63 }
/* Bit 3 clear: handled as a four-byte sequence. */
else if (GET_BIT(utf8[i], 3) == 0) {
/* True when the fourth byte would lie at or beyond length. */
66 if (i + 3 >= length) {
69 byte[0] = (utf8[i + 3] & BITMASK(6)) +
70 ((utf8[i + 2] & BITMASK(2)) << 6);
71 byte[1] = ((utf8[i + 2] >> 2) & BITMASK(4)) +
72 ((utf8[i + 1] & BITMASK(4)) << 4);
73 byte[2] = ((utf8[i + 1] >> 4) & BITMASK(2)) +
74 ((utf8[i] & BITMASK(3)) << 2);
76 }
/* Bit 2 clear: handled as a five-byte sequence. */
else if (GET_BIT(utf8[i], 2) == 0) {
/* True when the fifth byte would lie at or beyond length. */
79 if (i + 4 >= length) {
82 byte[0] = (utf8[i + 4] & BITMASK(6)) +
83 ((utf8[i + 3] & BITMASK(2)) << 6);
84 byte[1] = ((utf8[i + 3] >> 2) & BITMASK(4)) +
85 ((utf8[i + 2] & BITMASK(4)) << 4);
86 byte[2] = ((utf8[i + 2] >> 4) & BITMASK(2)) +
87 ((utf8[i + 1] & BITMASK(6)) << 2);
88 byte[3] = utf8[i] & BITMASK(2);
90 }
/* Bit 1 clear: handled as a six-byte sequence. */
else if (GET_BIT(utf8[i], 1) == 0) {
/* True when the sixth byte would lie at or beyond length. */
93 if (i + 5 >= length) {
96 byte[0] = (utf8[i + 5] & BITMASK(6)) +
97 ((utf8[i + 4] & BITMASK(2)) << 6);
98 byte[1] = ((utf8[i + 4] >> 2) & BITMASK(4)) +
99 ((utf8[i + 3] & BITMASK(4)) << 4);
100 byte[2] = ((utf8[i + 3] >> 4) & BITMASK(2)) +
101 ((utf8[i + 2] & BITMASK(6)) << 2);
102 byte[3] = (utf8[i + 1] & BITMASK(6)) +
103 ((utf8[i] & BITMASK(1)) << 6);
/*
 * Buffer growth: freesize becomes the number of elements written so far,
 * the allocation is doubled to twice that count, and pucs4 is re-pointed
 * at the same offset inside the (possibly moved) buffer. The condition
 * guarding this step is not visible in this listing.
 */
109 freesize = pucs4 - ucs4;
110 ucs4 = (ucs4_t*)realloc(ucs4,
sizeof(ucs4_t) * (freesize + freesize));
111 pucs4 = ucs4 + freesize;
/* Combine the four collected bytes into one value and store it. */
113 *pucs4 = (byte[3] << 24) + (byte[2] << 16) + (byte[1] << 8) + byte[0];
/*
 * length is reused as the element count including the terminator; the
 * buffer is resized to exactly that many elements and the last one set to 0.
 */
117 length = (pucs4 - ucs4 + 1);
118 ucs4 = (ucs4_t*)realloc(ucs4,
sizeof(ucs4_t) * length);
119 ucs4[length - 1] = 0;
/*
 * Encodes an array of ucs4_t values as a heap-allocated, NUL-terminated
 * char string, writing between one and six bytes per value depending on
 * its magnitude.
 *
 * ucs4:   input values.
 * length: upper bound on how many elements of ucs4 are examined.
 *
 * NOTE(review): this listing does not show every statement of the function
 * (for example the declarations of i, putf8, c and byte, and the condition
 * of the first branch); the comments below describe only what is visible.
 * NOTE(review): the results of malloc/realloc are used without a visible
 * NULL check, and realloc overwrites the only pointer to the old buffer.
 */
127 char* ucs4_to_utf8(
const ucs4_t* ucs4,
size_t length) {
/* Advance i to the first 0 element, or to length if none is found earlier. */
132 for (i = 0; i < length && ucs4[i] != 0; i++) {}
/* Output buffer starts with room for INITIAL_BUFF_SIZE bytes. */
134 size_t freesize = INITIAL_BUFF_SIZE;
135 char* utf8 = (
char*)malloc(
sizeof(
char) * freesize);
137 for (i = 0; i < length; i++) {
/*
 * Grow when freesize is 6 or less; 6 is the longest sequence written
 * below. freesize presumably counts the bytes still free (its decrement
 * is not visible here — confirm). On growth it becomes the number of
 * bytes written so far, the allocation is doubled to twice that count,
 * and putf8 is re-pointed at the same offset in the new buffer.
 */
138 if ((ssize_t)freesize - 6 <= 0) {
139 freesize = putf8 - utf8;
140 utf8 = (
char*)realloc(utf8,
sizeof(
char) * (freesize + freesize));
141 putf8 = utf8 + freesize;
/*
 * The four bytes of c, least significant first — presumably the
 * initializer of byte[], whose declaration line is not visible.
 */
145 (c >> 0) & BITMASK(8), (c >> 8) & BITMASK(8),
146 (c >> 16) & BITMASK(8), (c >> 24) & BITMASK(8)
/* Single-byte form: the low 7 bits of c. */
152 putf8[0] = byte[0] & BITMASK(7);
154 }
/* Two bytes: lead 0xC0 plus bits 6..10, then 0x80 plus bits 0..5. */
else if (c <= 0x7FF) {
157 putf8[1] = 0x80 + (byte[0] & BITMASK(6));
158 putf8[0] = 0xC0 + ((byte[0] >> 6) & BITMASK(2)) +
159 ((byte[1] & BITMASK(3)) << 2);
161 }
/* Three bytes: lead 0xE0 plus bits 12..15, then two 0x80 continuations. */
else if (c <= 0xFFFF) {
164 putf8[2] = 0x80 + (byte[0] & BITMASK(6));
165 putf8[1] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) +
166 ((byte[1] & BITMASK(4)) << 2);
167 putf8[0] = 0xE0 + ((byte[1] >> 4) & BITMASK(4));
169 }
/* Four bytes: lead 0xF0 plus bits 18..20, then three continuations. */
else if (c <= 0x1FFFFF) {
172 putf8[3] = 0x80 + (byte[0] & BITMASK(6));
173 putf8[2] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) +
174 ((byte[1] & BITMASK(4)) << 2);
175 putf8[1] = 0x80 + ((byte[1] >> 4) & BITMASK(4)) +
176 ((byte[2] & BITMASK(2)) << 4);
177 putf8[0] = 0xF0 + ((byte[2] >> 2) & BITMASK(3));
179 }
/* Five bytes: lead 0xF8 plus bits 24..25, then four continuations. */
else if (c <= 0x3FFFFFF) {
182 putf8[4] = 0x80 + (byte[0] & BITMASK(6));
183 putf8[3] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) +
184 ((byte[1] & BITMASK(4)) << 2);
185 putf8[2] = 0x80 + ((byte[1] >> 4) & BITMASK(4)) +
186 ((byte[2] & BITMASK(2)) << 4);
187 putf8[1] = 0x80 + ((byte[2] >> 2) & BITMASK(6));
188 putf8[0] = 0xF8 + (byte[3] & BITMASK(2));
190 }
/* Six bytes: lead 0xFC plus bit 30, then five continuations. */
else if (c <= 0x7FFFFFFF) {
193 putf8[5] = 0x80 + (byte[0] & BITMASK(6));
194 putf8[4] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) +
195 ((byte[1] & BITMASK(4)) << 2);
196 putf8[3] = 0x80 + ((byte[1] >> 4) & BITMASK(4)) +
197 ((byte[2] & BITMASK(2)) << 4);
198 putf8[2] = 0x80 + ((byte[2] >> 2) & BITMASK(6));
199 putf8[1] = 0x80 + (byte[3] & BITMASK(6));
200 putf8[0] = 0xFC + ((byte[3] >> 6) & BITMASK(1));
/*
 * length is reused as the byte count including the terminator; the buffer
 * is resized to exactly that many bytes and the last one set to NUL.
 */
209 length = (putf8 - utf8 + 1);
210 utf8 = (
char*)realloc(utf8,
sizeof(
char) * length);
211 utf8[length - 1] =
'\0';
/*
 * Takes a pointer to ucs4_t elements and returns a size_t.
 * NOTE(review): only the first statement of the body is visible in this
 * listing; by its name this presumably counts the elements before the 0
 * terminator — confirm against the full source.
 */
215 size_t ucs4len(
const ucs4_t* str) {
/* Read-only cursor that starts at the beginning of str. */
216 const register ucs4_t* pstr = str;
/*
 * Compares two sequences of ucs4_t element by element.
 * NOTE(review): the loop body and the return are not visible in this
 * listing; presumably the cursors are advanced and ret is returned — confirm.
 */
223 int ucs4cmp(
const ucs4_t* src,
const ucs4_t* dst) {
224 register int ret = 0;
/*
 * Keep going while the current elements are equal (difference is zero)
 * and *dst is not 0. The element difference is stored in an int.
 * NOTE(review): if ucs4_t is an unsigned 32-bit type, the sign of ret may
 * not reflect the ordering for widely separated values — confirm.
 */
225 while (!(ret = *src - *dst) && *dst) {
/*
 * Takes a writable destination and a read-only source of ucs4_t elements.
 * NOTE(review): the body is not visible in this listing; by its name and
 * signature this presumably copies the 0-terminated sequence src into
 * dest — confirm against the full source.
 */
231 void ucs4cpy(ucs4_t* dest,
const ucs4_t* src) {
238 void ucs4ncpy(ucs4_t* dest,
const ucs4_t* src,
size_t len) {
239 while (*src && len-- > 0) {