Open Chinese Convert  0.4.3
A project for conversion between Traditional and Simplified Chinese
 All Data Structures Files Functions Variables Groups Pages
encoding.c
1 /*
2  * Open Chinese Convert
3  *
4  * Copyright 2010-2013 BYVoid <byvoid@byvoid.com>
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 #include "encoding.h"
20 #include "opencc.h"
21 
/* Initial capacity, counted in elements, of the conversion output buffers. */
#define INITIAL_BUFF_SIZE 1024
/* Yields bit number `pos` (0 = least significant) of `byte` as 0 or 1. */
#define GET_BIT(byte, pos) (((byte) >> (pos)) & 1)
/*
 * Yields an int with the lowest `length` bits set. The argument is
 * parenthesized so that expressions built from operators binding looser than
 * `<<` (for example `a | b`) are evaluated before the shift.
 */
#define BITMASK(length) ((1 << (length)) - 1)
25 
26 ucs4_t* utf8_to_ucs4(const char* utf8, size_t length) {
27  if (length == 0) {
28  length = (size_t)-1;
29  }
30  size_t i;
31  for (i = 0; i < length && utf8[i] != '\0'; i++) {}
32  length = i;
33  size_t freesize = INITIAL_BUFF_SIZE;
34  ucs4_t* ucs4 = (ucs4_t*)malloc(sizeof(ucs4_t) * freesize);
35  ucs4_t* pucs4 = ucs4;
36  for (i = 0; i < length; i++) {
37  ucs4_t byte[4] = { 0 };
38  if (GET_BIT(utf8[i], 7) == 0) {
39  /* U-00000000 - U-0000007F */
40  /* 0xxxxxxx */
41  byte[0] = utf8[i] & BITMASK(7);
42  } else if (GET_BIT(utf8[i], 5) == 0) {
43  /* U-00000080 - U-000007FF */
44  /* 110xxxxx 10xxxxxx */
45  if (i + 1 >= length) {
46  goto err;
47  }
48  byte[0] = (utf8[i + 1] & BITMASK(6)) +
49  ((utf8[i] & BITMASK(2)) << 6);
50  byte[1] = (utf8[i] >> 2) & BITMASK(3);
51  i += 1;
52  } else if (GET_BIT(utf8[i], 4) == 0) {
53  /* U-00000800 - U-0000FFFF */
54  /* 1110xxxx 10xxxxxx 10xxxxxx */
55  if (i + 2 >= length) {
56  goto err;
57  }
58  byte[0] = (utf8[i + 2] & BITMASK(6)) +
59  ((utf8[i + 1] & BITMASK(2)) << 6);
60  byte[1] = ((utf8[i + 1] >> 2) & BITMASK(4))
61  + ((utf8[i] & BITMASK(4)) << 4);
62  i += 2;
63  } else if (GET_BIT(utf8[i], 3) == 0) {
64  /* U-00010000 - U-001FFFFF */
65  /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
66  if (i + 3 >= length) {
67  goto err;
68  }
69  byte[0] = (utf8[i + 3] & BITMASK(6)) +
70  ((utf8[i + 2] & BITMASK(2)) << 6);
71  byte[1] = ((utf8[i + 2] >> 2) & BITMASK(4)) +
72  ((utf8[i + 1] & BITMASK(4)) << 4);
73  byte[2] = ((utf8[i + 1] >> 4) & BITMASK(2)) +
74  ((utf8[i] & BITMASK(3)) << 2);
75  i += 3;
76  } else if (GET_BIT(utf8[i], 2) == 0) {
77  /* U-00200000 - U-03FFFFFF */
78  /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
79  if (i + 4 >= length) {
80  goto err;
81  }
82  byte[0] = (utf8[i + 4] & BITMASK(6)) +
83  ((utf8[i + 3] & BITMASK(2)) << 6);
84  byte[1] = ((utf8[i + 3] >> 2) & BITMASK(4)) +
85  ((utf8[i + 2] & BITMASK(4)) << 4);
86  byte[2] = ((utf8[i + 2] >> 4) & BITMASK(2)) +
87  ((utf8[i + 1] & BITMASK(6)) << 2);
88  byte[3] = utf8[i] & BITMASK(2);
89  i += 4;
90  } else if (GET_BIT(utf8[i], 1) == 0) {
91  /* U-04000000 - U-7FFFFFFF */
92  /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
93  if (i + 5 >= length) {
94  goto err;
95  }
96  byte[0] = (utf8[i + 5] & BITMASK(6)) +
97  ((utf8[i + 4] & BITMASK(2)) << 6);
98  byte[1] = ((utf8[i + 4] >> 2) & BITMASK(4)) +
99  ((utf8[i + 3] & BITMASK(4)) << 4);
100  byte[2] = ((utf8[i + 3] >> 4) & BITMASK(2)) +
101  ((utf8[i + 2] & BITMASK(6)) << 2);
102  byte[3] = (utf8[i + 1] & BITMASK(6)) +
103  ((utf8[i] & BITMASK(1)) << 6);
104  i += 5;
105  } else {
106  goto err;
107  }
108  if (freesize == 0) {
109  freesize = pucs4 - ucs4;
110  ucs4 = (ucs4_t*)realloc(ucs4, sizeof(ucs4_t) * (freesize + freesize));
111  pucs4 = ucs4 + freesize;
112  }
113  *pucs4 = (byte[3] << 24) + (byte[2] << 16) + (byte[1] << 8) + byte[0];
114  pucs4++;
115  freesize--;
116  }
117  length = (pucs4 - ucs4 + 1);
118  ucs4 = (ucs4_t*)realloc(ucs4, sizeof(ucs4_t) * length);
119  ucs4[length - 1] = 0;
120  return ucs4;
121 
122 err:
123  free(ucs4);
124  return (ucs4_t*)-1;
125 }
126 
127 char* ucs4_to_utf8(const ucs4_t* ucs4, size_t length) {
128  if (length == 0) {
129  length = (size_t)-1;
130  }
131  size_t i;
132  for (i = 0; i < length && ucs4[i] != 0; i++) {}
133  length = i;
134  size_t freesize = INITIAL_BUFF_SIZE;
135  char* utf8 = (char*)malloc(sizeof(char) * freesize);
136  char* putf8 = utf8;
137  for (i = 0; i < length; i++) {
138  if ((ssize_t)freesize - 6 <= 0) {
139  freesize = putf8 - utf8;
140  utf8 = (char*)realloc(utf8, sizeof(char) * (freesize + freesize));
141  putf8 = utf8 + freesize;
142  }
143  ucs4_t c = ucs4[i];
144  ucs4_t byte[4] = {
145  (c >> 0) & BITMASK(8), (c >> 8) & BITMASK(8),
146  (c >> 16) & BITMASK(8), (c >> 24) & BITMASK(8)
147  };
148  size_t delta = 0;
149  if (c <= 0x7F) {
150  /* U-00000000 - U-0000007F */
151  /* 0xxxxxxx */
152  putf8[0] = byte[0] & BITMASK(7);
153  delta = 1;
154  } else if (c <= 0x7FF) {
155  /* U-00000080 - U-000007FF */
156  /* 110xxxxx 10xxxxxx */
157  putf8[1] = 0x80 + (byte[0] & BITMASK(6));
158  putf8[0] = 0xC0 + ((byte[0] >> 6) & BITMASK(2)) +
159  ((byte[1] & BITMASK(3)) << 2);
160  delta = 2;
161  } else if (c <= 0xFFFF) {
162  /* U-00000800 - U-0000FFFF */
163  /* 1110xxxx 10xxxxxx 10xxxxxx */
164  putf8[2] = 0x80 + (byte[0] & BITMASK(6));
165  putf8[1] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) +
166  ((byte[1] & BITMASK(4)) << 2);
167  putf8[0] = 0xE0 + ((byte[1] >> 4) & BITMASK(4));
168  delta = 3;
169  } else if (c <= 0x1FFFFF) {
170  /* U-00010000 - U-001FFFFF */
171  /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
172  putf8[3] = 0x80 + (byte[0] & BITMASK(6));
173  putf8[2] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) +
174  ((byte[1] & BITMASK(4)) << 2);
175  putf8[1] = 0x80 + ((byte[1] >> 4) & BITMASK(4)) +
176  ((byte[2] & BITMASK(2)) << 4);
177  putf8[0] = 0xF0 + ((byte[2] >> 2) & BITMASK(3));
178  delta = 4;
179  } else if (c <= 0x3FFFFFF) {
180  /* U-00200000 - U-03FFFFFF */
181  /* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
182  putf8[4] = 0x80 + (byte[0] & BITMASK(6));
183  putf8[3] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) +
184  ((byte[1] & BITMASK(4)) << 2);
185  putf8[2] = 0x80 + ((byte[1] >> 4) & BITMASK(4)) +
186  ((byte[2] & BITMASK(2)) << 4);
187  putf8[1] = 0x80 + ((byte[2] >> 2) & BITMASK(6));
188  putf8[0] = 0xF8 + (byte[3] & BITMASK(2));
189  delta = 5;
190  } else if (c <= 0x7FFFFFFF) {
191  /* U-04000000 - U-7FFFFFFF */
192  /* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
193  putf8[5] = 0x80 + (byte[0] & BITMASK(6));
194  putf8[4] = 0x80 + ((byte[0] >> 6) & BITMASK(2)) +
195  ((byte[1] & BITMASK(4)) << 2);
196  putf8[3] = 0x80 + ((byte[1] >> 4) & BITMASK(4)) +
197  ((byte[2] & BITMASK(2)) << 4);
198  putf8[2] = 0x80 + ((byte[2] >> 2) & BITMASK(6));
199  putf8[1] = 0x80 + (byte[3] & BITMASK(6));
200  putf8[0] = 0xFC + ((byte[3] >> 6) & BITMASK(1));
201  delta = 6;
202  } else {
203  free(utf8);
204  return (char*)-1;
205  }
206  putf8 += delta;
207  freesize -= delta;
208  }
209  length = (putf8 - utf8 + 1);
210  utf8 = (char*)realloc(utf8, sizeof(char) * length);
211  utf8[length - 1] = '\0';
212  return utf8;
213 }
214 
215 size_t ucs4len(const ucs4_t* str) {
216  const register ucs4_t* pstr = str;
217  while (*pstr) {
218  ++pstr;
219  }
220  return pstr - str;
221 }
222 
223 int ucs4cmp(const ucs4_t* src, const ucs4_t* dst) {
224  register int ret = 0;
225  while (!(ret = *src - *dst) && *dst) {
226  ++src, ++dst;
227  }
228  return ret;
229 }
230 
231 void ucs4cpy(ucs4_t* dest, const ucs4_t* src) {
232  while (*src) {
233  *dest++ = *src++;
234  }
235  *dest = 0;
236 }
237 
238 void ucs4ncpy(ucs4_t* dest, const ucs4_t* src, size_t len) {
239  while (*src && len-- > 0) {
240  *dest++ = *src++;
241  }
242 }