Open Chinese Convert  0.4.3
A project for conversion between Traditional and Simplified Chinese
 All Data Structures Files Functions Variables Groups Pages
converter.c
1 /*
2  * Open Chinese Convert
3  *
4  * Copyright 2010-2013 BYVoid <byvoid@byvoid.com>
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 #include "common.h"
20 #include "converter.h"
21 #include "dict_group.h"
22 #include "dict_chain.h"
23 #include "encoding.h"
24 
/* Code point written between alternative candidates and after each segment
 * in segment-only output. */
#define DELIMITER ' '

/* Identifiers of the two segmentation strategies implemented in this file:
 * forward maximum matching and shortest-path segmentation. */
#define SEGMENT_MAXIMUM_LENGTH 0
#define SEGMENT_SHORTEST_PATH 1

/* Strategy selected at compile time. */
#define SEGMENT_METHOD SEGMENT_SHORTEST_PATH
29 
#if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH
/* Capacity requested for the scratch arrays when a converter is opened. */
# define OPENCC_SP_SEG_DEFAULT_BUFFER_SIZE 1024

/* Scratch arrays reused across calls by the shortest-path segmenter. */
typedef struct {
  /* 1 once the arrays below have been allocated, 0 before that. */
  int initialized;
  /* Capacity passed to the most recent allocation. */
  size_t buffer_size;
  /* Match lengths reported by the dictionary for the current position. */
  size_t *match_length;
  /* min_len[i]: fewest segments needed to cover the first i code points. */
  size_t *min_len;
  /* parent[i]: start offset of the segment that ends at offset i. */
  size_t *parent;
  /* End offsets of the chosen segments, in left-to-right order. */
  size_t *path;
} SpsegData;
#endif
41 
42 static converter_error errnum = CONVERTER_ERROR_VOID;
43 
44 #if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH
45 static void sp_seg_buffer_free(SpsegData* ossb) {
46  free(ossb->match_length);
47  free(ossb->min_len);
48  free(ossb->parent);
49  free(ossb->path);
50 }
51 
52 static void sp_seg_set_buffer_size(SpsegData* ossb, size_t buffer_size) {
53  if (ossb->initialized == 1) {
54  sp_seg_buffer_free(ossb);
55  }
56  ossb->buffer_size = buffer_size;
57  ossb->match_length = (size_t*)malloc((buffer_size + 1) * sizeof(size_t));
58  ossb->min_len = (size_t*)malloc(buffer_size * sizeof(size_t));
59  ossb->parent = (size_t*)malloc(buffer_size * sizeof(size_t));
60  ossb->path = (size_t*)malloc(buffer_size * sizeof(size_t));
61  ossb->initialized = 1;
62 }
63 
64 static size_t sp_seg(Converter* converter,
65  ucs4_t** inbuf,
66  size_t* inbuf_left,
67  ucs4_t** outbuf,
68  size_t* outbuf_left,
69  size_t length) {
70  /* 最短路徑分詞 */
71  /* 對長度爲1時特殊優化 */
72  if (length == 1) {
73  const ucs4_t* const* match_rs = dict_group_match_longest(
74  converter->current_dict_group,
75  *inbuf,
76  1,
77  NULL);
78  size_t match_len = 1;
79  if (converter->conversion_mode == OPENCC_CONVERSION_FAST) {
80  if (match_rs == NULL) {
81  **outbuf = **inbuf;
82  (*outbuf)++, (*outbuf_left)--;
83  (*inbuf)++, (*inbuf_left)--;
84  } else {
85  const ucs4_t* result = match_rs[0];
86  /* 輸出緩衝區剩餘空間小於分詞長度 */
87  if (ucs4len(result) > *outbuf_left) {
88  errnum = CONVERTER_ERROR_OUTBUF;
89  return (size_t)-1;
90  }
91  for (; *result; result++) {
92  **outbuf = *result;
93  (*outbuf)++, (*outbuf_left)--;
94  }
95  *inbuf += match_len;
96  *inbuf_left -= match_len;
97  }
98  } else if (converter->conversion_mode ==
99  OPENCC_CONVERSION_LIST_CANDIDATES) {
100  if (match_rs == NULL) {
101  **outbuf = **inbuf;
102  (*outbuf)++, (*outbuf_left)--;
103  (*inbuf)++, (*inbuf_left)--;
104  } else {
105  size_t i;
106  for (i = 0; match_rs[i] != NULL; i++) {
107  const ucs4_t* result = match_rs[i];
108  int show_delimiter = match_rs[i + 1] != NULL ? 1 : 0;
109  /* 輸出緩衝區剩餘空間小於分詞長度 */
110  if (ucs4len(result) + show_delimiter > *outbuf_left) {
111  errnum = CONVERTER_ERROR_OUTBUF;
112  return (size_t)-1;
113  }
114  for (; *result; result++) {
115  **outbuf = *result;
116  (*outbuf)++, (*outbuf_left)--;
117  }
118  if (show_delimiter) {
119  **outbuf = DELIMITER;
120  (*outbuf)++, (*outbuf_left)--;
121  }
122  }
123  *inbuf += match_len;
124  *inbuf_left -= match_len;
125  }
126  } else if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) {
127  if (match_rs == NULL) {
128  **outbuf = **inbuf;
129  (*outbuf)++, (*outbuf_left)--;
130  (*inbuf)++, (*inbuf_left)--;
131  } else {
132  /* 輸出緩衝區剩餘空間小於分詞長度 */
133  if (match_len + 1 > *outbuf_left) {
134  errnum = CONVERTER_ERROR_OUTBUF;
135  return (size_t)-1;
136  }
137  size_t i;
138  for (i = 0; i < match_len; i++) {
139  **outbuf = **inbuf;
140  (*outbuf)++, (*outbuf_left)--;
141  (*inbuf)++, (*inbuf_left)--;
142  }
143  }
144  **outbuf = DELIMITER;
145  (*outbuf)++, (*outbuf_left)--;
146  } else {
147  debug_should_not_be_here();
148  }
149  /* 必須保證有一個字符空間 */
150  return match_len;
151  }
152 
153  /* 設置緩衝區空間 */
154  SpsegData* ossb = converter->data;
155  size_t buffer_size_need = length + 1;
156  if ((ossb->initialized == 0) || (ossb->buffer_size < buffer_size_need)) {
157  sp_seg_set_buffer_size(ossb, buffer_size_need);
158  }
159  size_t i, j;
160  for (i = 0; i <= length; i++) {
161  ossb->min_len[i] = INFINITY_INT;
162  }
163  ossb->min_len[0] = ossb->parent[0] = 0;
164  for (i = 0; i < length; i++) {
165  /* 獲取所有匹配長度 */
166  size_t match_count = dict_group_get_all_match_lengths(
167  converter->current_dict_group,
168  (*inbuf) + i,
169  ossb->match_length
170  );
171  if (ossb->match_length[0] != 1) {
172  ossb->match_length[match_count++] = 1;
173  }
174  /* 動態規劃求最短分割路徑 */
175  for (j = 0; j < match_count; j++) {
176  size_t k = ossb->match_length[j];
177  ossb->match_length[j] = 0;
178  if ((k > 1) && (ossb->min_len[i] + 1 <= ossb->min_len[i + k])) {
179  ossb->min_len[i + k] = ossb->min_len[i] + 1;
180  ossb->parent[i + k] = i;
181  } else if ((k == 1) &&
182  (ossb->min_len[i] + 1 < ossb->min_len[i + k])) {
183  ossb->min_len[i + k] = ossb->min_len[i] + 1;
184  ossb->parent[i + k] = i;
185  }
186  }
187  }
188  /* 取得最短分割路徑 */
189  for (i = length, j = ossb->min_len[length]; i != 0; i = ossb->parent[i]) {
190  ossb->path[--j] = i;
191  }
192  size_t inbuf_left_start = *inbuf_left;
193  size_t begin, end;
194  /* 根據最短分割路徑轉換 */
195  for (i = begin = 0; i < ossb->min_len[length]; i++) {
196  end = ossb->path[i];
197  size_t match_len;
198  const ucs4_t* const* match_rs = dict_group_match_longest(
199  converter->current_dict_group,
200  *inbuf,
201  end - begin,
202  &match_len
203  );
204  if (match_rs == NULL) {
205  **outbuf = **inbuf;
206  (*outbuf)++, (*outbuf_left)--;
207  (*inbuf)++, (*inbuf_left)--;
208  } else {
209  if (converter->conversion_mode == OPENCC_CONVERSION_FAST) {
210  if (match_rs == NULL) {
211  **outbuf = **inbuf;
212  (*outbuf)++, (*outbuf_left)--;
213  (*inbuf)++, (*inbuf_left)--;
214  } else {
215  const ucs4_t* result = match_rs[0];
216  /* 輸出緩衝區剩餘空間小於分詞長度 */
217  if (ucs4len(result) > *outbuf_left) {
218  if (inbuf_left_start - *inbuf_left > 0) {
219  break;
220  }
221  errnum = CONVERTER_ERROR_OUTBUF;
222  return (size_t)-1;
223  }
224  for (; *result; result++) {
225  **outbuf = *result;
226  (*outbuf)++, (*outbuf_left)--;
227  }
228  *inbuf += match_len;
229  *inbuf_left -= match_len;
230  }
231  } else if (converter->conversion_mode ==
232  OPENCC_CONVERSION_LIST_CANDIDATES) {
233  if (match_rs == NULL) {
234  **outbuf = **inbuf;
235  (*outbuf)++, (*outbuf_left)--;
236  (*inbuf)++, (*inbuf_left)--;
237  } else {
238  size_t i;
239  for (i = 0; match_rs[i] != NULL; i++) {
240  const ucs4_t* result = match_rs[i];
241  int show_delimiter = match_rs[i + 1] != NULL ? 1 : 0;
242  /* 輸出緩衝區剩餘空間小於分詞長度 */
243  if (ucs4len(result) + show_delimiter > *outbuf_left) {
244  if (inbuf_left_start - *inbuf_left > 0) {
245  break;
246  }
247  errnum = CONVERTER_ERROR_OUTBUF;
248  return (size_t)-1;
249  }
250  for (; *result; result++) {
251  **outbuf = *result;
252  (*outbuf)++, (*outbuf_left)--;
253  }
254  if (show_delimiter) {
255  **outbuf = DELIMITER;
256  (*outbuf)++, (*outbuf_left)--;
257  }
258  }
259  *inbuf += match_len;
260  *inbuf_left -= match_len;
261  }
262  } else if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) {
263  if (match_rs == NULL) {
264  **outbuf = **inbuf;
265  (*outbuf)++, (*outbuf_left)--;
266  (*inbuf)++, (*inbuf_left)--;
267  } else {
268  /* 輸出緩衝區剩餘空間小於分詞長度 */
269  if (match_len + 1 > *outbuf_left) {
270  if (inbuf_left_start - *inbuf_left > 0) {
271  break;
272  }
273  errnum = CONVERTER_ERROR_OUTBUF;
274  return (size_t)-1;
275  }
276  size_t i;
277  for (i = 0; i < match_len; i++) {
278  **outbuf = **inbuf;
279  (*outbuf)++, (*outbuf_left)--;
280  (*inbuf)++, (*inbuf_left)--;
281  }
282  }
283  **outbuf = DELIMITER;
284  (*outbuf)++, (*outbuf_left)--;
285  } else {
286  debug_should_not_be_here();
287  }
288  }
289  begin = end;
290  }
291  return inbuf_left_start - *inbuf_left;
292 }
293 
294 static size_t segment(Converter* converter,
295  ucs4_t** inbuf,
296  size_t* inbuf_left,
297  ucs4_t** outbuf,
298  size_t* outbuf_left) {
299  /* 歧義分割最短路徑分詞 */
300  size_t i, start, bound;
301  const ucs4_t* inbuf_start = *inbuf;
302  size_t inbuf_left_start = *inbuf_left;
303  size_t sp_seg_length;
304  bound = 0;
305  for (i = start = 0; inbuf_start[i] && *inbuf_left > 0 && *outbuf_left > 0;
306  i++) {
307  if ((i != 0) && (i == bound)) {
308  /* 對歧義部分進行最短路徑分詞 */
309  sp_seg_length = sp_seg(converter,
310  inbuf,
311  inbuf_left,
312  outbuf,
313  outbuf_left,
314  bound - start);
315 
316  if (sp_seg_length == (size_t)-1) {
317  return (size_t)-1;
318  }
319  if (sp_seg_length == 0) {
320  if (inbuf_left_start - *inbuf_left > 0) {
321  return inbuf_left_start - *inbuf_left;
322  }
323  /* 空間不足 */
324  errnum = CONVERTER_ERROR_OUTBUF;
325  return (size_t)-1;
326  }
327  start = i;
328  }
329  size_t match_len;
330  dict_group_match_longest(
331  converter->current_dict_group,
332  inbuf_start + i,
333  0,
334  &match_len
335  );
336  if (match_len == 0) {
337  match_len = 1;
338  }
339  if (i + match_len > bound) {
340  bound = i + match_len;
341  }
342  }
343  if ((*inbuf_left > 0) && (*outbuf_left > 0)) {
344  sp_seg_length = sp_seg(converter,
345  inbuf,
346  inbuf_left,
347  outbuf,
348  outbuf_left,
349  bound - start);
350  if (sp_seg_length == (size_t)-1) {
351  return (size_t)-1;
352  }
353  if (sp_seg_length == 0) {
354  if (inbuf_left_start - *inbuf_left > 0) {
355  return inbuf_left_start - *inbuf_left;
356  }
357  /* 空間不足 */
358  errnum = CONVERTER_ERROR_OUTBUF;
359  return (size_t)-1;
360  }
361  }
362  if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) {
363  (*outbuf)--;
364  (*outbuf_left)++;
365  }
366  return inbuf_left_start - *inbuf_left;
367 }
368 
369 #endif /* if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH */
370 
371 #if SEGMENT_METHOD == SEGMENT_MAXIMUM_LENGTH
372 static size_t segment(Converter* converter,
373  ucs4_t** inbuf,
374  size_t* inbuf_left,
375  ucs4_t** outbuf,
376  size_t* outbuf_left) {
377  /* 正向最大分詞 */
378  size_t inbuf_left_start = *inbuf_left;
379  for (; **inbuf && *inbuf_left > 0 && *outbuf_left > 0;) {
380  size_t match_len;
381  const ucs4_t* const* match_rs = dict_group_match_longest(
382  converter->current_dict_group,
383  *inbuf,
384  *inbuf_left,
385  &match_len
386  );
387  if (converter->conversion_mode == OPENCC_CONVERSION_FAST) {
388  if (match_rs == NULL) {
389  **outbuf = **inbuf;
390  (*outbuf)++, (*outbuf_left)--;
391  (*inbuf)++, (*inbuf_left)--;
392  } else {
393  const ucs4_t* result = match_rs[0];
394  /* 輸出緩衝區剩餘空間小於分詞長度 */
395  if (ucs4len(result) > *outbuf_left) {
396  if (inbuf_left_start - *inbuf_left > 0) {
397  break;
398  }
399  errnum = CONVERTER_ERROR_OUTBUF;
400  return (size_t)-1;
401  }
402  for (; *result; result++) {
403  **outbuf = *result;
404  (*outbuf)++, (*outbuf_left)--;
405  }
406  *inbuf += match_len;
407  *inbuf_left -= match_len;
408  }
409  } else if (converter->conversion_mode ==
410  OPENCC_CONVERSION_LIST_CANDIDATES) {
411  if (match_rs == NULL) {
412  **outbuf = **inbuf;
413  (*outbuf)++, (*outbuf_left)--;
414  (*inbuf)++, (*inbuf_left)--;
415  } else {
416  size_t i;
417  for (i = 0; match_rs[i] != NULL; i++) {
418  const ucs4_t* result = match_rs[i];
419  int show_delimiter = match_rs[i + 1] != NULL ? 1 : 0;
420  /* 輸出緩衝區剩餘空間小於分詞長度 */
421  if (ucs4len(result) + show_delimiter > *outbuf_left) {
422  if (inbuf_left_start - *inbuf_left > 0) {
423  break;
424  }
425  errnum = CONVERTER_ERROR_OUTBUF;
426  return (size_t)-1;
427  }
428  for (; *result; result++) {
429  **outbuf = *result;
430  (*outbuf)++, (*outbuf_left)--;
431  }
432  if (show_delimiter) {
433  **outbuf = DELIMITER;
434  (*outbuf)++, (*outbuf_left)--;
435  }
436  }
437  *inbuf += match_len;
438  *inbuf_left -= match_len;
439  }
440  } else if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) {
441  if (match_rs == NULL) {
442  **outbuf = **inbuf;
443  (*outbuf)++, (*outbuf_left)--;
444  (*inbuf)++, (*inbuf_left)--;
445  } else {
446  /* 輸出緩衝區剩餘空間小於分詞長度 */
447  if (match_len + 1 > *outbuf_left) {
448  if (inbuf_left_start - *inbuf_left > 0) {
449  break;
450  }
451  errnum = CONVERTER_ERROR_OUTBUF;
452  return (size_t)-1;
453  }
454  size_t i;
455  for (i = 0; i < match_len; i++) {
456  **outbuf = **inbuf;
457  (*outbuf)++, (*outbuf_left)--;
458  (*inbuf)++, (*inbuf_left)--;
459  }
460  }
461  **outbuf = DELIMITER;
462  (*outbuf)++, (*outbuf_left)--;
463  } else {
464  debug_should_not_be_here();
465  }
466  }
467  if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) {
468  (*outbuf)--;
469  (*outbuf_left)++;
470  }
471  return inbuf_left_start - *inbuf_left;
472 }
473 
474 #endif /* if SEGMENT_METHOD == SEGMENT_MAXIMUM_LENGTH */
475 
476 size_t converter_convert(Converter* converter,
477  ucs4_t** inbuf,
478  size_t* inbuf_left,
479  ucs4_t** outbuf,
480  size_t* outbuf_left) {
481  if (converter->dict_chain == NULL) {
482  errnum = CONVERTER_ERROR_NODICT;
483  return (size_t)-1;
484  }
485  if (converter->dict_chain->count == 1) {
486  /* 只有一個辭典,直接輸出 */
487  return segment(converter,
488  inbuf,
489  inbuf_left,
490  outbuf,
491  outbuf_left);
492  }
493  // 啓用辭典轉換鏈
494  size_t inbuf_size = *inbuf_left;
495  size_t outbuf_size = *outbuf_left;
496  size_t retval = (size_t)-1;
497  size_t cinbuf_left, coutbuf_left;
498  size_t coutbuf_delta = 0;
499  size_t i, cur;
500  ucs4_t* tmpbuf = (ucs4_t*)malloc(sizeof(ucs4_t) * outbuf_size);
501  ucs4_t* orig_outbuf = *outbuf;
502  ucs4_t* cinbuf, * coutbuf;
503  cinbuf_left = inbuf_size;
504  coutbuf_left = outbuf_size;
505  cinbuf = *inbuf;
506  coutbuf = tmpbuf;
507  for (i = cur = 0; i < converter->dict_chain->count; ++i, cur = 1 - cur) {
508  if (i > 0) {
509  cinbuf_left = coutbuf_delta;
510  coutbuf_left = outbuf_size;
511 
512  if (cur == 1) {
513  cinbuf = tmpbuf;
514  coutbuf = orig_outbuf;
515  } else {
516  cinbuf = orig_outbuf;
517  coutbuf = tmpbuf;
518  }
519  }
520  converter->current_dict_group = dict_chain_get_group(
521  converter->dict_chain,
522  i);
523  size_t ret = segment(converter,
524  &cinbuf,
525  &cinbuf_left,
526  &coutbuf,
527  &coutbuf_left);
528  if (ret == (size_t)-1) {
529  free(tmpbuf);
530  return (size_t)-1;
531  }
532  coutbuf_delta = outbuf_size - coutbuf_left;
533  if (i == 0) {
534  retval = ret;
535  *inbuf = cinbuf;
536  *inbuf_left = cinbuf_left;
537  }
538  }
539  if (cur == 1) {
540  // 結果在緩衝區
541  memcpy(*outbuf, tmpbuf, coutbuf_delta * sizeof(ucs4_t));
542  }
543  *outbuf += coutbuf_delta;
544  *outbuf_left = coutbuf_left;
545  free(tmpbuf);
546  return retval;
547 }
548 
549 void converter_assign_dictionary(Converter* converter, DictChain* dict_chain) {
550  converter->dict_chain = dict_chain;
551  if (converter->dict_chain->count > 0) {
552  converter->current_dict_group = dict_chain_get_group(
553  converter->dict_chain,
554  0);
555  }
556 }
557 
558 Converter* converter_open(void) {
559  Converter* converter = (Converter*)malloc(sizeof(Converter));
560  converter->dict_chain = NULL;
561  converter->current_dict_group = NULL;
562 #if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH
563  converter->data = (SpsegData*)malloc(sizeof(SpsegData));
564  SpsegData* spseg_buffer = converter->data;
565  spseg_buffer->initialized = 0;
566  spseg_buffer->match_length = NULL;
567  spseg_buffer->min_len = NULL;
568  spseg_buffer->parent = NULL;
569  spseg_buffer->path = NULL;
570  sp_seg_set_buffer_size(spseg_buffer, OPENCC_SP_SEG_DEFAULT_BUFFER_SIZE);
571 #endif /* if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH */
572  return converter;
573 }
574 
575 void converter_close(Converter* converter) {
576 #if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH
577  sp_seg_buffer_free(converter->data);
578  free((SpsegData *)converter->data);
579 #endif /* if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH */
580  free(converter);
581 }
582 
583 void converter_set_conversion_mode(Converter* converter,
584  opencc_conversion_mode conversion_mode) {
585  converter->conversion_mode = conversion_mode;
586 }
587 
588 converter_error converter_errno(void) {
589  return errnum;
590 }
591 
592 void converter_perror(const char* spec) {
593  perr(spec);
594  perr("\n");
595  switch (errnum) {
596  case CONVERTER_ERROR_VOID:
597  break;
598  case CONVERTER_ERROR_NODICT:
599  perr(_("No dictionary loaded"));
600  break;
601  case CONVERTER_ERROR_OUTBUF:
602  perr(_("Output buffer not enough for one segment"));
603  break;
604  default:
605  perr(_("Unknown"));
606  }
607 }