20 #include "converter.h"
21 #include "dict_group.h"
22 #include "dict_chain.h"
/* Identifiers for the two segmentation strategies implemented in this file. */
26 #define SEGMENT_MAXIMUM_LENGTH 0
27 #define SEGMENT_SHORTEST_PATH 1
/* Compile-time choice of strategy: the #if tests on SEGMENT_METHOD further
 * down decide which implementation is built. */
28 #define SEGMENT_METHOD SEGMENT_SHORTEST_PATH
30 #if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH
/* Capacity passed to sp_seg_set_buffer_size() when the shortest-path work
 * buffer is set up for the first time. */
31 # define OPENCC_SP_SEG_DEFAULT_BUFFER_SIZE 1024
/* Last error recorded by this module. Starts as CONVERTER_ERROR_VOID and is
 * overwritten with CONVERTER_ERROR_NODICT or CONVERTER_ERROR_OUTBUF by the
 * conversion routines below. It has file scope, so there is a single value
 * for the whole translation unit rather than one per Converter. */
42 static converter_error errnum = CONVERTER_ERROR_VOID;
44 #if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH
/*
 * Releases heap memory owned by an SpsegData work buffer.
 *
 * Only the release of match_length is visible in this excerpt.
 * NOTE(review): presumably min_len, parent and path are released as well
 * and `initialized` is cleared - confirm against the complete source.
 */
45 static void sp_seg_buffer_free(
SpsegData* ossb) {
46 free(ossb->match_length);
/*
 * (Re)allocates the four work arrays of an SpsegData for a given capacity.
 *
 * ossb        - work buffer to resize; its `initialized` flag must already
 *               hold a meaningful value because it is read first.
 * buffer_size - new capacity in elements. match_length receives
 *               buffer_size + 1 slots, the other arrays buffer_size slots.
 *
 * Previous contents are not preserved: an initialised buffer is freed and
 * fresh arrays are allocated.
 * NOTE(review): none of the malloc() results is checked in the visible
 * lines, and buffer_size * sizeof(size_t) is not guarded against overflow.
 */
52 static void sp_seg_set_buffer_size(
SpsegData* ossb,
size_t buffer_size) {
/* Drop the old arrays before replacing them. */
53 if (ossb->initialized == 1) {
54 sp_seg_buffer_free(ossb);
56 ossb->buffer_size = buffer_size;
/* One extra slot compared with the other arrays. */
57 ossb->match_length = (
size_t*)malloc((buffer_size + 1) *
sizeof(size_t));
58 ossb->min_len = (
size_t*)malloc(buffer_size *
sizeof(
size_t));
59 ossb->parent = (
size_t*)malloc(buffer_size *
sizeof(
size_t));
60 ossb->path = (
size_t*)malloc(buffer_size *
sizeof(
size_t));
/* From here on a resize or a close will free the arrays. */
61 ossb->initialized = 1;
/*
 * Shortest-path segmentation of one run of input (SEGMENT_SHORTEST_PATH
 * build only).
 *
 * This excerpt shows only part of the function: the parameters after
 * `converter`, several statements, and most braces are not visible. The
 * comments describe the visible lines only.
 *
 * Visible structure:
 *   1. A single dict_group_match_longest() lookup whose result is emitted
 *      according to converter->conversion_mode (the condition that selects
 *      this part is not visible).
 *   2. A dynamic-programming pass over `length` positions that stores, in
 *      ossb->min_len[] and ossb->parent[], the smallest number of segments
 *      needed to reach each position and the position it was reached from.
 *   3. A walk back through ossb->parent[] from `length` to 0.
 *   4. An emission loop that runs ossb->min_len[length] times, looking up
 *      each segment and writing it out according to the conversion mode.
 *
 * The final visible return yields the number of input units consumed
 * (inbuf_left_start - *inbuf_left). errnum is set to CONVERTER_ERROR_OUTBUF
 * when a result does not fit into *outbuf_left.
 * NOTE(review): callers compare the result with (size_t)-1 and with 0; the
 * statements returning those values are not visible here.
 */
64 static size_t sp_seg(
Converter* converter,
/* Longest dictionary match; the arguments after the dictionary group are not
 * visible. match_rs is NULL when nothing matches, otherwise it is walked as
 * a NULL-terminated list of candidate results. */
73 const ucs4_t*
const* match_rs = dict_group_match_longest(
74 converter->current_dict_group,
/* FAST mode: only the first candidate is used. */
79 if (converter->conversion_mode == OPENCC_CONVERSION_FAST) {
80 if (match_rs == NULL) {
/* No dictionary hit: step one unit forward in both buffers. */
82 (*outbuf)++, (*outbuf_left)--;
83 (*inbuf)++, (*inbuf_left)--;
85 const ucs4_t* result = match_rs[0];
/* The whole result must fit into the remaining output space. */
87 if (ucs4len(result) > *outbuf_left) {
88 errnum = CONVERTER_ERROR_OUTBUF;
/* Walk the zero-terminated result, advancing the output cursor per unit. */
91 for (; *result; result++) {
93 (*outbuf)++, (*outbuf_left)--;
/* The matched input units are now accounted for. */
96 *inbuf_left -= match_len;
98 }
else if (converter->conversion_mode ==
99 OPENCC_CONVERSION_LIST_CANDIDATES) {
/* LIST_CANDIDATES mode: every candidate is emitted, separated by DELIMITER. */
100 if (match_rs == NULL) {
102 (*outbuf)++, (*outbuf_left)--;
103 (*inbuf)++, (*inbuf_left)--;
106 for (i = 0; match_rs[i] != NULL; i++) {
107 const ucs4_t* result = match_rs[i];
/* A delimiter follows every candidate except the last one. */
108 int show_delimiter = match_rs[i + 1] != NULL ? 1 : 0;
/* Room is needed for the candidate plus its delimiter, if any. */
110 if (ucs4len(result) + show_delimiter > *outbuf_left) {
111 errnum = CONVERTER_ERROR_OUTBUF;
114 for (; *result; result++) {
116 (*outbuf)++, (*outbuf_left)--;
118 if (show_delimiter) {
119 **outbuf = DELIMITER;
120 (*outbuf)++, (*outbuf_left)--;
124 *inbuf_left -= match_len;
126 }
else if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) {
/* SEGMENT_ONLY mode: step through match_len units and append DELIMITER. */
127 if (match_rs == NULL) {
129 (*outbuf)++, (*outbuf_left)--;
130 (*inbuf)++, (*inbuf_left)--;
/* match_len units plus one DELIMITER must fit. */
133 if (match_len + 1 > *outbuf_left) {
134 errnum = CONVERTER_ERROR_OUTBUF;
138 for (i = 0; i < match_len; i++) {
140 (*outbuf)++, (*outbuf_left)--;
141 (*inbuf)++, (*inbuf_left)--;
144 **outbuf = DELIMITER;
145 (*outbuf)++, (*outbuf_left)--;
/* Presumably the final else of the mode chain: an unknown conversion mode. */
147 debug_should_not_be_here();
/* Indices 0..length are written below, hence length + 1 slots. */
155 size_t buffer_size_need = length + 1;
/* Allocate on first use, or grow when the current capacity is too small. */
156 if ((ossb->initialized == 0) || (ossb->buffer_size < buffer_size_need)) {
157 sp_seg_set_buffer_size(ossb, buffer_size_need);
/* Mark every position as unreached. */
160 for (i = 0; i <= length; i++) {
161 ossb->min_len[i] = INFINITY_INT;
/* Position 0 costs zero segments and is its own parent. */
163 ossb->min_len[0] = ossb->parent[0] = 0;
/* Relax outgoing edges of every start position i. */
164 for (i = 0; i < length; i++) {
/* Collect the lengths of the dictionary matches starting at i (arguments not
 * visible); they are read back from ossb->match_length[] below. */
166 size_t match_count = dict_group_get_all_match_lengths(
167 converter->current_dict_group,
/* Always allow a single-unit step: append length 1 unless it is already the
 * first entry. */
171 if (ossb->match_length[0] != 1) {
172 ossb->match_length[match_count++] = 1;
175 for (j = 0; j < match_count; j++) {
176 size_t k = ossb->match_length[j];
/* Reset the slot once its value has been read. */
177 ossb->match_length[j] = 0;
/* Multi-unit match: '<=' lets it replace an equally short path found
 * earlier. */
178 if ((k > 1) && (ossb->min_len[i] + 1 <= ossb->min_len[i + k])) {
179 ossb->min_len[i + k] = ossb->min_len[i] + 1;
180 ossb->parent[i + k] = i;
181 }
else if ((k == 1) &&
182 (ossb->min_len[i] + 1 < ossb->min_len[i + k])) {
/* Single-unit step: strict '<', so it never displaces an equally short
 * path. */
183 ossb->min_len[i + k] = ossb->min_len[i] + 1;
184 ossb->parent[i + k] = i;
/* Follow the parent links from the end back to position 0; j starts at the
 * number of segments. The loop body is not visible in this excerpt. */
189 for (i = length, j = ossb->min_len[length]; i != 0; i = ossb->parent[i]) {
/* Remember the initial input budget so consumption can be reported. */
192 size_t inbuf_left_start = *inbuf_left;
/* One iteration per segment of the chosen path. */
195 for (i = begin = 0; i < ossb->min_len[length]; i++) {
198 const ucs4_t*
const* match_rs = dict_group_match_longest(
199 converter->current_dict_group,
/* No dictionary entry for this segment: step one unit in both buffers. */
204 if (match_rs == NULL) {
206 (*outbuf)++, (*outbuf_left)--;
207 (*inbuf)++, (*inbuf_left)--;
209 if (converter->conversion_mode == OPENCC_CONVERSION_FAST) {
/* NOTE(review): match_rs was already tested against NULL above; if this code
 * sits in the else branch of that test, the checks below are redundant. The
 * nesting is not visible here. */
210 if (match_rs == NULL) {
212 (*outbuf)++, (*outbuf_left)--;
213 (*inbuf)++, (*inbuf_left)--;
215 const ucs4_t* result = match_rs[0];
217 if (ucs4len(result) > *outbuf_left) {
/* Separates "output full after some progress" from "not even the first
 * segment fits"; the statement taken on progress is not visible, errnum is
 * presumably set only in the other case. */
218 if (inbuf_left_start - *inbuf_left > 0) {
221 errnum = CONVERTER_ERROR_OUTBUF;
224 for (; *result; result++) {
226 (*outbuf)++, (*outbuf_left)--;
229 *inbuf_left -= match_len;
231 }
else if (converter->conversion_mode ==
232 OPENCC_CONVERSION_LIST_CANDIDATES) {
233 if (match_rs == NULL) {
235 (*outbuf)++, (*outbuf_left)--;
236 (*inbuf)++, (*inbuf_left)--;
/* NOTE(review): this loop is driven by a variable named `i`, the same name
 * as the index of the enclosing segment loop. No inner declaration is
 * visible in this excerpt - confirm one exists, otherwise the outer
 * iteration would be disturbed. */
239 for (i = 0; match_rs[i] != NULL; i++) {
240 const ucs4_t* result = match_rs[i];
241 int show_delimiter = match_rs[i + 1] != NULL ? 1 : 0;
243 if (ucs4len(result) + show_delimiter > *outbuf_left) {
244 if (inbuf_left_start - *inbuf_left > 0) {
247 errnum = CONVERTER_ERROR_OUTBUF;
250 for (; *result; result++) {
252 (*outbuf)++, (*outbuf_left)--;
254 if (show_delimiter) {
255 **outbuf = DELIMITER;
256 (*outbuf)++, (*outbuf_left)--;
260 *inbuf_left -= match_len;
262 }
else if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) {
263 if (match_rs == NULL) {
265 (*outbuf)++, (*outbuf_left)--;
266 (*inbuf)++, (*inbuf_left)--;
269 if (match_len + 1 > *outbuf_left) {
270 if (inbuf_left_start - *inbuf_left > 0) {
273 errnum = CONVERTER_ERROR_OUTBUF;
/* NOTE(review): same concern about the name `i` as in the candidate loop
 * above. */
277 for (i = 0; i < match_len; i++) {
279 (*outbuf)++, (*outbuf_left)--;
280 (*inbuf)++, (*inbuf_left)--;
283 **outbuf = DELIMITER;
284 (*outbuf)++, (*outbuf_left)--;
286 debug_should_not_be_here();
/* Number of input units consumed by this call. */
291 return inbuf_left_start - *inbuf_left;
/*
 * Top-level segmenter of the SEGMENT_SHORTEST_PATH build.
 *
 * Scans the input starting at *inbuf and keeps in `bound` the furthest end
 * of any dictionary match seen so far. When the scan index reaches `bound`,
 * sp_seg() is called (presumably for the run collected since `start`; the
 * call arguments are not visible). A last sp_seg() call after the loop
 * handles what is left if both buffers still have room.
 *
 * Returns the number of input units consumed, i.e. inbuf_left_start -
 * *inbuf_left. When sp_seg() reports no progress, the amount consumed so far
 * is returned if it is non-zero; otherwise errnum becomes
 * CONVERTER_ERROR_OUTBUF. The handling of a (size_t)-1 result from sp_seg()
 * is not visible in this excerpt.
 *
 * NOTE(review): `bound` is declared without an initialiser and is read by
 * the `i + match_len > bound` test; confirm it is assigned before the loop
 * (no such statement is visible here).
 */
294 static size_t segment(
Converter* converter,
298 size_t* outbuf_left) {
300 size_t i, start, bound;
301 const ucs4_t* inbuf_start = *inbuf;
302 size_t inbuf_left_start = *inbuf_left;
303 size_t sp_seg_length;
/* Stop at the terminating zero or as soon as either buffer is exhausted. */
305 for (i = start = 0; inbuf_start[i] && *inbuf_left > 0 && *outbuf_left > 0;
/* i == bound: no match seen so far extends beyond i, so a run ends here. */
307 if ((i != 0) && (i == bound)) {
309 sp_seg_length = sp_seg(converter,
/* (size_t)-1 is the failure value of sp_seg(). */
316 if (sp_seg_length == (
size_t)-1) {
/* Zero means sp_seg() could not make any progress. */
319 if (sp_seg_length == 0) {
/* Earlier runs were converted: report them instead of failing. */
320 if (inbuf_left_start - *inbuf_left > 0) {
321 return inbuf_left_start - *inbuf_left;
/* Nothing was converted at all: the output buffer is too small. */
324 errnum = CONVERTER_ERROR_OUTBUF;
/* Probe the longest match at the current position; only match_len is used,
 * the returned list is discarded. Arguments are not visible. */
330 dict_group_match_longest(
331 converter->current_dict_group,
/* No dictionary hit at this position; the adjustment made in this case is
 * not visible. */
336 if (match_len == 0) {
/* Push the run boundary out to the end of this match. */
339 if (i + match_len > bound) {
340 bound = i + match_len;
/* Flush the trailing run if both buffers still have room. */
343 if ((*inbuf_left > 0) && (*outbuf_left > 0)) {
344 sp_seg_length = sp_seg(converter,
350 if (sp_seg_length == (
size_t)-1) {
353 if (sp_seg_length == 0) {
354 if (inbuf_left_start - *inbuf_left > 0) {
355 return inbuf_left_start - *inbuf_left;
358 errnum = CONVERTER_ERROR_OUTBUF;
/* Extra step for SEGMENT_ONLY mode; its body is not visible. */
362 if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) {
366 return inbuf_left_start - *inbuf_left;
371 #if SEGMENT_METHOD == SEGMENT_MAXIMUM_LENGTH
/*
 * Top-level segmenter of the SEGMENT_MAXIMUM_LENGTH build.
 *
 * Guarded by the `#if SEGMENT_METHOD == SEGMENT_MAXIMUM_LENGTH` directly
 * above; since this file defines SEGMENT_METHOD as SEGMENT_SHORTEST_PATH,
 * this variant is currently excluded from compilation.
 *
 * Repeatedly takes the longest dictionary match at the current input
 * position and emits it according to converter->conversion_mode until the
 * input's terminating zero is reached or either buffer runs out.
 *
 * Returns the number of input units consumed (inbuf_left_start -
 * *inbuf_left). errnum is set to CONVERTER_ERROR_OUTBUF when a result does
 * not fit; the statement executed when some input was already consumed is
 * not visible in this excerpt.
 */
372 static size_t segment(
Converter* converter,
376 size_t* outbuf_left) {
378 size_t inbuf_left_start = *inbuf_left;
379 for (; **inbuf && *inbuf_left > 0 && *outbuf_left > 0;) {
/* match_rs is NULL when nothing matches, otherwise a NULL-terminated list of
 * candidate results. Arguments after the group are not visible. */
381 const ucs4_t*
const* match_rs = dict_group_match_longest(
382 converter->current_dict_group,
/* FAST mode: only the first candidate is used. */
387 if (converter->conversion_mode == OPENCC_CONVERSION_FAST) {
388 if (match_rs == NULL) {
/* No dictionary hit: step one unit forward in both buffers. */
390 (*outbuf)++, (*outbuf_left)--;
391 (*inbuf)++, (*inbuf_left)--;
393 const ucs4_t* result = match_rs[0];
/* The whole result must fit into the remaining output space. */
395 if (ucs4len(result) > *outbuf_left) {
/* Distinguishes partial progress from a buffer too small for even one
 * segment. */
396 if (inbuf_left_start - *inbuf_left > 0) {
399 errnum = CONVERTER_ERROR_OUTBUF;
402 for (; *result; result++) {
404 (*outbuf)++, (*outbuf_left)--;
407 *inbuf_left -= match_len;
409 }
else if (converter->conversion_mode ==
410 OPENCC_CONVERSION_LIST_CANDIDATES) {
/* LIST_CANDIDATES mode: every candidate is emitted, separated by DELIMITER. */
411 if (match_rs == NULL) {
413 (*outbuf)++, (*outbuf_left)--;
414 (*inbuf)++, (*inbuf_left)--;
417 for (i = 0; match_rs[i] != NULL; i++) {
418 const ucs4_t* result = match_rs[i];
/* A delimiter follows every candidate except the last one. */
419 int show_delimiter = match_rs[i + 1] != NULL ? 1 : 0;
421 if (ucs4len(result) + show_delimiter > *outbuf_left) {
422 if (inbuf_left_start - *inbuf_left > 0) {
425 errnum = CONVERTER_ERROR_OUTBUF;
428 for (; *result; result++) {
430 (*outbuf)++, (*outbuf_left)--;
432 if (show_delimiter) {
433 **outbuf = DELIMITER;
434 (*outbuf)++, (*outbuf_left)--;
438 *inbuf_left -= match_len;
440 }
else if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) {
/* SEGMENT_ONLY mode: step through match_len units and append DELIMITER. */
441 if (match_rs == NULL) {
443 (*outbuf)++, (*outbuf_left)--;
444 (*inbuf)++, (*inbuf_left)--;
/* match_len units plus one DELIMITER must fit. */
447 if (match_len + 1 > *outbuf_left) {
448 if (inbuf_left_start - *inbuf_left > 0) {
451 errnum = CONVERTER_ERROR_OUTBUF;
455 for (i = 0; i < match_len; i++) {
457 (*outbuf)++, (*outbuf_left)--;
458 (*inbuf)++, (*inbuf_left)--;
461 **outbuf = DELIMITER;
462 (*outbuf)++, (*outbuf_left)--;
/* Presumably the final else of the mode chain: an unknown conversion mode. */
464 debug_should_not_be_here();
/* Extra step for SEGMENT_ONLY mode; its body is not visible. */
467 if (converter->conversion_mode == OPENCC_CONVERSION_SEGMENT_ONLY) {
471 return inbuf_left_start - *inbuf_left;
/*
 * Public conversion entry point.
 *
 * Visible behaviour:
 *   - Without a dictionary chain, errnum becomes CONVERTER_ERROR_NODICT (the
 *     return taken in that case is not visible).
 *   - With exactly one group in the chain, the work is delegated to a single
 *     segment() call.
 *   - Otherwise segment() runs once per group of the chain. A scratch buffer
 *     (tmpbuf) of outbuf_size units is allocated, `cur` flips between 0 and
 *     1 on every pass, and the number of units produced by one pass
 *     (coutbuf_delta) becomes the input length of the next one.
 *
 * NOTE(review): the remaining parameters, the choice of source and
 * destination buffer per pass, the handling of a (size_t)-1 result and the
 * release of tmpbuf are not visible in this excerpt.
 * NOTE(review): the malloc() result is not checked in the visible lines.
 */
476 size_t converter_convert(
Converter* converter,
480 size_t* outbuf_left) {
481 if (converter->dict_chain == NULL) {
482 errnum = CONVERTER_ERROR_NODICT;
/* Single dictionary group: no intermediate buffer is needed. */
485 if (converter->dict_chain->count == 1) {
487 return segment(converter,
/* Snapshot of the caller's budgets; every pass works on copies. */
494 size_t inbuf_size = *inbuf_left;
495 size_t outbuf_size = *outbuf_left;
/* Starts out as the failure value that segment() results are tested for. */
496 size_t retval = (size_t)-1;
497 size_t cinbuf_left, coutbuf_left;
498 size_t coutbuf_delta = 0;
/* Scratch buffer with the same capacity as the caller's output space. */
500 ucs4_t* tmpbuf = (ucs4_t*)malloc(
sizeof(ucs4_t) * outbuf_size);
501 ucs4_t* orig_outbuf = *outbuf;
502 ucs4_t* cinbuf, * coutbuf;
503 cinbuf_left = inbuf_size;
504 coutbuf_left = outbuf_size;
/* One pass per dictionary group; cur alternates 0, 1, 0, ... */
507 for (i = cur = 0; i < converter->dict_chain->count; ++i, cur = 1 - cur) {
/* The previous pass's output length becomes this pass's input length and the
 * output budget is reset to full size (the guard around these two lines is
 * not visible). */
509 cinbuf_left = coutbuf_delta;
510 coutbuf_left = outbuf_size;
/* orig_outbuf acts as destination in one case and as source in another,
 * presumably selected by cur; the tmpbuf counterparts are not visible. */
514 coutbuf = orig_outbuf;
516 cinbuf = orig_outbuf;
/* Activate the dictionary group for this pass (index argument not visible). */
520 converter->current_dict_group = dict_chain_get_group(
521 converter->dict_chain,
523 size_t ret = segment(converter,
528 if (ret == (
size_t)-1) {
/* Units written by this pass. */
532 coutbuf_delta = outbuf_size - coutbuf_left;
/* Publish the remaining input to the caller (guard, if any, not visible). */
536 *inbuf_left = cinbuf_left;
/* Copy the result from the scratch buffer into the caller's output -
 * presumably only when the last pass wrote into tmpbuf; confirm. */
541 memcpy(*outbuf, tmpbuf, coutbuf_delta *
sizeof(ucs4_t));
/* Advance the caller's output cursor and publish the remaining space. */
543 *outbuf += coutbuf_delta;
544 *outbuf_left = coutbuf_left;
/* NOTE(review): the header of the enclosing function is not visible in this
 * excerpt. The visible lines attach a dictionary chain to the converter and,
 * when the chain holds at least one group, select a group from it as
 * current_dict_group (the index argument is not visible). The chain pointer
 * is dereferenced without a visible NULL check. */
550 converter->dict_chain = dict_chain;
551 if (converter->dict_chain->count > 0) {
552 converter->current_dict_group = dict_chain_get_group(
553 converter->dict_chain,
/* NOTE(review): the header of the enclosing function and the origin of
 * converter->data are not visible in this excerpt. The visible lines put a
 * converter into its initial state: no dictionary chain and no current
 * group. */
560 converter->dict_chain = NULL;
561 converter->current_dict_group = NULL;
562 #if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH
564 SpsegData* spseg_buffer = converter->data;
/* Clearing `initialized` first keeps sp_seg_set_buffer_size() from freeing
 * arrays that have not been allocated yet. */
565 spseg_buffer->initialized = 0;
566 spseg_buffer->match_length = NULL;
567 spseg_buffer->min_len = NULL;
568 spseg_buffer->parent = NULL;
569 spseg_buffer->path = NULL;
/* Allocate the work arrays at the default capacity. */
570 sp_seg_set_buffer_size(spseg_buffer, OPENCC_SP_SEG_DEFAULT_BUFFER_SIZE);
/*
 * Tears down a converter.
 *
 * In the SEGMENT_SHORTEST_PATH build the shortest-path work buffer behind
 * converter->data is handed to sp_seg_buffer_free(). Any further cleanup is
 * not visible in this excerpt.
 */
575 void converter_close(
Converter* converter) {
576 #if SEGMENT_METHOD == SEGMENT_SHORTEST_PATH
577 sp_seg_buffer_free(converter->data);
/*
 * Stores the conversion mode on the converter.
 *
 * The segmentation routines in this file branch on this field and handle
 * OPENCC_CONVERSION_FAST, OPENCC_CONVERSION_LIST_CANDIDATES and
 * OPENCC_CONVERSION_SEGMENT_ONLY. The value is stored without validation.
 */
583 void converter_set_conversion_mode(
Converter* converter,
584 opencc_conversion_mode conversion_mode) {
585 converter->conversion_mode = conversion_mode;
/* Accessor for this module's error state. The body is not visible in this
 * excerpt; presumably it returns the file-scope errnum - confirm. */
588 converter_error converter_errno(
void) {
/*
 * Reports an error of this module as a message via perr().
 *
 * Visible cases: CONVERTER_ERROR_VOID (no visible action),
 * CONVERTER_ERROR_NODICT and CONVERTER_ERROR_OUTBUF. The switch expression,
 * the use made of `spec` and the break statements are not visible in this
 * excerpt.
 * NOTE(review): `_()` looks like a gettext-style translation wrapper -
 * confirm against the project's localisation header.
 */
592 void converter_perror(
const char* spec) {
596 case CONVERTER_ERROR_VOID:
598 case CONVERTER_ERROR_NODICT:
599 perr(_(
"No dictionary loaded"));
601 case CONVERTER_ERROR_OUTBUF:
602 perr(_(
"Output buffer not enough for one segment"));