19 #include "../dictionary/datrie.h"
20 #include "../dictionary/text.h"
21 #include "../dict_group.h"
22 #include "../encoding.h"
/* Bound applied to dat[] indices: the initialisation and scan loops stay
 * below it, and insert() reports an error when its offset search reaches it. */
31 #define DATRIE_SIZE 1000000
/* Element count of lexicon[], words_set[], words_set_char[] and of the
 * static scratch tables declared with the same size. */
32 #define DATRIE_WORD_MAX_COUNT 500000
/* Size, in ucs4_t elements, of the prefix scratch buffer in
 * get_words_with_prefix(). */
33 #define DATRIE_WORD_MAX_LENGTH 32
/* Size in bytes of the input/output file name buffers in main(). */
34 #define BUFFER_SIZE 1024
/* Dictionary entries being compiled; init() fills the first lexicon_count
 * slots from tlexicon[] after sorting it with cmp(). */
48 Entry lexicon[DATRIE_WORD_MAX_COUNT];
/* lexicon_count: number of valid entries in lexicon[].
 * words_set_count: number of valid entries in words_set[]. */
49 uint32_t lexicon_count, words_set_count;
/* Indices into lexicon[] appended by get_words_with_prefix() and consumed
 * by insert_words(). */
50 int words_set[DATRIE_WORD_MAX_COUNT];
/* 0-terminated character list written by get_words_with_prefix() and read
 * by words_space_available() and insert(). */
51 ucs4_t words_set_char[DATRIE_WORD_MAX_COUNT];
/* lexicon_index_length: sum of (value_count + 1) over all entries,
 * accumulated in init() and written by output().
 * lexicon_cursor_end: total length in ucs4_t units, terminators included,
 * of all value strings; set in init() and written by output(). */
53 uint32_t lexicon_index_length, lexicon_cursor_end;
/* Continue while the word has characters left, the length limit is not
 * reached (limit == 0 disables the limit) and dat[i].base is in use. */
62 word[p] && (limit == 0 || p < limit) && dat[i].base != DATRIE_UNUSED;
64 int k = encode_char(word[p]);
/* Stop when the candidate slot j is out of range or is not a child of i.
 * NOTE(review): `j > DATRIE_SIZE` lets j == DATRIE_SIZE through to dat[j];
 * the other bounds in this file use `< DATRIE_SIZE`, so this looks like it
 * should be `j >= DATRIE_SIZE` - confirm against the declared size of dat. */
66 if ((j < 0) || (j > DATRIE_SIZE) || (dat[j].parent != i)) {
/* For an index inside [0, DATRIE_SIZE) the slot counts as free exactly when
 * its parent field is DATRIE_UNUSED. */
80 if ((i >= 0) && (i < DATRIE_SIZE)) {
81 return dat[i].parent == DATRIE_UNUSED;
/*
 * Prefix test between two ucs4_t strings; callers in this file treat a
 * non-zero result as a match.
 * NOTE(review): presumably reports whether a is a prefix of b, as the call
 * is_prefix(buff, lexicon[i].key) suggests - confirm.
 */
86 int is_prefix(
const ucs4_t* a,
const ucs4_t* b) {
/* p and q are read cursors over a and b. */
87 const ucs4_t* p = a, * q = b;
/*
 * Searches lexicon[] for str using ucs4cmp(); get_words_with_prefix() starts
 * its forward scan at the index returned here.
 */
101 int binary_search(
const ucs4_t* str) {
/* a and b start at the first and last valid indices; c is the index probed.
 * NOTE(review): lexicon_count is uint32_t, so with an empty lexicon
 * `lexicon_count - 1` wraps before it is converted to int - confirm this is
 * never reached with zero entries. */
102 int a = 0, b = lexicon_count - 1, c;
106 if (ucs4cmp(str, lexicon[c].key) <= 0) {
/* a == 0 is tested first, so lexicon[a - 1] is only read when a > 0. */
114 ((a == 0) || !is_prefix(str, lexicon[a - 1].key))) {
/* NOTE(review): unlike the check on a, nothing on this line guards b == 0
 * before lexicon[b - 1] is read - confirm b > 0 always holds here. */
117 if (is_prefix(str, lexicon[b].key) && !is_prefix(str, lexicon[b - 1].key)) {
123 int wcmp(
const void* a,
const void* b) {
124 return *(
const ucs4_t*)a < *(
const ucs4_t*)b ? -1 : 1;
/*
 * Gathers the lexicon entries that have the prefix held in buff (presumably
 * the first p characters of word). For each one it appends the entry's
 * lexicon index to words_set[] and the character at position p of its key to
 * a scratch list, then sorts that list and walks it into words_set_char[].
 */
127 void get_words_with_prefix(ucs4_t* word,
int p) {
/* Prefix scratch buffer.
 * NOTE(review): holds DATRIE_WORD_MAX_LENGTH elements and no check that p
 * fits is visible here - confirm keys cannot exceed that length. */
129 static ucs4_t buff[DATRIE_WORD_MAX_LENGTH];
/* Unsorted next characters, one per collected entry. */
130 static ucs4_t words_set_char_buff[DATRIE_WORD_MAX_COUNT];
132 for (i = 0; i < p; i++) {
/* Scan forward from the index binary_search() returns for as long as the
 * entries still have buff as a prefix. */
137 for (i = binary_search(buff);
138 (uint32_t)i < lexicon_count && is_prefix(buff, lexicon[i].key); i++) {
/* ucs4cmp() == 0: presumably the key is the prefix itself. */
139 if (ucs4cmp(buff, lexicon[i].key) == 0) {
/* Record the character that follows the prefix and the entry's index. */
142 words_set_char_buff[words_set_count] = lexicon[i].key[p];
143 words_set[words_set_count++] = i;
/* 0-terminate the scratch list; the walk below stops on *wfp == 0.
 * NOTE(review): writes index words_set_count, which needs one spare slot
 * beyond the collected entries - confirm the count stays below
 * DATRIE_WORD_MAX_COUNT. */
145 words_set_char_buff[words_set_count] = 0;
146 qsort(words_set_char_buff, words_set_count,
sizeof(words_set_char_buff[0]),
148 ucs4_t* wfp, * wp, last;
/* wfp reads the sorted scratch list, wp writes words_set_char[]; last
 * starts at 0. */
149 for (last = 0, wfp = words_set_char_buff, wp = words_set_char; *wfp; wfp++) {
/*
 * Tests offset delta against the characters in words_set_char[]: for each
 * character c the slot encode_char(c) + delta is checked with unused().
 */
159 int words_space_available(
int delta) {
/* words_set_char[] is walked up to its 0 terminator. */
161 for (wp = words_set_char; *wp; wp++) {
/* Slot for this character is already taken at this delta. */
162 if (!unused(encode_char(*wp) + delta)) {
/*
 * Prepares the dat[] slot of the first character of lexicon[id].
 * The slot index is encode_char() of that character; its base is set to
 * DATRIE_UNUSED.
 */
169 void insert_first_char(
int id) {
170 Entry* word = lexicon + id;
171 int key = encode_char(word->key[0]);
172 dat[key].base = DATRIE_UNUSED;
/* A one-character key ends at this slot, so the slot stores the entry id. */
174 if (word->length == 1) {
175 dat[key].word = (id);
/*
 * Links every entry listed in words_set[] below `parent`.
 * delta:    offset added to the encoded character to obtain the child slot.
 * parent:   value stored in the child's parent field.
 * word_len: position in the key of the character being placed.
 */
179 void insert_words(
int delta,
int parent,
size_t word_len) {
181 for (i = 0; (uint32_t)i < words_set_count; i++) {
182 int j = words_set[i];
/* Child slot = encoded character at position word_len, shifted by delta. */
183 int k = encode_char(lexicon[j].key[word_len]) + delta;
184 dat[k].parent = parent;
/* True when that character is the last one of the key. */
185 if (lexicon[j].length == word_len + 1) {
/*
 * Inserts lexicon[id] into dat[].
 * Matches the key against what is already stored, collects the entries
 * sharing the matched prefix, searches for an offset delta accepted by
 * words_space_available(), and places them with insert_words().
 */
191 void insert(
int id) {
/* Kept across calls; the loop at the end of the function moves it past
 * slots that are no longer unused(). */
192 static int space_min = 0;
193 Entry* word = &lexicon[id];
/* p and i are outputs; they are later passed to insert_words() as the
 * word_len and parent arguments. */
196 match_word(dat, word->key, &p, &i, 0);
/* The whole key was matched. */
197 if ((
size_t)p == word->length) {
200 get_words_with_prefix(word->key, p);
/* First candidate offset.
 * NOTE(review): this subtracts the raw character words_set_char[0], whereas
 * words_space_available() offsets encode_char() of each character; the
 * subtraction also mixes int with ucs4_t - confirm both are intended. */
202 delta = space_min - words_set_char[0];
203 for (; delta < DATRIE_SIZE; delta++) {
204 if (words_space_available(delta)) {
/* The search reached the bound without finding a usable offset. */
208 if (delta == DATRIE_SIZE) {
209 fprintf(stderr,
"DATRIE_SIZE Not Enough!\n");
212 insert_words(delta, i, p);
214 while (!unused(space_min)) {
/* Mark every slot except slot 0 as unused. */
222 for (i = 1; i < DATRIE_SIZE; i++) {
223 dat[i].parent = dat[i].base = DATRIE_UNUSED;
/* Slot 0 gets parent and base 0. */
226 dat[0].parent = dat[0].base = 0;
/* First pass: prepare the first-character slot of every entry. */
227 for (i = 0; i < lexicon_count; i++) {
228 insert_first_char(i);
/* Second pass over all entries. */
230 for (i = 0; i < lexicon_count; i++) {
/* qsort() comparator; init() passes it when sorting tlexicon[]. */
235 int cmp(
const void* a,
const void* b) {
/*
 * Loads the text dictionary `filename` and fills lexicon[] from it.
 * The loaded entries are sorted with cmp(); for each one the key pointer,
 * the key length and a Value array (string pointer plus cursor) are stored.
 * Sets lexicon_count and lexicon_cursor_end and accumulates
 * lexicon_index_length.
 */
239 void init(
const char* filename) {
/* dict_group_load() signals failure with -1. */
241 if (dict_group_load(DictGroup, filename,
242 OPENCC_DICTIONARY_TYPE_TEXT) == -1) {
243 dictionary_perror(
"Dictionary loading error");
244 fprintf(stderr, _(
"\n"));
/* dict_group_get_dict() signals failure with (Dict*)-1. */
247 Dict* dict_abs = dict_group_get_dict(DictGroup, 0);
248 if (dict_abs == (
Dict*)-1) {
249 dictionary_perror(
"Dictionary loading error");
250 fprintf(stderr, _(
"\n"));
/* Staging table, static so it does not live on the stack.
 * NOTE(review): no check that the returned count fits
 * DATRIE_WORD_MAX_COUNT is visible here - confirm
 * dict_text_get_lexicon() cannot exceed it. */
253 static TextEntry tlexicon[DATRIE_WORD_MAX_COUNT];
255 Dict* dictionary = dict_abs->dict;
256 lexicon_count = dict_text_get_lexicon(dictionary, tlexicon);
257 qsort(tlexicon, lexicon_count,
sizeof(tlexicon[0]), cmp);
/* Running offset, in ucs4_t units, of the next value string. */
259 size_t lexicon_cursor = 0;
260 for (i = 0; i < lexicon_count; i++) {
/* The key pointer is shared with tlexicon, not copied. */
261 lexicon[i].key = tlexicon[i].key;
262 lexicon[i].length = ucs4len(lexicon[i].key);
/* Count the values: the list ends at the first NULL pointer. */
264 for (j = 0; tlexicon[i].value[j] != NULL; j++) {}
265 lexicon[i].value_count = j;
/* One slot per value plus one extra per entry; lexicon_index_length is a
 * file-scope object, so it starts at 0. */
266 lexicon_index_length += lexicon[i].value_count + 1;
/* NOTE(review): the malloc() result is used without a NULL check, and an
 * entry with no values requests 0 bytes - confirm this is acceptable. */
267 lexicon[i].value = (
Value*)malloc(lexicon[i].value_count *
sizeof(
Value));
268 for (j = 0; j < lexicon[i].value_count; j++) {
269 lexicon[i].value[j].cursor = lexicon_cursor;
270 lexicon[i].value[j].pointer = tlexicon[i].value[j];
/* Advance past the string and its terminator. */
271 lexicon_cursor += ucs4len(tlexicon[i].value[j]) + 1;
/* NOTE(review): narrows size_t to uint32_t. */
274 lexicon_cursor_end = lexicon_cursor;
/*
 * Writes the compiled dictionary to file_name, opened in binary mode.
 * Data is written in this order: the 12 bytes of "OPENCCDATRIE" (no
 * terminator), lexicon_cursor_end, every value string with its terminator,
 * lexicon_index_length, the value cursors with a 0xFFFFFFFF delimiter,
 * lexicon_count, item_count, then item_count elements of dat[].
 * Integers are written as raw objects, i.e. in host byte order.
 * NOTE(review): none of the fwrite() results are checked.
 */
277 void output(
const char* file_name) {
278 FILE* fp = fopen(file_name,
"wb");
280 fprintf(stderr, _(
"Can not write file: %s\n"), file_name);
/* NOTE(review): item_count has no initializer; confirm the scan below
 * assigns it on every path before it is written. */
283 uint32_t i, item_count;
/* Scan downwards for the highest slot whose parent is in use. */
284 for (i = DATRIE_SIZE - 1; i > 0; i--) {
285 if (dat[i].parent != DATRIE_UNUSED) {
/* File tag: strlen() bytes, so the terminating NUL is not written. */
290 fwrite(
"OPENCCDATRIE",
sizeof(
char), strlen(
"OPENCCDATRIE"), fp);
292 fwrite(&lexicon_cursor_end,
sizeof(uint32_t), 1, fp);
/* Value strings, each written with its terminator (+ 1). */
293 for (i = 0; i < lexicon_count; i++) {
295 for (j = 0; j < lexicon[i].value_count; j++) {
296 fwrite(lexicon[i].value[j].pointer,
sizeof(ucs4_t),
297 ucs4len(lexicon[i].value[j].pointer) + 1, fp);
301 fwrite(&lexicon_index_length,
sizeof(uint32_t), 1, fp);
/* Index section: the cursor of every value. */
302 for (i = 0; i < lexicon_count; i++) {
304 for (j = 0; j < lexicon[i].value_count; j++) {
/* NOTE(review): writes sizeof(uint32_t) bytes from &cursor; this assumes
 * the cursor field is 32 bits wide - confirm against Value's definition. */
305 fwrite(&lexicon[i].value[j].cursor,
sizeof(uint32_t), 1, fp);
/* Delimiter with all bits set; presumably written once per entry, matching
 * the "+ 1" added per entry to lexicon_index_length. */
307 uint32_t dem = (uint32_t)-1;
308 fwrite(&dem,
sizeof(uint32_t), 1, fp);
310 fwrite(&lexicon_count,
sizeof(uint32_t), 1, fp);
311 fwrite(&item_count,
sizeof(uint32_t), 1, fp);
/* Only the first item_count elements of dat[] are written. */
312 fwrite(dat,
sizeof(dat[0]), item_count, fp);
316 #ifdef DEBUG_WRITE_TEXT
/*
 * Debug helper, compiled only with DEBUG_WRITE_TEXT: dumps the lexicon and
 * the used dat[] slots as text to "datrie.txt".
 */
317 void write_text_file() {
/* NOTE(review): fp is used right away without a NULL check. */
320 fp = fopen(
"datrie.txt",
"w");
/* NOTE(review): lexicon_count is uint32_t but is printed with %d. */
321 fprintf(fp,
"%d\n", lexicon_count);
322 for (i = 0; i < lexicon_count; i++) {
/* NOTE(review): init() stores a Value array in lexicon[i].value, yet it is
 * passed here where a ucs4 string is presumably expected - confirm. */
323 char* buff = ucs4_to_utf8(lexicon[i].value, (
size_t)-1);
324 fprintf(fp,
"%s\n", buff);
/* One line per slot in use: index, base, parent, word. */
327 for (i = 0; i < DATRIE_SIZE; i++) {
328 if (dat[i].parent != DATRIE_UNUSED) {
329 fprintf(fp,
"%d %d %d %d\n", i, dat[i].base, dat[i].parent, dat[i].word);
/* Prints the program banner; the format string is passed through _() and
 * takes one %s argument for the version. */
337 void show_version() {
338 printf(_(
"\nOpen Chinese Convert (OpenCC) Dictionary Tool\nVersion %s\n\n"),
/* Usage text: each line is a separate printf() whose string is passed
 * through _() individually. */
344 printf(_(
"Usage:\n"));
345 printf(_(
" opencc_dict -i input_file -o output_file\n\n"));
346 printf(_(
" -i input_file\n"));
347 printf(_(
" Read data from input_file.\n"));
348 printf(_(
" -o output_file\n"));
349 printf(_(
" Write converted data to output_file.\n"));
/*
 * Command-line entry point: parses the options with getopt(), requires both
 * an input file (-i) and an output file (-o), and reports a message on
 * stderr when either is missing.
 */
354 int main(
int argc,
char** argv) {
/* File names copied from the command line; each buffer holds BUFFER_SIZE
 * bytes. */
356 static char input_file[BUFFER_SIZE], output_file[BUFFER_SIZE];
357 int input_file_specified = 0, output_file_specified = 0;
/* Locale and message catalogue setup, only when built with gettext. */
359 #ifdef ENABLE_GETTEXT
360 setlocale(LC_ALL,
"");
361 bindtextdomain(PACKAGE_NAME, LOCALEDIR);
/* Option string: -v, -h, "-" taking an argument, -i and -o taking an
 * argument. */
363 while ((oc = getopt(argc, argv,
"vh-:i:o:")) != -1) {
/* The option argument is compared against the long names "version" and
 * "help". */
373 if (strcmp(optarg,
"version") == 0) {
375 }
else if (strcmp(optarg,
"help") == 0) {
/* NOTE(review): strcpy() copies optarg without a length check; an argument
 * of BUFFER_SIZE bytes or more overruns input_file - consider a bounded
 * copy. */
382 strcpy(input_file, optarg);
383 input_file_specified = 1;
/* NOTE(review): same unbounded copy as for input_file. */
386 strcpy(output_file, optarg);
387 output_file_specified = 1;
391 if (!input_file_specified) {
392 fprintf(stderr, _(
"Please specify input file using -i.\n"));
396 if (!output_file_specified) {
397 fprintf(stderr, _(
"Please specify output file using -o.\n"));
404 #ifdef DEBUG_WRITE_TEXT