Open Chinese Convert  0.4.3
A project for conversion between Traditional and Simplified Chinese
 All Data Structures Files Functions Variables Groups Pages
opencc_dict.c
1 /*
2  * Open Chinese Convert
3  *
4  * Copyright 2010-2013 BYVoid <byvoid@byvoid.com>
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 #include "../dictionary/datrie.h"
20 #include "../dictionary/text.h"
21 #include "../dict_group.h"
22 #include "../encoding.h"
23 #include "../utils.h"
24 #include <locale.h>
25 #include <unistd.h>
26 
/* Version string printed by show_version(); may be predefined externally. */
#if !defined(VERSION)
#define VERSION ""
#endif

/* Number of DatrieItem slots in the trie table `dat`. */
#define DATRIE_SIZE 1000000
/* Capacity of the lexicon and of the per-prefix working sets. */
#define DATRIE_WORD_MAX_COUNT 500000
/* Capacity, in characters, of the prefix scratch buffer. */
#define DATRIE_WORD_MAX_LENGTH 32
/* Size of fixed character buffers. */
#define BUFFER_SIZE 1024
35 
36 typedef struct {
37  uint32_t cursor;
38  ucs4_t* pointer;
39 } Value;
40 
41 typedef struct {
42  ucs4_t* key;
43  Value* value;
44  size_t length;
45  size_t value_count;
46 } Entry;
47 
48 Entry lexicon[DATRIE_WORD_MAX_COUNT];
49 uint32_t lexicon_count, words_set_count;
50 int words_set[DATRIE_WORD_MAX_COUNT];
51 ucs4_t words_set_char[DATRIE_WORD_MAX_COUNT];
52 DatrieItem dat[DATRIE_SIZE];
53 uint32_t lexicon_index_length, lexicon_cursor_end;
54 
55 void match_word(const DatrieItem* dat,
56  const ucs4_t* word,
57  int* match_pos,
58  int* id,
59  int limit) {
60  int i, j, p;
61  for (i = 0, p = 0;
62  word[p] && (limit == 0 || p < limit) && dat[i].base != DATRIE_UNUSED;
63  p++) {
64  int k = encode_char(word[p]);
65  j = dat[i].base + k;
66  if ((j < 0) || (j > DATRIE_SIZE) || (dat[j].parent != i)) {
67  break;
68  }
69  i = j;
70  }
71  if (match_pos) {
72  *match_pos = p;
73  }
74  if (id) {
75  *id = i;
76  }
77 }
78 
79 int unused(int i) {
80  if ((i >= 0) && (i < DATRIE_SIZE)) {
81  return dat[i].parent == DATRIE_UNUSED;
82  }
83  return 0;
84 }
85 
86 int is_prefix(const ucs4_t* a, const ucs4_t* b) {
87  const ucs4_t* p = a, * q = b;
88  while (*p != 0) {
89  if (*q == 0) {
90  return 0;
91  }
92  if (*p != *q) {
93  return 0;
94  }
95  p++;
96  q++;
97  }
98  return 1;
99 }
100 
101 int binary_search(const ucs4_t* str) {
102  int a = 0, b = lexicon_count - 1, c;
103  while (a + 1 < b) {
104  c = (a + b) / 2;
105 
106  if (ucs4cmp(str, lexicon[c].key) <= 0) {
107  b = c;
108  } else {
109  a = c + 1;
110  }
111  }
112  if (is_prefix(str,
113  lexicon[a].key) &&
114  ((a == 0) || !is_prefix(str, lexicon[a - 1].key))) {
115  return a;
116  }
117  if (is_prefix(str, lexicon[b].key) && !is_prefix(str, lexicon[b - 1].key)) {
118  return b;
119  }
120  return -1;
121 }
122 
123 int wcmp(const void* a, const void* b) {
124  return *(const ucs4_t*)a < *(const ucs4_t*)b ? -1 : 1;
125 }
126 
127 void get_words_with_prefix(ucs4_t* word, int p) {
128  int i;
129  static ucs4_t buff[DATRIE_WORD_MAX_LENGTH];
130  static ucs4_t words_set_char_buff[DATRIE_WORD_MAX_COUNT];
131 
132  for (i = 0; i < p; i++) {
133  buff[i] = word[i];
134  }
135  buff[p] = 0;
136  words_set_count = 0;
137  for (i = binary_search(buff);
138  (uint32_t)i < lexicon_count && is_prefix(buff, lexicon[i].key); i++) {
139  if (ucs4cmp(buff, lexicon[i].key) == 0) {
140  continue;
141  }
142  words_set_char_buff[words_set_count] = lexicon[i].key[p];
143  words_set[words_set_count++] = i;
144  }
145  words_set_char_buff[words_set_count] = 0;
146  qsort(words_set_char_buff, words_set_count, sizeof(words_set_char_buff[0]),
147  wcmp);
148  ucs4_t* wfp, * wp, last;
149  for (last = 0, wfp = words_set_char_buff, wp = words_set_char; *wfp; wfp++) {
150  if (*wfp != last) {
151  last = *wfp;
152  *wp = *wfp;
153  wp++;
154  }
155  }
156  *wp = 0;
157 }
158 
159 int words_space_available(int delta) {
160  ucs4_t* wp;
161  for (wp = words_set_char; *wp; wp++) {
162  if (!unused(encode_char(*wp) + delta)) {
163  return 0;
164  }
165  }
166  return 1;
167 }
168 
169 void insert_first_char(int id) {
170  Entry* word = lexicon + id;
171  int key = encode_char(word->key[0]);
172  dat[key].base = DATRIE_UNUSED;
173  dat[key].parent = 0;
174  if (word->length == 1) {
175  dat[key].word = (id);
176  }
177 }
178 
179 void insert_words(int delta, int parent, size_t word_len) {
180  int i;
181  for (i = 0; (uint32_t)i < words_set_count; i++) {
182  int j = words_set[i];
183  int k = encode_char(lexicon[j].key[word_len]) + delta;
184  dat[k].parent = parent;
185  if (lexicon[j].length == word_len + 1) {
186  dat[k].word = (j);
187  }
188  }
189 }
190 
191 void insert(int id) {
192  static int space_min = 0;
193  Entry* word = &lexicon[id];
194  for (;;) {
195  int p, i;
196  match_word(dat, word->key, &p, &i, 0);
197  if ((size_t)p == word->length) {
198  return;
199  }
200  get_words_with_prefix(word->key, p);
201  int delta;
202  delta = space_min - words_set_char[0];
203  for (; delta < DATRIE_SIZE; delta++) {
204  if (words_space_available(delta)) {
205  break;
206  }
207  }
208  if (delta == DATRIE_SIZE) {
209  fprintf(stderr, "DATRIE_SIZE Not Enough!\n");
210  exit(1);
211  }
212  insert_words(delta, i, p);
213  dat[i].base = delta;
214  while (!unused(space_min)) {
215  space_min++;
216  }
217  }
218 }
219 
220 void make(void) {
221  size_t i;
222  for (i = 1; i < DATRIE_SIZE; i++) {
223  dat[i].parent = dat[i].base = DATRIE_UNUSED;
224  dat[i].word = -1;
225  }
226  dat[0].parent = dat[0].base = 0;
227  for (i = 0; i < lexicon_count; i++) {
228  insert_first_char(i);
229  }
230  for (i = 0; i < lexicon_count; i++) {
231  insert(i);
232  }
233 }
234 
235 int cmp(const void* a, const void* b) {
236  return ucs4cmp(((const TextEntry*)a)->key, ((const TextEntry*)b)->key);
237 }
238 
239 void init(const char* filename) {
240  DictGroup* DictGroup = dict_group_new(NULL);
241  if (dict_group_load(DictGroup, filename,
242  OPENCC_DICTIONARY_TYPE_TEXT) == -1) {
243  dictionary_perror("Dictionary loading error");
244  fprintf(stderr, _("\n"));
245  exit(1);
246  }
247  Dict* dict_abs = dict_group_get_dict(DictGroup, 0);
248  if (dict_abs == (Dict*)-1) {
249  dictionary_perror("Dictionary loading error");
250  fprintf(stderr, _("\n"));
251  exit(1);
252  }
253  static TextEntry tlexicon[DATRIE_WORD_MAX_COUNT];
254  /* TODO add datrie support */
255  Dict* dictionary = dict_abs->dict;
256  lexicon_count = dict_text_get_lexicon(dictionary, tlexicon);
257  qsort(tlexicon, lexicon_count, sizeof(tlexicon[0]), cmp);
258  size_t i;
259  size_t lexicon_cursor = 0;
260  for (i = 0; i < lexicon_count; i++) {
261  lexicon[i].key = tlexicon[i].key;
262  lexicon[i].length = ucs4len(lexicon[i].key);
263  size_t j;
264  for (j = 0; tlexicon[i].value[j] != NULL; j++) {}
265  lexicon[i].value_count = j;
266  lexicon_index_length += lexicon[i].value_count + 1;
267  lexicon[i].value = (Value*)malloc(lexicon[i].value_count * sizeof(Value));
268  for (j = 0; j < lexicon[i].value_count; j++) {
269  lexicon[i].value[j].cursor = lexicon_cursor;
270  lexicon[i].value[j].pointer = tlexicon[i].value[j];
271  lexicon_cursor += ucs4len(tlexicon[i].value[j]) + 1;
272  }
273  }
274  lexicon_cursor_end = lexicon_cursor;
275 }
276 
277 void output(const char* file_name) {
278  FILE* fp = fopen(file_name, "wb");
279  if (!fp) {
280  fprintf(stderr, _("Can not write file: %s\n"), file_name);
281  exit(1);
282  }
283  uint32_t i, item_count;
284  for (i = DATRIE_SIZE - 1; i > 0; i--) {
285  if (dat[i].parent != DATRIE_UNUSED) {
286  break;
287  }
288  }
289  item_count = i + 1;
290  fwrite("OPENCCDATRIE", sizeof(char), strlen("OPENCCDATRIE"), fp);
291  /* 詞彙表長度 */
292  fwrite(&lexicon_cursor_end, sizeof(uint32_t), 1, fp);
293  for (i = 0; i < lexicon_count; i++) {
294  size_t j;
295  for (j = 0; j < lexicon[i].value_count; j++) {
296  fwrite(lexicon[i].value[j].pointer, sizeof(ucs4_t),
297  ucs4len(lexicon[i].value[j].pointer) + 1, fp);
298  }
299  }
300  /* 詞彙索引表長度 */
301  fwrite(&lexicon_index_length, sizeof(uint32_t), 1, fp);
302  for (i = 0; i < lexicon_count; i++) {
303  size_t j;
304  for (j = 0; j < lexicon[i].value_count; j++) {
305  fwrite(&lexicon[i].value[j].cursor, sizeof(uint32_t), 1, fp);
306  }
307  uint32_t dem = (uint32_t)-1;
308  fwrite(&dem, sizeof(uint32_t), 1, fp); /* 分隔符 */
309  }
310  fwrite(&lexicon_count, sizeof(uint32_t), 1, fp);
311  fwrite(&item_count, sizeof(uint32_t), 1, fp);
312  fwrite(dat, sizeof(dat[0]), item_count, fp);
313  fclose(fp);
314 }
315 
316 #ifdef DEBUG_WRITE_TEXT
317 void write_text_file() {
318  FILE* fp;
319  int i;
320  fp = fopen("datrie.txt", "w");
321  fprintf(fp, "%d\n", lexicon_count);
322  for (i = 0; i < lexicon_count; i++) {
323  char* buff = ucs4_to_utf8(lexicon[i].value, (size_t)-1);
324  fprintf(fp, "%s\n", buff);
325  free(buff);
326  }
327  for (i = 0; i < DATRIE_SIZE; i++) {
328  if (dat[i].parent != DATRIE_UNUSED) {
329  fprintf(fp, "%d %d %d %d\n", i, dat[i].base, dat[i].parent, dat[i].word);
330  }
331  }
332  fclose(fp);
333 }
334 
335 #endif /* ifdef DEBUG_WRITE_TEXT */
336 
337 void show_version() {
338  printf(_("\nOpen Chinese Convert (OpenCC) Dictionary Tool\nVersion %s\n\n"),
339  VERSION);
340 }
341 
/*
 * Print the version banner followed by the command-line synopsis and a
 * description of the -i and -o options to standard output.
 */
void show_usage() {
  show_version();

  /* Synopsis. */
  printf(_("Usage:\n"));
  printf(_(" opencc_dict -i input_file -o output_file\n\n"));

  /* Option descriptions. */
  printf(_(" -i input_file\n"));
  printf(_(" Read data from input_file.\n"));
  printf(_(" -o output_file\n"));
  printf(_(" Write converted data to output_file.\n"));

  /* Two trailing blank lines. */
  printf(_("\n"));
  printf(_("\n"));
}
353 
354 int main(int argc, char** argv) {
355  static int oc;
356  static char input_file[BUFFER_SIZE], output_file[BUFFER_SIZE];
357  int input_file_specified = 0, output_file_specified = 0;
358 
359 #ifdef ENABLE_GETTEXT
360  setlocale(LC_ALL, "");
361  bindtextdomain(PACKAGE_NAME, LOCALEDIR);
362 #endif /* ifdef ENABLE_GETTEXT */
363  while ((oc = getopt(argc, argv, "vh-:i:o:")) != -1) {
364  switch (oc) {
365  case 'v':
366  show_version();
367  return 0;
368  case 'h':
369  case '?':
370  show_usage();
371  return 0;
372  case '-':
373  if (strcmp(optarg, "version") == 0) {
374  show_version();
375  } else if (strcmp(optarg, "help") == 0) {
376  show_usage();
377  } else {
378  show_usage();
379  }
380  return 0;
381  case 'i':
382  strcpy(input_file, optarg);
383  input_file_specified = 1;
384  break;
385  case 'o':
386  strcpy(output_file, optarg);
387  output_file_specified = 1;
388  break;
389  }
390  }
391  if (!input_file_specified) {
392  fprintf(stderr, _("Please specify input file using -i.\n"));
393  show_usage();
394  return 1;
395  }
396  if (!output_file_specified) {
397  fprintf(stderr, _("Please specify output file using -o.\n"));
398  show_usage();
399  return 1;
400  }
401  init(input_file);
402  make();
403  output(output_file);
404 #ifdef DEBUG_WRITE_TEXT
405  write_text_file();
406 #endif /* ifdef DEBUG_WRITE_TEXT */
407  return 0;
408 }