Open Chinese Convert  0.4.3
A project for conversion between Traditional and Simplified Chinese
 All Data Structures Files Functions Variables Groups Pages
datrie.c
1 /*
2  * Open Chinese Convert
3  *
4  * Copyright 2010-2013 BYVoid <byvoid@byvoid.com>
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 #include "datrie.h"
20 #include <fcntl.h>
21 #include <unistd.h>
22 
23 #ifdef __WIN32
24 
25 /* Todo: Win32 mmap*/
26 #else /* ifdef __WIN32 */
27 # include <sys/mman.h>
28 # define MMAP_ENABLED
29 #endif /* ifdef __WIN32 */
30 
31 typedef enum {
32  MEMORY_TYPE_MMAP,
33  MEMORY_TYPE_ALLOCATE
34 } memory_type;
35 
36 typedef struct {
37  const DatrieItem* dat;
38  uint32_t dat_item_count;
39  ucs4_t* lexicon;
40  uint32_t lexicon_count;
41 
42  ucs4_t*** lexicon_set;
43  void* dic_memory;
44  size_t dic_size;
45  memory_type dic_memory_type;
46 } DatrieDict;
47 
48 static int load_allocate(DatrieDict* datrie_dictionary, int fd) {
49  datrie_dictionary->dic_memory_type = MEMORY_TYPE_ALLOCATE;
50  datrie_dictionary->dic_memory = malloc(datrie_dictionary->dic_size);
51 
52  if (datrie_dictionary->dic_memory == NULL) {
53  /* 內存申請失敗 */
54  return -1;
55  }
56  lseek(fd, 0, SEEK_SET);
57 
58  if (read(fd, datrie_dictionary->dic_memory,
59  datrie_dictionary->dic_size) == -1) {
60  /* 讀取失敗 */
61  return -1;
62  }
63  return 0;
64 }
65 
66 static int load_mmap(DatrieDict* datrie_dictionary, int fd) {
67 #ifdef MMAP_ENABLED
68  datrie_dictionary->dic_memory_type = MEMORY_TYPE_MMAP;
69  datrie_dictionary->dic_memory = mmap(NULL,
70  datrie_dictionary->dic_size,
71  PROT_READ,
72  MAP_PRIVATE,
73  fd,
74  0);
75 
76  if (datrie_dictionary->dic_memory == MAP_FAILED) {
77  /* 內存映射創建失敗 */
78  datrie_dictionary->dic_memory = NULL;
79  return -1;
80  }
81  return 0;
82 
83 #else /* ifdef MMAP_ENABLED */
84  return -1;
85 
86 #endif /* ifdef MMAP_ENABLED */
87 }
88 
89 static int load_dict(DatrieDict* datrie_dictionary, FILE* fp) {
90  int fd = fileno(fp);
91 
92  fseek(fp, 0, SEEK_END);
93  datrie_dictionary->dic_size = ftell(fp);
94 
95  /* 首先嘗試mmap,如果失敗嘗試申請內存 */
96  if (load_mmap(datrie_dictionary, fd) == -1) {
97  if (load_allocate(datrie_dictionary, fd) == -1) {
98  return -1;
99  }
100  }
101 
102  size_t header_len = strlen("OPENCCDATRIE");
103 
104  if (strncmp((const char*)datrie_dictionary->dic_memory, "OPENCCDATRIE",
105  header_len) != 0) {
106  return -1;
107  }
108 
109  size_t offset = 0;
110 
111  offset += header_len * sizeof(char);
112 
113  /* 詞彙表 */
114  uint32_t lexicon_length =
115  *((uint32_t*)(datrie_dictionary->dic_memory + offset));
116  offset += sizeof(uint32_t);
117 
118  datrie_dictionary->lexicon = (ucs4_t*)(datrie_dictionary->dic_memory + offset);
119  offset += lexicon_length * sizeof(ucs4_t);
120 
121  /* 詞彙索引表 */
122  uint32_t lexicon_index_length =
123  *((uint32_t*)(datrie_dictionary->dic_memory + offset));
124  offset += sizeof(uint32_t);
125 
126  uint32_t* lexicon_index = (uint32_t*)(datrie_dictionary->dic_memory + offset);
127  offset += lexicon_index_length * sizeof(uint32_t);
128 
129  datrie_dictionary->lexicon_count =
130  *((uint32_t*)(datrie_dictionary->dic_memory + offset));
131  offset += sizeof(uint32_t);
132 
133  datrie_dictionary->dat_item_count =
134  *((uint32_t*)(datrie_dictionary->dic_memory + offset));
135  offset += sizeof(uint32_t);
136 
137  datrie_dictionary->dat =
138  (DatrieItem*)(datrie_dictionary->dic_memory + offset);
139 
140  /* 構造索引表 */
141  datrie_dictionary->lexicon_set = (ucs4_t***)malloc(
142  datrie_dictionary->lexicon_count * sizeof(ucs4_t * *));
143  size_t i, last = 0;
144 
145  for (i = 0; i < datrie_dictionary->lexicon_count; i++) {
146  size_t count, j;
147 
148  for (j = last; j < lexicon_index_length; j++) {
149  if (lexicon_index[j] == (uint32_t)-1) {
150  break;
151  }
152  }
153  count = j - last;
154 
155  datrie_dictionary->lexicon_set[i] =
156  (ucs4_t**)malloc((count + 1) * sizeof(ucs4_t*));
157 
158  for (j = 0; j < count; j++) {
159  datrie_dictionary->lexicon_set[i][j] =
160  datrie_dictionary->lexicon + lexicon_index[last + j];
161  }
162  datrie_dictionary->lexicon_set[i][count] = NULL;
163  last += j + 1;
164  }
165 
166  return 0;
167 }
168 
169 static int unload_dict(DatrieDict* datrie_dictionary) {
170  if (datrie_dictionary->dic_memory != NULL) {
171  size_t i;
172 
173  for (i = 0; i < datrie_dictionary->lexicon_count; i++) {
174  free(datrie_dictionary->lexicon_set[i]);
175  }
176  free(datrie_dictionary->lexicon_set);
177 
178  if (MEMORY_TYPE_MMAP == datrie_dictionary->dic_memory_type) {
179  #ifdef MMAP_ENABLED
180  return munmap(datrie_dictionary->dic_memory, datrie_dictionary->dic_size);
181 
182  #else /* ifdef MMAP_ENABLED */
183  debug_should_not_be_here();
184  #endif /* ifdef MMAP_ENABLED */
185  } else if (MEMORY_TYPE_ALLOCATE == datrie_dictionary->dic_memory_type) {
186  free(datrie_dictionary->dic_memory);
187  } else {
188  return -1;
189  }
190  }
191  return 0;
192 }
193 
194 Dict* dict_datrie_new(const char* filename) {
195  DatrieDict* datrie_dictionary = (DatrieDict*)malloc(
196  sizeof(DatrieDict));
197 
198  datrie_dictionary->dat = NULL;
199  datrie_dictionary->lexicon = NULL;
200 
201  FILE* fp = fopen(filename, "rb");
202 
203  if (load_dict(datrie_dictionary, fp) == -1) {
204  dict_datrie_delete((Dict*)datrie_dictionary);
205  return (Dict*)-1;
206  }
207 
208  fclose(fp);
209 
210  return (Dict*)datrie_dictionary;
211 }
212 
213 int dict_datrie_delete(Dict* dict) {
214  DatrieDict* datrie_dictionary =
215  (DatrieDict*)dict;
216 
217  if (unload_dict(datrie_dictionary) == -1) {
218  free(datrie_dictionary);
219  return -1;
220  }
221 
222  free(datrie_dictionary);
223  return 0;
224 }
225 
226 int encode_char(ucs4_t ch) {
227  return (int)ch;
228 }
229 
230 void datrie_match(const DatrieDict* datrie_dictionary,
231  const ucs4_t* word,
232  size_t* match_pos,
233  size_t* id,
234  size_t limit) {
235  int i, p;
236 
237  for (i = 0, p = 0; word[p] && (limit == 0 || (size_t)p < limit) &&
238  datrie_dictionary->dat[i].base != DATRIE_UNUSED; p++) {
239  int k = encode_char(word[p]);
240  int j = datrie_dictionary->dat[i].base + k;
241 
242  if ((j < 0) || ((size_t)j >= datrie_dictionary->dat_item_count) ||
243  (datrie_dictionary->dat[j].parent != i)) {
244  break;
245  }
246  i = j;
247  }
248 
249  if (match_pos) {
250  *match_pos = p;
251  }
252 
253  if (id) {
254  *id = i;
255  }
256 }
257 
258 const ucs4_t* const* dict_datrie_match_longest(Dict* dict,
259  const ucs4_t* word,
260  size_t maxlen,
261  size_t* match_length) {
262  DatrieDict* datrie_dictionary =
263  (DatrieDict*)dict;
264 
265  size_t pos, item;
266 
267  datrie_match(datrie_dictionary, word, &pos, &item, maxlen);
268 
269  while (datrie_dictionary->dat[item].word == -1 && pos > 1) {
270  datrie_match(datrie_dictionary, word, &pos, &item, pos - 1);
271  }
272 
273  if ((pos == 0) || (datrie_dictionary->dat[item].word == -1)) {
274  if (match_length != NULL) {
275  *match_length = 0;
276  }
277  return NULL;
278  }
279 
280  if (match_length != NULL) {
281  *match_length = pos;
282  }
283 
284  return (const ucs4_t* const*)
285  datrie_dictionary->lexicon_set[datrie_dictionary->dat[item].word];
286 }
287 
288 size_t dict_datrie_get_all_match_lengths(Dict* dict,
289  const ucs4_t* word,
290  size_t* match_length) {
291  DatrieDict* datrie_dictionary =
292  (DatrieDict*)dict;
293 
294  size_t rscnt = 0;
295 
296  int i, p;
297 
298  for (i = 0, p = 0; word[p] && datrie_dictionary->dat[i].base != DATRIE_UNUSED;
299  p++) {
300  int k = encode_char(word[p]);
301  int j = datrie_dictionary->dat[i].base + k;
302 
303  if ((j < 0) || ((size_t)j >= datrie_dictionary->dat_item_count) ||
304  (datrie_dictionary->dat[j].parent != i)) {
305  break;
306  }
307  i = j;
308 
309  if (datrie_dictionary->dat[i].word != -1) {
310  match_length[rscnt++] = p + 1;
311  }
312  }
313 
314  return rscnt;
315 }