27 # include <sys/mman.h>
38 uint32_t dat_item_count;
40 uint32_t lexicon_count;
42 ucs4_t*** lexicon_set;
45 memory_type dic_memory_type;
/*
 * Load the dictionary file into a heap buffer.
 *
 * Tags the dictionary as MEMORY_TYPE_ALLOCATE, allocates dic_size bytes,
 * rewinds fd to the start of the file and reads dic_size bytes into the
 * buffer. load_dict() uses this as the fallback when load_mmap() returns -1.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * the caller compares the result against -1.
 */
48 static int load_allocate(
DatrieDict* datrie_dictionary,
int fd) {
49 datrie_dictionary->dic_memory_type = MEMORY_TYPE_ALLOCATE;
50 datrie_dictionary->dic_memory = malloc(datrie_dictionary->dic_size);
/* Allocation failure path. */
52 if (datrie_dictionary->dic_memory == NULL) {
/* Rewind to offset 0 so the read starts at the beginning of the file. */
56 lseek(fd, 0, SEEK_SET);
/*
 * NOTE(review): only a return of -1 is treated as failure here; a short
 * read (fewer than dic_size bytes) is not detected by this condition.
 */
58 if (read(fd, datrie_dictionary->dic_memory,
59 datrie_dictionary->dic_size) == -1) {
/*
 * Map the dictionary file into memory with mmap().
 *
 * Tags the dictionary as MEMORY_TYPE_MMAP and maps dic_size bytes, letting
 * the kernel choose the address (first argument NULL). The remaining mmap
 * arguments are not visible in this excerpt.
 *
 * NOTE(review): the return statements are not visible in this excerpt;
 * the caller compares the result against -1.
 */
66 static int load_mmap(
DatrieDict* datrie_dictionary,
int fd) {
68 datrie_dictionary->dic_memory_type = MEMORY_TYPE_MMAP;
69 datrie_dictionary->dic_memory = mmap(NULL,
70 datrie_dictionary->dic_size,
/* mmap reports failure with MAP_FAILED, not NULL. */
76 if (datrie_dictionary->dic_memory == MAP_FAILED) {
/* Normalise to NULL so later `dic_memory != NULL` checks stay valid. */
78 datrie_dictionary->dic_memory = NULL;
/*
 * Load a compiled datrie dictionary from fp and index its lexicon.
 *
 * Steps visible in this excerpt:
 *   1. Measure the file (fseek to end + ftell) and store it in dic_size.
 *   2. Bring the file into memory: try load_mmap(), fall back to
 *      load_allocate() if that returns -1.
 *   3. Verify the "OPENCCDATRIE" magic at the start of the image.
 *   4. Walk the image with a running byte `offset`, in this order:
 *        magic string                    (strlen("OPENCCDATRIE") bytes)
 *        uint32_t lexicon_length
 *        ucs4_t   lexicon[lexicon_length]
 *        uint32_t lexicon_index_length
 *        uint32_t lexicon_index[lexicon_index_length]
 *        uint32_t lexicon_count
 *        uint32_t dat_item_count
 *        DatrieItem dat[]                (rest of the image)
 *      lexicon and dat point directly into the loaded image (no copy).
 *   5. Build lexicon_set: one NULL-terminated array of ucs4_t* per
 *      lexicon entry.
 *
 * NOTE(review): several lines of the function are not visible here
 * (including how fd is obtained from fp and all return statements).
 */
89 static int load_dict(
DatrieDict* datrie_dictionary, FILE* fp) {
/*
 * Determine the file size.
 * NOTE(review): no check of the ftell() result is visible in this excerpt.
 */
92 fseek(fp, 0, SEEK_END);
93 datrie_dictionary->dic_size = ftell(fp);
/* Prefer mmap; fall back to a heap copy if mapping fails. */
96 if (load_mmap(datrie_dictionary, fd) == -1) {
97 if (load_allocate(datrie_dictionary, fd) == -1) {
/* The image must begin with the magic string. */
102 size_t header_len = strlen(
"OPENCCDATRIE");
104 if (strncmp((
const char*)datrie_dictionary->dic_memory,
"OPENCCDATRIE",
/* Skip past the magic string. */
111 offset += header_len *
sizeof(char);
/*
 * Lexicon section: element count, then the ucs4_t data itself.
 * NOTE(review): the lengths read from the file are used to advance
 * `offset` without any bounds check against dic_size in the visible lines.
 */
114 uint32_t lexicon_length =
115 *((uint32_t*)(datrie_dictionary->dic_memory + offset));
116 offset +=
sizeof(uint32_t);
118 datrie_dictionary->lexicon = (ucs4_t*)(datrie_dictionary->dic_memory + offset);
119 offset += lexicon_length *
sizeof(ucs4_t);
/* Lexicon index section: element count, then uint32_t offsets into lexicon. */
122 uint32_t lexicon_index_length =
123 *((uint32_t*)(datrie_dictionary->dic_memory + offset));
124 offset +=
sizeof(uint32_t);
126 uint32_t* lexicon_index = (uint32_t*)(datrie_dictionary->dic_memory + offset);
127 offset += lexicon_index_length *
sizeof(uint32_t);
/* Number of lexicon entries (size of lexicon_set). */
129 datrie_dictionary->lexicon_count =
130 *((uint32_t*)(datrie_dictionary->dic_memory + offset));
131 offset +=
sizeof(uint32_t);
/* Number of trie nodes; used by the matchers to bounds-check child indices. */
133 datrie_dictionary->dat_item_count =
134 *((uint32_t*)(datrie_dictionary->dic_memory + offset));
135 offset +=
sizeof(uint32_t);
/* The trie node array follows immediately. */
137 datrie_dictionary->dat =
138 (
DatrieItem*)(datrie_dictionary->dic_memory + offset);
/*
 * Build the per-entry pointer tables.
 * NOTE(review): no NULL check on this malloc (or the per-entry one below)
 * is visible in this excerpt.
 */
141 datrie_dictionary->lexicon_set = (ucs4_t***)malloc(
142 datrie_dictionary->lexicon_count *
sizeof(ucs4_t * *));
145 for (i = 0; i < datrie_dictionary->lexicon_count; i++) {
/*
 * Scan lexicon_index from `last` for a (uint32_t)-1 entry.
 * NOTE(review): presumably this sentinel ends the current entry's group
 * and fixes `count`; the lines that do so are not visible here.
 */
148 for (j = last; j < lexicon_index_length; j++) {
149 if (lexicon_index[j] == (uint32_t)-1) {
/* count pointers plus one slot for the terminating NULL. */
155 datrie_dictionary->lexicon_set[i] =
156 (ucs4_t**)malloc((count + 1) *
sizeof(ucs4_t*));
/* Each pointer addresses a position inside the in-image lexicon array. */
158 for (j = 0; j < count; j++) {
159 datrie_dictionary->lexicon_set[i][j] =
160 datrie_dictionary->lexicon + lexicon_index[last + j];
162 datrie_dictionary->lexicon_set[i][count] = NULL;
/*
 * Release everything load_dict() set up.
 *
 * When dic_memory is non-NULL: frees each per-entry pointer table in
 * lexicon_set, then lexicon_set itself, then releases the dictionary image
 * according to how it was loaded (munmap for MEMORY_TYPE_MMAP, free for
 * MEMORY_TYPE_ALLOCATE). On the mmap path the munmap() result is returned
 * directly.
 */
169 static int unload_dict(
DatrieDict* datrie_dictionary) {
170 if (datrie_dictionary->dic_memory != NULL) {
/* Free the pointer tables; the strings they point to live in the image. */
173 for (i = 0; i < datrie_dictionary->lexicon_count; i++) {
174 free(datrie_dictionary->lexicon_set[i]);
176 free(datrie_dictionary->lexicon_set);
178 if (MEMORY_TYPE_MMAP == datrie_dictionary->dic_memory_type) {
180 return munmap(datrie_dictionary->dic_memory, datrie_dictionary->dic_size);
/*
 * NOTE(review): presumably reached only when mmap support is compiled
 * out; the surrounding preprocessor lines are not visible here.
 */
183 debug_should_not_be_here();
185 }
else if (MEMORY_TYPE_ALLOCATE == datrie_dictionary->dic_memory_type) {
186 free(datrie_dictionary->dic_memory);
/*
 * Create a datrie dictionary from the file at `filename`.
 *
 * Clears the dat and lexicon pointers, opens the file in binary read mode
 * and hands it to load_dict(). If load_dict() returns -1 the partially
 * built dictionary is destroyed with dict_datrie_delete(). On success the
 * dictionary is returned as an opaque Dict*.
 *
 * NOTE(review): the allocation of datrie_dictionary, the value returned on
 * failure and any fclose() are not visible in this excerpt.
 */
194 Dict* dict_datrie_new(
const char* filename) {
198 datrie_dictionary->dat = NULL;
199 datrie_dictionary->lexicon = NULL;
/*
 * NOTE(review): no NULL check on the fopen() result is visible before fp
 * is passed to load_dict() -- confirm a missing file is handled.
 */
201 FILE* fp = fopen(filename,
"rb");
203 if (load_dict(datrie_dictionary, fp) == -1) {
204 dict_datrie_delete((
Dict*)datrie_dictionary);
210 return (
Dict*)datrie_dictionary;
/*
 * Destroy a dictionary created by dict_datrie_new().
 *
 * Calls unload_dict() to release the loaded image and lexicon tables, then
 * frees the DatrieDict structure itself. The structure is freed on both
 * the unload-failure branch and the normal path.
 *
 * NOTE(review): the return values are not visible in this excerpt.
 */
213 int dict_datrie_delete(
Dict* dict) {
/* Even if unloading fails, the structure itself is still freed. */
217 if (unload_dict(datrie_dictionary) == -1) {
218 free(datrie_dictionary);
222 free(datrie_dictionary);
/*
 * Convert a UCS-4 character to an int code. The trie walkers in this file
 * add the result to a node's `base` to obtain the candidate child index.
 * NOTE(review): the body is not visible in this excerpt.
 */
226 int encode_char(ucs4_t ch) {
/*
 * Walk the double-array trie along `word`.
 *
 * Starts at node 0 and consumes one character per iteration. The walk
 * continues while the word has characters left, the optional `limit` has
 * not been reached (limit == 0 means no limit), and the current node's
 * base is not DATRIE_UNUSED.
 *
 * NOTE(review): the full parameter list and the code that reports the
 * final position/node to the caller are not visible in this excerpt;
 * callers pass (dict, word, &pos, &item, limit).
 */
230 void datrie_match(
const DatrieDict* datrie_dictionary,
237 for (i = 0, p = 0; word[p] && (limit == 0 || (size_t)p < limit) &&
238 datrie_dictionary->dat[i].base != DATRIE_UNUSED; p++) {
/* Candidate child index = base of current node + character code. */
239 int k = encode_char(word[p]);
240 int j = datrie_dictionary->dat[i].base + k;
/*
 * The transition is invalid when j falls outside the node array or the
 * slot at j is not owned by the current node i.
 */
242 if ((j < 0) || ((
size_t)j >= datrie_dictionary->dat_item_count) ||
243 (datrie_dictionary->dat[j].parent != i)) {
/*
 * Find the longest prefix of `word` that is a dictionary entry.
 *
 * Runs datrie_match() with `maxlen` as the limit. While the node reached
 * carries no entry (word == -1) and more than one character was consumed,
 * it retries with the limit reduced to pos - 1. If nothing was consumed or
 * the final node still has no entry, the no-match branch is taken;
 * otherwise the entry's NULL-terminated value list from lexicon_set is
 * returned. `match_length` is optional (checked against NULL before use).
 *
 * NOTE(review): the values written to *match_length and the no-match
 * return value are not visible in this excerpt.
 */
258 const ucs4_t*
const* dict_datrie_match_longest(
Dict* dict,
261 size_t* match_length) {
267 datrie_match(datrie_dictionary, word, &pos, &item, maxlen);
/* Back off one character at a time until a node with an entry is found. */
269 while (datrie_dictionary->dat[item].word == -1 && pos > 1) {
270 datrie_match(datrie_dictionary, word, &pos, &item, pos - 1);
/* No match at all. */
273 if ((pos == 0) || (datrie_dictionary->dat[item].word == -1)) {
274 if (match_length != NULL) {
/* Match found. */
280 if (match_length != NULL) {
/* dat[item].word indexes the per-entry table built by load_dict(). */
284 return (
const ucs4_t*
const*)
285 datrie_dictionary->lexicon_set[datrie_dictionary->dat[item].word];
/*
 * Collect the lengths of all prefixes of `word` that are dictionary
 * entries.
 *
 * Walks the trie from node 0 with the same transition rule as
 * datrie_match(). Whenever the current node carries an entry
 * (word != -1), the prefix length p + 1 is appended to match_length.
 *
 * NOTE(review): the line that advances i to the child node and the return
 * statement are not visible in this excerpt. No bound on the capacity of
 * match_length is visible either -- confirm the caller's buffer is large
 * enough.
 */
288 size_t dict_datrie_get_all_match_lengths(
Dict* dict,
290 size_t* match_length) {
298 for (i = 0, p = 0; word[p] && datrie_dictionary->dat[i].base != DATRIE_UNUSED;
/* Candidate child index = base of current node + character code. */
300 int k = encode_char(word[p]);
301 int j = datrie_dictionary->dat[i].base + k;
/* Invalid transition: index out of range or slot not owned by node i. */
303 if ((j < 0) || ((
size_t)j >= datrie_dictionary->dat_item_count) ||
304 (datrie_dictionary->dat[j].parent != i)) {
/* Record the prefix length (1-based) for every node that ends an entry. */
309 if (datrie_dictionary->dat[i].word != -1) {
310 match_length[rscnt++] = p + 1;