Open Chinese Convert  0.4.3
A project for conversion between Traditional and Simplified Chinese
 All Data Structures Files Functions Variables Groups Pages
text.c
1 /*
2  * Open Chinese Convert
3  *
4  * Copyright 2010-2013 BYVoid <byvoid@byvoid.com>
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 #include "../encoding.h"
20 #include "text.h"
21 
/* Initial capacity (in elements) of the growable arrays built while loading:
 * both the lexicon entry array and each entry's value-pointer array. */
#define INITIAL_DICTIONARY_SIZE 1024
/* Size in bytes of the buffer that holds one line of the dictionary file. */
#define ENTRY_BUFF_SIZE 128
/* ENTRY_BUFF_SIZE expressed in size_t-sized units. The expansion is fully
 * parenthesized so it stays a single operand inside larger expressions. */
#define ENTRY_WBUFF_SIZE (ENTRY_BUFF_SIZE / sizeof(size_t))
25 
26 int qsort_entry_cmp(const void* a, const void* b) {
27  return ucs4cmp(((TextEntry*)a)->key, ((TextEntry*)b)->key);
28 }
29 
30 int parse_entry(const char* buff, TextEntry* entry_i) {
31  size_t length;
32  const char* pbuff;
33 
34  /* 解析鍵 */
35  for (pbuff = buff; *pbuff != '\t' && *pbuff != '\0'; ++pbuff) {}
36 
37  if (*pbuff == '\0') {
38  return -1;
39  }
40  length = pbuff - buff;
41 
42  ucs4_t* ucs4_buff;
43  ucs4_buff = utf8_to_ucs4(buff, length);
44 
45  if (ucs4_buff == (ucs4_t*)-1) {
46  return -1;
47  }
48  entry_i->key = (ucs4_t*)malloc((length + 1) * sizeof(ucs4_t));
49  ucs4cpy(entry_i->key, ucs4_buff);
50  free(ucs4_buff);
51 
52  /* 解析值 */
53  size_t value_i, value_count = INITIAL_DICTIONARY_SIZE;
54  entry_i->value = (ucs4_t**)malloc(value_count * sizeof(ucs4_t*));
55 
56  for (value_i = 0; *pbuff != '\0' && *pbuff != '\n'; ++value_i) {
57  if (value_i >= value_count) {
58  value_count += value_count;
59  entry_i->value = (ucs4_t**)realloc(
60  entry_i->value,
61  value_count * sizeof(ucs4_t*)
62  );
63  }
64 
65  for (buff = ++pbuff;
66  *pbuff != ' ' && *pbuff != '\0' && *pbuff != '\n' && *pbuff != '\r';
67  ++pbuff) {}
68  length = pbuff - buff;
69  ucs4_buff = utf8_to_ucs4(buff, length);
70 
71  if (ucs4_buff == (ucs4_t*)-1) {
72  /* 發生錯誤 回退內存申請 */
73  ssize_t i;
74 
75  for (i = value_i - 1; i >= 0; --i) {
76  free(entry_i->value[i]);
77  }
78  free(entry_i->value);
79  free(entry_i->key);
80  return -1;
81  }
82 
83  entry_i->value[value_i] = (ucs4_t*)malloc((length + 1) * sizeof(ucs4_t));
84  ucs4cpy(entry_i->value[value_i], ucs4_buff);
85  free(ucs4_buff);
86  }
87 
88  entry_i->value = (ucs4_t**)realloc(
89  entry_i->value,
90  value_count * sizeof(ucs4_t*)
91  );
92  entry_i->value[value_i] = NULL;
93 
94  return 0;
95 }
96 
97 Dict* dict_text_new(const char* filename) {
98  TextDict* text_dictionary;
99 
100  text_dictionary = (TextDict*)malloc(sizeof(TextDict));
101  text_dictionary->entry_count = INITIAL_DICTIONARY_SIZE;
102  text_dictionary->max_length = 0;
103  text_dictionary->lexicon = (TextEntry*)malloc(
104  sizeof(TextEntry) * text_dictionary->entry_count);
105  text_dictionary->word_buff = NULL;
106 
107  static char buff[ENTRY_BUFF_SIZE];
108 
109  FILE* fp = fopen(filename, "r");
110 
111  if (fp == NULL) {
112  dict_text_delete((Dict*)text_dictionary);
113  return (Dict*)-1;
114  }
115  skip_utf8_bom(fp);
116 
117  size_t i = 0;
118 
119  while (fgets(buff, ENTRY_BUFF_SIZE, fp)) {
120  if (i >= text_dictionary->entry_count) {
121  text_dictionary->entry_count += text_dictionary->entry_count;
122  text_dictionary->lexicon = (TextEntry*)realloc(
123  text_dictionary->lexicon,
124  sizeof(TextEntry) * text_dictionary->entry_count
125  );
126  }
127 
128  if (parse_entry(buff, text_dictionary->lexicon + i) == -1) {
129  text_dictionary->entry_count = i;
130  dict_text_delete((Dict*)text_dictionary);
131  return (Dict*)-1;
132  }
133 
134  size_t length = ucs4len(text_dictionary->lexicon[i].key);
135 
136  if (length > text_dictionary->max_length) {
137  text_dictionary->max_length = length;
138  }
139 
140  i++;
141  }
142 
143  fclose(fp);
144 
145  text_dictionary->entry_count = i;
146  text_dictionary->lexicon = (TextEntry*)realloc(
147  text_dictionary->lexicon,
148  sizeof(TextEntry) * text_dictionary->entry_count
149  );
150  text_dictionary->word_buff = (ucs4_t*)
151  malloc(sizeof(ucs4_t) *
152  (text_dictionary->max_length + 1));
153 
154  qsort(text_dictionary->lexicon,
155  text_dictionary->entry_count,
156  sizeof(text_dictionary->lexicon[0]),
157  qsort_entry_cmp
158  );
159 
160  return (Dict*)text_dictionary;
161 }
162 
163 void dict_text_delete(Dict* dict) {
164  TextDict* text_dictionary = (TextDict*)dict;
165 
166  size_t i;
167 
168  for (i = 0; i < text_dictionary->entry_count; ++i) {
169  free(text_dictionary->lexicon[i].key);
170 
171  ucs4_t** j;
172 
173  for (j = text_dictionary->lexicon[i].value; *j; ++j) {
174  free(*j);
175  }
176  free(text_dictionary->lexicon[i].value);
177  }
178 
179  free(text_dictionary->lexicon);
180  free(text_dictionary->word_buff);
181  free(text_dictionary);
182 }
183 
184 const ucs4_t* const* dict_text_match_longest(Dict* dict,
185  const ucs4_t* word,
186  size_t maxlen,
187  size_t* match_length) {
188  TextDict* text_dictionary = (TextDict*)dict;
189 
190  if (text_dictionary->entry_count == 0) {
191  return NULL;
192  }
193 
194  if (maxlen == 0) {
195  maxlen = ucs4len(word);
196  }
197  size_t len = text_dictionary->max_length;
198 
199  if (maxlen < len) {
200  len = maxlen;
201  }
202 
203  ucs4ncpy(text_dictionary->word_buff, word, len);
204  text_dictionary->word_buff[len] = L'\0';
205 
206  TextEntry buff;
207  buff.key = text_dictionary->word_buff;
208 
209  for (; len > 0; len--) {
210  text_dictionary->word_buff[len] = L'\0';
211  TextEntry* brs = (TextEntry*)bsearch(
212  &buff,
213  text_dictionary->lexicon,
214  text_dictionary->entry_count,
215  sizeof(text_dictionary->lexicon[0]),
216  qsort_entry_cmp
217  );
218 
219  if (brs != NULL) {
220  if (match_length != NULL) {
221  *match_length = len;
222  }
223  return (const ucs4_t* const*)brs->value;
224  }
225  }
226 
227  if (match_length != NULL) {
228  *match_length = 0;
229  }
230  return NULL;
231 }
232 
233 size_t dict_text_get_all_match_lengths(Dict* dict,
234  const ucs4_t* word,
235  size_t* match_length) {
236  TextDict* text_dictionary = (TextDict*)dict;
237 
238  size_t rscnt = 0;
239 
240  if (text_dictionary->entry_count == 0) {
241  return rscnt;
242  }
243 
244  size_t length = ucs4len(word);
245  size_t len = text_dictionary->max_length;
246 
247  if (length < len) {
248  len = length;
249  }
250 
251  ucs4ncpy(text_dictionary->word_buff, word, len);
252  text_dictionary->word_buff[len] = L'\0';
253 
254  TextEntry buff;
255  buff.key = text_dictionary->word_buff;
256 
257  for (; len > 0; len--) {
258  text_dictionary->word_buff[len] = L'\0';
259  TextEntry* brs = (TextEntry*)bsearch(
260  &buff,
261  text_dictionary->lexicon,
262  text_dictionary->entry_count,
263  sizeof(text_dictionary->lexicon[0]),
264  qsort_entry_cmp
265  );
266 
267  if (brs != NULL) {
268  match_length[rscnt++] = len;
269  }
270  }
271 
272  return rscnt;
273 }
274 
275 size_t dict_text_get_lexicon(Dict* dict, TextEntry* lexicon) {
276  TextDict* text_dictionary = (TextDict*)dict;
277 
278  size_t i;
279 
280  for (i = 0; i < text_dictionary->entry_count; i++) {
281  lexicon[i].key = text_dictionary->lexicon[i].key;
282  lexicon[i].value = text_dictionary->lexicon[i].value;
283  }
284 
285  return text_dictionary->entry_count;
286 }