19 #include "../encoding.h"
/* Starting capacity for the growable arrays in this file (the lexicon and
 * each entry's value list); both are doubled when they fill up. */
#define INITIAL_DICTIONARY_SIZE 1024
/* Size in bytes of the line buffer handed to fgets() when loading a file. */
#define ENTRY_BUFF_SIZE 128
/*
 * ENTRY_BUFF_SIZE expressed in size_t-sized words.
 * The replacement list is parenthesized so the macro keeps its value when it
 * appears inside a larger expression: without the parentheses,
 * `x / ENTRY_WBUFF_SIZE` would expand to `x / 128 / sizeof(size_t)`.
 */
#define ENTRY_WBUFF_SIZE (ENTRY_BUFF_SIZE / sizeof(size_t))
/*
 * Comparison callback with the qsort()/bsearch() signature: two untyped
 * element pointers in, an int out.
 * NOTE(review): the body is not visible here. Presumably it orders TextEntry
 * records by their key -- confirm against the full file.
 */
26 int qsort_entry_cmp(
const void* a,
const void* b) {
/*
 * Parse one line of dictionary text from `buff` into `entry_i`.
 * The visible code scans a key that ends at a tab (or NUL), then a series of
 * values that each end at a space, NUL, '\n' or '\r'. Every piece is
 * converted with utf8_to_ucs4() and copied into freshly malloc'd storage
 * owned by the entry; the value list is finished with a NULL slot.
 * The caller in dict_text_new() treats a return value of -1 as failure.
 * NOTE(review): declarations, return statements and closing braces of this
 * function are not visible here; comments below describe only what is shown.
 */
30 int parse_entry(
const char* buff,
TextEntry* entry_i) {
/* Advance pbuff to the first tab or NUL; the bytes before it form the key. */
35 for (pbuff = buff; *pbuff !=
'\t' && *pbuff !=
'\0'; ++pbuff) {}
/* Key length measured in bytes of the UTF-8 input. */
40 length = pbuff - buff;
43 ucs4_buff = utf8_to_ucs4(buff, length);
/* The conversion result is compared against the sentinel (ucs4_t*)-1. */
45 if (ucs4_buff == (ucs4_t*)-1) {
/* Room for length + 1 UCS-4 cells; presumably the byte length bounds the
 * number of code points and the extra cell holds the terminator -- confirm.
 * NOTE(review): the malloc result is passed to ucs4cpy() unchecked. */
48 entry_i->key = (ucs4_t*)malloc((length + 1) *
sizeof(ucs4_t));
49 ucs4cpy(entry_i->key, ucs4_buff);
/* The value list starts with INITIAL_DICTIONARY_SIZE pointer slots. */
53 size_t value_i, value_count = INITIAL_DICTIONARY_SIZE;
54 entry_i->value = (ucs4_t**)malloc(value_count *
sizeof(ucs4_t*));
/* One iteration per value, until the end of the string or the newline. */
56 for (value_i = 0; *pbuff !=
'\0' && *pbuff !=
'\n'; ++value_i) {
57 if (value_i >= value_count) {
/* Double the capacity when the slot array is full.
 * NOTE(review): the realloc result overwrites entry_i->value directly and
 * no NULL check is visible, so a failed realloc would lose the old block. */
58 value_count += value_count;
59 entry_i->value = (ucs4_t**)realloc(
61 value_count *
sizeof(ucs4_t*)
/* Scan condition for a single value: stop at space, NUL, LF or CR. */
66 *pbuff !=
' ' && *pbuff !=
'\0' && *pbuff !=
'\n' && *pbuff !=
'\r';
68 length = pbuff - buff;
69 ucs4_buff = utf8_to_ucs4(buff, length);
71 if (ucs4_buff == (ucs4_t*)-1) {
/* Conversion failed: release the values stored so far, newest first.
 * NOTE(review): `i >= 0` only terminates if `i` is a signed type; its
 * declaration is not visible here -- confirm. */
75 for (i = value_i - 1; i >= 0; --i) {
76 free(entry_i->value[i]);
/* Store a private UCS-4 copy of this value in the next slot. */
83 entry_i->value[value_i] = (ucs4_t*)malloc((length + 1) *
sizeof(ucs4_t));
84 ucs4cpy(entry_i->value[value_i], ucs4_buff);
/* Resize the slot array to value_count slots; value_count is presumably
 * recomputed on a line that is not visible here -- confirm. */
88 entry_i->value = (ucs4_t**)realloc(
90 value_count *
sizeof(ucs4_t*)
/* NULL-terminate the value list; dict_text_delete() walks it until NULL. */
92 entry_i->value[value_i] = NULL;
/*
 * Build a text dictionary from the file named `filename`.
 * Visible steps: allocate a lexicon of INITIAL_DICTIONARY_SIZE entries, read
 * the file with fgets() one buffer at a time, parse each buffer with
 * parse_entry() (doubling the lexicon when it is full), remember the longest
 * key length, shrink the lexicon to the number of entries read, allocate
 * word_buff with max_length + 1 cells, sort the lexicon with qsort(), and
 * return the dictionary cast to Dict*.
 * NOTE(review): several lines (allocation of text_dictionary, error returns,
 * fclose, closing braces) are not visible here.
 */
97 Dict* dict_text_new(
const char* filename) {
/* entry_count doubles as the lexicon capacity while loading. */
101 text_dictionary->entry_count = INITIAL_DICTIONARY_SIZE;
102 text_dictionary->max_length = 0;
103 text_dictionary->lexicon = (
TextEntry*)malloc(
104 sizeof(
TextEntry) * text_dictionary->entry_count);
105 text_dictionary->word_buff = NULL;
/* NOTE(review): static storage, so every call shares this one line buffer. */
107 static char buff[ENTRY_BUFF_SIZE];
109 FILE* fp = fopen(filename,
"r");
/* Presumably the fopen-failure path (its condition is not visible).
 * NOTE(review): no visible line changes entry_count before this call, so it
 * still equals INITIAL_DICTIONARY_SIZE, and dict_text_delete() frees
 * lexicon[i].key for every i < entry_count. The lexicon comes from malloc and
 * is not initialized in the visible code -- confirm this cannot free
 * indeterminate pointers. */
112 dict_text_delete((
Dict*)text_dictionary);
/* fgets() stores at most ENTRY_BUFF_SIZE - 1 characters per call.
 * NOTE(review): a longer line would arrive in several pieces, each handed to
 * parse_entry() as if it were a separate entry -- confirm inputs are short. */
119 while (fgets(buff, ENTRY_BUFF_SIZE, fp)) {
120 if (i >= text_dictionary->entry_count) {
/* Lexicon full: double its capacity.
 * NOTE(review): the realloc result is assigned straight back with no visible
 * NULL check. */
121 text_dictionary->entry_count += text_dictionary->entry_count;
122 text_dictionary->lexicon = (
TextEntry*)realloc(
123 text_dictionary->lexicon,
124 sizeof(
TextEntry) * text_dictionary->entry_count
128 if (parse_entry(buff, text_dictionary->lexicon + i) == -1) {
/* Parse failure: limit entry_count to the i entries already filled so that
 * dict_text_delete() releases only those. */
129 text_dictionary->entry_count = i;
130 dict_text_delete((
Dict*)text_dictionary);
/* Track the length of the longest key seen so far. */
134 size_t length = ucs4len(text_dictionary->lexicon[i].key);
136 if (length > text_dictionary->max_length) {
137 text_dictionary->max_length = length;
/* Loading finished: record the real entry count and trim the lexicon to it. */
145 text_dictionary->entry_count = i;
146 text_dictionary->lexicon = (
TextEntry*)realloc(
147 text_dictionary->lexicon,
148 sizeof(
TextEntry) * text_dictionary->entry_count
/* Scratch buffer able to hold the longest key plus one extra cell. */
150 text_dictionary->word_buff = (ucs4_t*)
151 malloc(
sizeof(ucs4_t) *
152 (text_dictionary->max_length + 1));
/* Sort the entries; the comparator argument is on a line not visible here. */
154 qsort(text_dictionary->lexicon,
155 text_dictionary->entry_count,
156 sizeof(text_dictionary->lexicon[0]),
160 return (
Dict*)text_dictionary;
/*
 * Release a dictionary and everything it owns.
 * For each of the entry_count lexicon entries the key is freed, the value
 * list is walked up to its NULL terminator, and the value array is freed;
 * afterwards the lexicon, word_buff and the dictionary object are freed.
 * NOTE(review): the cast from `dict` to text_dictionary and the body of the
 * inner loop are not visible here.
 */
163 void dict_text_delete(
Dict* dict) {
168 for (i = 0; i < text_dictionary->entry_count; ++i) {
169 free(text_dictionary->lexicon[i].key);
/* Walk the value pointers until the NULL slot; presumably each *j is freed
 * on a line not visible here -- confirm. */
173 for (j = text_dictionary->lexicon[i].value; *j; ++j) {
176 free(text_dictionary->lexicon[i].value);
179 free(text_dictionary->lexicon);
/* word_buff may still be NULL here (dict_text_new sets it to NULL first);
 * free(NULL) is a no-op. */
180 free(text_dictionary->word_buff);
181 free(text_dictionary);
/*
 * Find the longest leading part of `word` that is a key in the dictionary.
 * Visible logic: copy up to `len` characters of the word into word_buff, then
 * cut the candidate one character shorter per iteration (len, len-1, ..., 1)
 * and look each candidate up in the lexicon; on a hit the entry's value list
 * is returned. `match_length` is optional (checked against NULL before use).
 * NOTE(review): the `word`/`maxlen` parameter lines, the head of the search
 * call, the writes through match_length and the failure return are not
 * visible here.
 */
184 const ucs4_t*
const* dict_text_match_longest(
Dict* dict,
187 size_t* match_length) {
/* Empty dictionary is handled separately (the branch body is not visible). */
190 if (text_dictionary->entry_count == 0) {
/* NOTE(review): any condition guarding this assignment is not visible. */
195 maxlen = ucs4len(word);
/* Start from the longest key length; word_buff was allocated with
 * max_length + 1 cells in dict_text_new(), so index `len` is in bounds as
 * long as the lines not visible here never increase len. */
197 size_t len = text_dictionary->max_length;
203 ucs4ncpy(text_dictionary->word_buff, word, len);
204 text_dictionary->word_buff[len] = L
'\0';
/* The search key shares word_buff, so shortening the buffer shortens the key. */
207 buff.key = text_dictionary->word_buff;
209 for (; len > 0; len--) {
/* Terminate the candidate at the current length. */
210 text_dictionary->word_buff[len] = L
'\0';
/* Arguments of the lookup over the sorted lexicon; presumably a bsearch()
 * call whose first and last lines are not visible here -- confirm. */
213 text_dictionary->lexicon,
214 text_dictionary->entry_count,
215 sizeof(text_dictionary->lexicon[0]),
220 if (match_length != NULL) {
/* Hit: hand back the matched entry's NULL-terminated value list. */
223 return (
const ucs4_t*
const*)brs->value;
/* Presumably the no-match path after the loop -- confirm. */
227 if (match_length != NULL) {
/*
 * Collect the lengths of all leading parts of `word` that are dictionary keys.
 * Visible logic: same shrinking-candidate scan as dict_text_match_longest(),
 * but each matching length is appended to match_length[] instead of stopping
 * at the first hit. Because len counts down, lengths are stored longest first.
 * NOTE(review): the `word` parameter line, the head of the search call, the
 * hit test and the return statement are not visible here. No bound on the
 * capacity of match_length[] is visible either -- confirm callers size it
 * for at least max_length entries.
 */
233 size_t dict_text_get_all_match_lengths(
Dict* dict,
235 size_t* match_length) {
/* Empty dictionary is handled separately (the branch body is not visible). */
240 if (text_dictionary->entry_count == 0) {
244 size_t length = ucs4len(word);
/* Start from the longest key length stored in the dictionary. */
245 size_t len = text_dictionary->max_length;
251 ucs4ncpy(text_dictionary->word_buff, word, len);
252 text_dictionary->word_buff[len] = L
'\0';
/* The search key shares word_buff, so shortening the buffer shortens the key. */
255 buff.key = text_dictionary->word_buff;
257 for (; len > 0; len--) {
/* Terminate the candidate at the current length. */
258 text_dictionary->word_buff[len] = L
'\0';
/* Arguments of the lookup over the sorted lexicon; presumably a bsearch()
 * call whose first and last lines are not visible here -- confirm. */
261 text_dictionary->lexicon,
262 text_dictionary->entry_count,
263 sizeof(text_dictionary->lexicon[0]),
/* Record this length and advance the result counter. */
268 match_length[rscnt++] = len;
/*
 * Copy the dictionary's entries into the caller-supplied array `lexicon`.
 * The copy is shallow: each output element receives the same key and value
 * pointers the dictionary holds, not duplicates of the strings.
 * Writes entry_count elements and returns entry_count.
 * NOTE(review): the cast from `dict` to text_dictionary and the closing
 * braces are not visible here.
 */
275 size_t dict_text_get_lexicon(
Dict* dict,
TextEntry* lexicon) {
280 for (i = 0; i < text_dictionary->entry_count; i++) {
281 lexicon[i].key = text_dictionary->lexicon[i].key;
282 lexicon[i].value = text_dictionary->lexicon[i].value;
285 return text_dictionary->entry_count;