Rev 11124 | Rev 18181 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed
Rev 11124 | Rev 12248 | ||
---|---|---|---|
Line 21... | Line 21... | ||
/* Maximum number of entries the suffix table can hold. */
#define suflim 256
/* Upper bound, in bytes, on the size of a suffix dictionary file. */
#define sufbuflim 102400

/* Number of valid entries currently stored in suf[]. */
int suffixcnt;
/* One suffix rule per entry. */
struct {
    /* Suffix text. It is compared back to front against a word:
     * original[0] is matched with the last character of the word. */
    unsigned char *original;
    /* Length of original, in bytes. */
    int olen;
    /* Text substituted for the matched suffix. */
    unsigned char *replace;
}
suf[suflim];
/* Buffer holding the dictionary file contents; the original/replace
 * pointers of suf[] point into it. */
char *sufbuf;
/* sufwordlen: length of the word currently being examined.
 * sufminlen: length of the shortest suffix in the table. */
int sufwordlen, sufminlen;
33 | 33 | ||
/* Suffix translation, to be used within translator. */
35 | 35 | ||
36 | int sufcomp(int t, const unsigned char *s2) |
36 | int sufcomp(int t, const unsigned char *s2) |
37 | { |
37 | { |
38 |
|
38 | int k; |
39 | 39 | ||
40 |
|
40 | for(k=0;k<suf[t].olen && k<sufwordlen |
41 |
|
41 | && suf[t].original[k]==s2[sufwordlen-k-1];k++); |
42 |
|
42 | if(k>=suf[t].olen) { |
43 |
|
43 | if(sufwordlen>k) return -1; else return 0; |
44 |
|
44 | } |
45 |
|
45 | else return suf[t].original[k]-s2[sufwordlen-k-1]; |
46 | } |
46 | } |
47 | 47 | ||
48 | /* searches a list. Returns index if found, -1 if nomatch. |
48 | /* searches a list. Returns index if found, -1 if nomatch. |
49 | * This routine is faster than naive one by one comparisons, |
49 | * This routine is faster than naive one by one comparisons, |
50 | * and is especially suited for large lists. |
50 | * and is especially suited for large lists. |
51 | */ |
51 | */ |
52 | int suffix_list(void *list, int items, size_t item_size, const unsigned char *str) |
52 | int suffix_list(void *list, int items, size_t item_size, const unsigned char *str) |
53 | { |
53 | { |
54 |
|
54 | int i1,i2,j,k,t,v; |
55 |
|
55 | unsigned char c,d; |
56 | 56 | ||
57 |
|
57 | if(items<=0) return -1; |
58 |
|
58 | k=sufcomp(0,str); |
59 |
|
59 | if(k==0) return 0; |
60 |
|
60 | if(k>0) return -1; |
61 |
|
61 | j=items-1; k=sufcomp(j,str); |
- | 62 | if(k==0) return j; |
|
- | 63 | if(k>0) for(i1=0,i2=j;i2>i1+1;) { |
|
- | 64 | j=i1+(i2-i1)/2; k=sufcomp(j,str); |
|
62 | if(k==0) return j; |
65 | if(k==0) return j; |
63 | if(k>0) for(i1=0,i2=j;i2>i1+1;) { |
- | |
64 | j=i1+(i2-i1)/2; k=sufcomp(j,str); |
- | |
65 | if(k==0) return j; |
- | |
66 |
|
66 | if(k>0) {i2=j; continue;} |
67 |
|
67 | if(k<0) {i1=j; continue;} |
68 |
|
68 | } |
69 |
|
69 | if(k>0 && j>0) j--; |
70 |
|
70 | backcheck: |
71 |
|
71 | v=j;for(t=0;t<suf[j].olen && t<sufwordlen |
72 |
|
72 | && suf[j].original[t]==str[sufwordlen-t-1];t++); |
73 |
|
73 | if(t<sufminlen) return -1; |
74 |
|
74 | if(t>=suf[j].olen) return j; |
75 |
|
75 | for(j--,c=str[sufwordlen-1],d=str[sufwordlen-t]; |
76 |
|
76 | j>=0 && suf[j].original[0]==c && suf[j].olen>t |
77 |
|
77 | && suf[j].original[t-1]==d;j--); |
78 |
|
78 | if(j>=0 && suf[j].original[0]==c && |
79 | strncmp((char*)suf[j].original,(char*)suf[v].original,suf[j].olen)==0) |
79 | strncmp((char*)suf[j].original,(char*)suf[v].original,suf[j].olen)==0) |
80 |
|
80 | return j; |
81 |
|
81 | else goto backcheck; |
82 | } |
82 | } |
83 | 83 | ||
84 | /* Prepare dictionary. */ |
84 | /* Prepare dictionary. */ |
85 | void suffix_dic(char *sdicname) |
85 | void suffix_dic(char *sdicname) |
86 | { |
86 | { |
87 |
|
87 | int i,l; |
88 |
|
88 | FILE *suff; |
89 |
|
89 | char *p1, *p2, *pp; |
90 |
|
90 | long int flen; |
91 | 91 | ||
92 |
|
92 | suffixcnt=0; sufminlen=100000; |
93 |
|
93 | suff=fopen(sdicname,"r"); if(suff==NULL) return; |
94 |
|
94 | fseek(suff,0,SEEK_END);flen=ftell(suff); fseek(suff,0,SEEK_SET); |
95 |
|
95 | if(flen>sufbuflim) return; |
96 |
|
96 | sufbuf=xmalloc(flen+16);flen=fread(sufbuf,1,flen,suff); |
97 |
|
97 | fclose(suff); |
98 |
|
98 | if(flen>0 && flen<sufbuflim) sufbuf[flen]=0; |
99 |
|
99 | else return; |
100 |
|
100 | for(i=0,p1=sufbuf;p1!=NULL && *p1!=0 && i<suflim;p1=p2) { |
101 |
|
101 | p2=strchr(p1+1,'\n'); if(p2>p1) *p2++=0; |
102 |
|
102 | pp=strchr(p1,':'); if(pp==NULL) continue; |
103 |
|
103 | *pp++=0; |
104 |
|
104 | strip_trailing_spaces2(p1); strip_trailing_spaces2(pp); |
105 |
|
105 | singlespace2(p1); |
106 |
|
106 | p1=find_word_start(p1); pp=find_word_start(pp); |
107 |
|
107 | if(*p1==0) continue; |
108 |
|
108 | suf[i].original=(unsigned char*)p1; suf[i].olen=l=strlen(p1); |
109 |
|
109 | if(l<sufminlen) sufminlen=l; |
110 |
|
110 | suf[i].replace=(unsigned char*)pp; i++; |
111 |
|
111 | } |
112 |
|
112 | suffixcnt=i; |
113 | } |
113 | } |
114 | 114 | ||
115 | /* Suffix translation. */ |
115 | /* Suffix translation. */ |
116 | /* FIXME : ne rien faire si le résultat est de longueur inferieur à 2 |
116 | /* FIXME : ne rien faire si le résultat est de longueur inferieur à 2 |
117 | * car ensuite cela sera neglige. |
117 | * car ensuite cela sera neglige. |
118 | */ |
118 | */ |
119 | 119 | ||
120 | void suffix_translate(char *p) |
120 | void suffix_translate(char *p) |
121 | { |
121 | { |
122 |
|
122 | char *p1, *p2; |
123 |
|
123 | int t; |
124 | 124 | ||
125 |
|
125 | for(p1=find_word_start(p); |
126 |
|
126 | p1!=NULL && p1-p<MAX_LINELEN && *p1!=0; |
127 |
|
127 | p1=p2) { |
128 |
|
128 | if(!isalpha(*p1)) {p2=p1+1; continue;} |
129 |
|
129 | for(p2=p1;isalpha(*p2);p2++); |
130 |
|
130 | if(*p2!=0 && strchr(" ,.?!'\"\n`:;()[]{}<>",*p2)==NULL) continue; |
131 |
|
131 | sufwordlen=p2-p1; |
132 |
|
132 | t=suffix_list(suf,suffixcnt,sizeof(suf[0]),(unsigned char*)p1); |
133 |
|
133 | if(t<0) continue; |
134 |
|
134 | string_modify3(p,p2-suf[t].olen,p2,(char*)suf[t].replace); |
135 |
|
135 | p2=p2-suf[t].olen+strlen((char*)suf[t].replace); |
136 |
|
136 | } |
137 |
|
137 | p[MAX_LINELEN]=0; |
138 | } |
138 | } |
139 | 139 | ||
140 | void suffix(char *p, char *sdicname) |
140 | void suffix(char *p, char *sdicname) |
141 | { |
141 | { |
142 |
|
142 | suffix_dic(sdicname); if(suffixcnt>0) suffix_translate(p); |
143 | } |
143 | } |
144 | 144 |