Rev 3718 | Rev 8100 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed
Rev 3718 | Rev 7676 | ||
---|---|---|---|
Line 13... | Line 13... | ||
13 | * You should have received a copy of the GNU General Public License |
13 | * You should have received a copy of the GNU General Public License |
14 | * along with this program; if not, write to the Free Software |
14 | * along with this program; if not, write to the Free Software |
15 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
15 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
16 | */ |
16 | */ |
17 | 17 | ||
18 |
|
18 | /* Versatile translation according to a dictionary */ |
19 | 19 | ||
20 | /*************** Customization: change values hereafter ****************/ |
20 | /*************** Customization: change values hereafter ****************/ |
21 | 21 | ||
22 |
|
/* limit of dictionary entries */
#define entrylim 32768
/* limit of dictionary length: the dictionary file size is compared
 * against this value. Parenthesized so that the macro keeps its value
 * when it is expanded inside a larger expression (e.g. x/diclim). */
#define diclim (1024*1024)
26 | 26 | ||
27 | /***************** Nothing should need change hereafter *****************/ |
27 | /***************** Nothing should need change hereafter *****************/ |
28 | 28 | ||
29 | #include "../Lib/basicstr.c" |
29 | #include "../Lib/basicstr.c" |
30 | 30 | ||
Line 53... | Line 53... | ||
53 | else k=strncmp((char*)entry[i1].original,s2,entry[i1].olen); |
53 | else k=strncmp((char*)entry[i1].original,s2,entry[i1].olen); |
54 | if(k==0 && (isalnum(*(s2+entry[i1].olen)) || (*(s2+entry[i1].olen)&128)!=0)) return -1; |
54 | if(k==0 && (isalnum(*(s2+entry[i1].olen)) || (*(s2+entry[i1].olen)&128)!=0)) return -1; |
55 | else return k; |
55 | else return k; |
56 | } |
56 | } |
57 | 57 | ||
58 |
|
58 | /* searches a list. Returns index if found, -1 if nomatch. |
59 |
|
59 | * Uses binary search, list must be sorted. */ |
60 | int search_list(struct entry *list, int items, size_t item_size, const char *str) |
60 | int search_list(struct entry *list, int items, size_t item_size, const char *str) |
61 | { |
61 | { |
62 | int i1,i2,j,k,t,t1; |
62 | int i1,i2,j,k,t,t1; |
63 | unsigned char c; |
63 | unsigned char c; |
64 | 64 | ||
Line 67... | Line 67... | ||
67 | k=list[0].original[0]-c; if(k==0) k=compare(0,str); |
67 | k=list[0].original[0]-c; if(k==0) k=compare(0,str); |
68 | if(k==0) goto more; if(k>0) return -1; |
68 | if(k==0) goto more; if(k>0) return -1; |
69 | j=items-1; k=list[j].original[0]-c; if(k==0) k=compare(j,str); |
69 | j=items-1; k=list[j].original[0]-c; if(k==0) k=compare(j,str); |
70 | if(k==0) return j; |
70 | if(k==0) return j; |
71 | if(k>0) for(i1=0,i2=j;i2>i1+1;) { |
71 | if(k>0) for(i1=0,i2=j;i2>i1+1;) { |
72 |
|
72 | j=i1+(i2-i1)/2; |
73 |
|
73 | k=list[j].original[0]-c; if(k==0) k=compare(j,str); |
74 |
|
74 | if(k==0) goto more; |
75 |
|
75 | if(k>0) {i2=j; continue;} |
76 |
|
76 | if(k<0) {i1=j; continue;} |
77 | } |
77 | } |
78 | if(k>0) {j--;k=compare(j,str);} |
78 | if(k>0) {j--;k=compare(j,str);} |
79 | more: |
79 | more: |
80 | if((t=list[j].earlier)<0) { |
80 | if((t=list[j].earlier)<0) { |
81 |
|
81 | if(k==0) return j; else return -1; |
82 | } |
82 | } |
83 | if(compare(t,str)!=0) return -1; |
83 | if(compare(t,str)!=0) return -1; |
84 | for(j=t1=t,k=0;j<items && list[j].earlier==t1 && (k=compare(j,str))<=0; j++) |
84 | for(j=t1=t,k=0;j<items && list[j].earlier==t1 && (k=compare(j,str))<=0; j++) |
85 | if(k==0) t=j; |
85 | if(k==0) t=j; |
86 | return t; |
86 | return t; |
87 | } |
87 | } |
88 | 88 | ||
89 |
|
89 | /* change all spaces into ' ', and collapse multiple occurences */ |
90 | void singlespace(char *p) |
90 | void singlespace(char *p) |
91 | { |
91 | { |
92 | char *pp, *p2; |
92 | char *pp, *p2; |
93 | for(pp=p;*pp;pp++) { |
93 | for(pp=p;*pp;pp++) { |
94 |
|
94 | if(!isspace(*pp)) continue; |
95 |
|
95 | if(leaveline) { |
96 |
|
96 | if(*pp==13) ovlstrcpy(pp,pp+1); |
97 |
|
97 | if(*pp=='\n') { |
98 |
|
98 | pp++; |
99 |
|
99 | gopt: for(p2=pp; isspace(*p2) && *p2!='\n'; p2++); |
100 |
|
100 | if(p2>pp) ovlstrcpy(pp,p2); pp--; |
101 |
|
101 | } |
102 |
|
102 | else { |
103 |
|
103 | pp++; if(!isspace(*pp) || *pp=='\n') continue; |
104 |
|
104 | goto gopt; |
105 |
|
105 | } |
106 |
|
106 | } |
107 |
|
107 | else { |
108 |
|
108 | if(*pp!=' ') *pp=' '; |
109 |
|
109 | if(!isspace(*(pp+1))) continue; |
110 |
|
110 | for(pp++,p2=pp;isspace(*p2);p2++); |
111 |
|
111 | ovlstrcpy(pp,p2); pp--; |
112 |
|
112 | } |
113 | } |
113 | } |
114 | } |
114 | } |
115 | 115 | ||
116 | #include "suffix.c" |
116 | #include "suffix.c" |
117 | 117 | ||
118 |
|
118 | /* Prepare dictionary */ |
119 | void prepare_dic(char *fname) |
119 | void prepare_dic(char *fname) |
120 | { |
120 | { |
121 | int i,l; |
121 | int i,l; |
122 | FILE *dicf; |
122 | FILE *dicf; |
123 | char *p1, *p2, *pp; |
123 | char *p1, *p2, *pp; |
124 | long int flen; |
124 | long int flen; |
125 | 125 | ||
126 | entrycount=0; |
126 | entrycount=0; |
127 | dicf=fopen(fname,"r"); if(dicf==NULL) return; |
127 | dicf=fopen(fname,"r"); if(dicf==NULL) return; |
128 | fseek(dicf,0,SEEK_END);flen=ftell(dicf); fseek(dicf,0,SEEK_SET); |
128 | fseek(dicf,0,SEEK_END);flen=ftell(dicf); fseek(dicf,0,SEEK_SET); |
129 | if(flen>diclim) return; |
129 | if(flen>diclim) return; |
130 | dicbuf=xmalloc(flen+16);flen=fread(dicbuf,1,flen,dicf); |
130 | dicbuf=xmalloc(flen+16);flen=fread(dicbuf,1,flen,dicf); |
131 | fclose(dicf); |
131 | fclose(dicf); |
132 | if(flen>0 && flen<diclim) dicbuf[flen]=0; |
132 | if(flen>0 && flen<diclim) dicbuf[flen]=0; |
133 | else return; |
133 | else return; |
134 | for(i=0,p1=dicbuf;p1!=NULL && *p1!=0 && i<entrylim;p1=p2) { |
134 | for(i=0,p1=dicbuf;p1!=NULL && *p1!=0 && i<entrylim;p1=p2) { |
135 |
|
135 | p2=strchr(p1+1,'\n'); if(p2>p1) *p2++=0; |
136 |
|
136 | pp=strchr(p1,':'); if(pp==NULL) continue; |
137 |
|
137 | *pp++=0; |
138 |
|
138 | strip_trailing_spaces(p1); strip_trailing_spaces(pp); |
139 |
|
139 | singlespace(p1); |
140 |
|
140 | p1=find_word_start(p1); pp=find_word_start(pp); |
141 |
|
141 | if(*p1==0) continue; |
142 |
|
142 | if(has_digits==0) { |
143 |
|
143 | char *p; |
144 |
|
144 | for(p=p1;*p!=0 && p<pp && !isdigit(*p);p++); |
145 |
|
145 | if(isdigit(*p)) has_digits=1; |
146 |
|
146 | } |
147 |
|
147 | entry[i].original=(unsigned char*)p1; |
148 |
|
148 | entry[i].replace=(unsigned char*)pp; |
149 |
|
149 | entry[i].olen=l=strlen(p1); entry[i].earlier=-1; |
150 |
|
150 | if(i>0) { |
151 |
|
151 | int l1,l2; |
152 |
|
152 | l1=entry[i-1].earlier; if(l1>=0) l2=entry[l1].olen; |
153 |
|
153 | else {l2=entry[i-1].olen;l1=i-1;} |
154 |
|
154 | if(l>l2 && isspace(p1[l2]) |
155 |
|
155 | && strncmp((char*)entry[l1].original,p1,l2)==0) |
156 |
|
156 | entry[i].earlier=entry[i-1].earlier=l1; |
157 |
|
157 | } |
158 |
|
158 | i++; |
159 | } |
159 | } |
160 | entrycount=i; |
160 | entrycount=i; |
161 | } |
161 | } |
162 | 162 | ||
163 |
|
163 | /* now make the translation. */ |
164 | void translate(char *p) |
164 | void translate(char *p) |
165 | { |
165 | { |
166 | char *p1, *p2, *pp; |
166 | char *p1, *p2, *pp; |
167 | int t; |
167 | int t; |
168 | 168 | ||
169 | if(entrycount<=0 && suffixcnt<=0) return; |
169 | if(entrycount<=0 && suffixcnt<=0) return; |
170 | snprintf(outbuf,sizeof(outbuf),"%s",p); |
170 | snprintf(outbuf,sizeof(outbuf),"%s",p); |
171 | for(p1=find_word_start(outbuf); |
171 | for(p1=find_word_start(outbuf); |
172 |
|
172 | p1!=NULL && p1-outbuf<MAX_LINELEN && *p1!=0; |
173 |
|
173 | p1=p2) { |
174 |
|
174 | p2=find_word_end(p1); |
175 |
|
175 | for(pp=p1;pp<p2 && |
176 |
|
176 | ((!has_digits && isalpha(*pp)) || |
177 |
|
177 | (has_digits && isalnum(*pp)) || (*pp&128)!=0 || |
178 |
|
178 | strchr("_",*pp)!=NULL);pp++); |
179 |
|
179 | p2=find_word_start(p2); |
180 |
|
180 | if(pp==p1 || |
181 |
|
181 | (has_digits==0 && isdigit(*pp)) || |
182 |
|
182 | (*pp!=0 && !isspace(*pp) && strchr(",.?!/;",*pp)==NULL)) continue; |
183 |
|
183 | t=search_list(entry,entrycount,sizeof(entry[0]),p1); |
184 |
|
184 | if(t<0) { |
185 |
|
185 | switch(unknown_type) { |
186 |
|
186 | case unk_leave: break; |
187 |
|
187 | case unk_delete: { |
188 |
|
188 | ovlstrcpy(p1,find_word_start(pp)); p2=p1; |
189 |
|
189 | break; |
190 |
|
190 | } |
191 |
|
191 | case unk_replace: { |
192 |
|
192 | string_modify(outbuf,p1,pp,unkbuf); |
193 |
|
193 | p2=find_word_start(p1+strlen(unkbuf)); |
194 |
|
194 | } |
195 |
|
195 | } |
196 |
|
196 | continue; |
197 |
|
197 | } |
198 |
|
198 | string_modify(outbuf,p1,p1+strlen((char*)entry[t].original), |
199 |
|
199 | (char*)entry[t].replace); |
200 |
|
200 | p2=find_word_start(p1+strlen((char*)entry[t].replace)); |
201 | } |
201 | } |
202 | snprintf(p,MAX_LINELEN,"%s",outbuf); |
202 | snprintf(p,MAX_LINELEN,"%s",outbuf); |
203 | } |
203 | } |
204 | 204 |