Rev 3718 | Rev 8100 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed
Rev 3718 | Rev 7676 | ||
---|---|---|---|
Line 13... | Line 13... | ||
13 | * You should have received a copy of the GNU General Public License |
13 | * You should have received a copy of the GNU General Public License |
14 | * along with this program; if not, write to the Free Software |
14 | * along with this program; if not, write to the Free Software |
15 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
15 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
16 | */ |
16 | */ |
17 | 17 | ||
18 |
|
18 | /* Sort dictionary */ |
19 | 19 | ||
20 | /*************** Customization: change values hereafter ****************/ |
20 | /*************** Customization: change values hereafter ****************/ |
21 | 21 | ||
22 |
|
22 | /* limit of dictionary entries */ |
23 | #define entrylim 512*1024 |
23 | #define entrylim 512*1024 |
24 |
|
24 | /* limit of dictionary length */ |
25 | #define diclim |
25 | #define diclim 32*1024*1024 |
26 |
|
26 | /* separation character */ |
27 | char sepchar=':', grpchar=0; |
27 | char sepchar=':', grpchar=0; |
28 | 28 | ||
29 | /***************** Nothing should need change hereafter *****************/ |
29 | /***************** Nothing should need change hereafter *****************/ |
30 | 30 | ||
31 | #include "../wims.h" |
31 | #include "../wims.h" |
Line 50... | Line 50... | ||
50 | p=malloc(n); |
50 | p=malloc(n); |
51 | if(p==NULL) exit(1); |
51 | if(p==NULL) exit(1); |
52 | return p; |
52 | return p; |
53 | } |
53 | } |
54 | 54 | ||
55 |
|
55 | /* Points to the end of the word */ |
56 | char *find_word_end(char *p) |
56 | char *find_word_end(char *p) |
57 | { |
57 | { |
58 | int i; |
58 | int i; |
59 | for(i=0;!isspace(*p) && *p!=0 && i<MAX_LINELEN; p++,i++); |
59 | for(i=0;!isspace(*p) && *p!=0 && i<MAX_LINELEN; p++,i++); |
60 | return p; |
60 | return p; |
61 | } |
61 | } |
62 | 62 | ||
63 |
|
63 | /* Strips leading spaces */ |
64 | char *find_word_start(char *p) |
64 | char *find_word_start(char *p) |
65 | { |
65 | { |
66 | int i; |
66 | int i; |
67 | for(i=0; isspace(*p) && i<MAX_LINELEN; p++,i++); |
67 | for(i=0; isspace(*p) && i<MAX_LINELEN; p++,i++); |
68 | return p; |
68 | return p; |
69 | } |
69 | } |
70 | 70 | ||
71 |
|
71 | /* strip trailing spaces; return string end. */ |
72 | char *strip_trailing_spaces(char *p) |
72 | char *strip_trailing_spaces(char *p) |
73 | { |
73 | { |
74 | char *pp; |
74 | char *pp; |
75 | if(*p==0) return p; |
75 | if(*p==0) return p; |
76 | for(pp=p+strlen(p)-1; pp>=p && isspace(*pp); *(pp--)=0); |
76 | for(pp=p+strlen(p)-1; pp>=p && isspace(*pp); *(pp--)=0); |
Line 88... | Line 88... | ||
88 | void sortdic(void) |
88 | void sortdic(void) |
89 | { |
89 | { |
90 | qsort(entry,entrycount,sizeof(entry[0]),compare); |
90 | qsort(entry,entrycount,sizeof(entry[0]),compare); |
91 | } |
91 | } |
92 | 92 | ||
93 |
|
93 | /* modify a string. Bufferlen must be ast least MAX_LINELEN */ |
94 | void string_modify(char *start, char *bad_beg, char *bad_end, char *good,...) |
94 | void string_modify(char *start, char *bad_beg, char *bad_end, char *good,...) |
95 | { |
95 | { |
96 | char buf[MAX_LINELEN+1]; |
96 | char buf[MAX_LINELEN+1]; |
97 | va_list vp; |
97 | va_list vp; |
98 | 98 | ||
99 | va_start(vp,good); |
99 | va_start(vp,good); |
100 | vsnprintf(buf,sizeof(buf),good,vp); va_end(vp); |
100 | vsnprintf(buf,sizeof(buf),good,vp); va_end(vp); |
101 | if(strlen(start)-(bad_end-bad_beg)+strlen(buf)>=MAX_LINELEN) |
101 | if(strlen(start)-(bad_end-bad_beg)+strlen(buf)>=MAX_LINELEN) |
102 | return; |
102 | return; /* this is an error situation. */ |
103 | strcat(buf,bad_end); |
103 | strcat(buf,bad_end); |
104 | ovlstrcpy(bad_beg,buf); |
104 | ovlstrcpy(bad_beg,buf); |
105 | } |
105 | } |
106 | 106 | ||
107 |
|
107 | /* change all spaces into ' ', and collapse multiple occurences */ |
108 | void singlespace(char *p) |
108 | void singlespace(char *p) |
109 | { |
109 | { |
110 | char *pp, *p2; |
110 | char *pp, *p2; |
111 | for(pp=p;*pp;pp++) { |
111 | for(pp=p;*pp;pp++) { |
112 |
|
112 | if(!isspace(*pp)) continue; |
113 |
|
113 | if(leaveline) { |
114 |
|
114 | if(*pp==13) ovlstrcpy(pp,pp+1); |
115 |
|
115 | if(*pp=='\n') { |
116 |
|
116 | pp++; |
117 |
|
117 | gopt: for(p2=pp; isspace(*p2) && *p2!='\n'; p2++); |
118 |
|
118 | if(p2>pp) ovlstrcpy(pp,p2); pp--; |
119 |
|
119 | } |
120 |
|
120 | else { |
121 |
|
121 | pp++; if(!isspace(*pp) || *pp=='\n') continue; |
122 |
|
122 | goto gopt; |
123 |
|
123 | } |
124 |
|
124 | } |
125 |
|
125 | else { |
126 |
|
126 | if(*pp!=' ') *pp=' '; |
127 |
|
127 | pp++; if(!isspace(*pp)) continue; |
128 |
|
128 | for(p2=pp;isspace(*p2);p2++); |
129 |
|
129 | ovlstrcpy(pp,p2); pp--; |
130 |
|
130 | } |
131 | } |
131 | } |
132 | } |
132 | } |
133 | 133 | ||
134 |
|
134 | /* Prepare dictionary */ |
135 | void prepare_dic(void) |
135 | void prepare_dic(void) |
136 | { |
136 | { |
137 | int i; |
137 | int i; |
138 | FILE *dicf; |
138 | FILE *dicf; |
139 | char *p1, *p2, *pp; |
139 | char *p1, *p2, *pp; |
140 | long int flen; |
140 | long int flen; |
141 | 141 | ||
142 | entrycount=0; |
142 | entrycount=0; |
143 | dicf=fopen(dicname,"r"); if(dicf==NULL) return; |
143 | dicf=fopen(dicname,"r"); if(dicf==NULL) return; |
144 | fseek(dicf,0,SEEK_END);flen=ftell(dicf); fseek(dicf,0,SEEK_SET); |
144 | fseek(dicf,0,SEEK_END);flen=ftell(dicf); fseek(dicf,0,SEEK_SET); |
145 | if(flen>diclim) return; |
145 | if(flen>diclim) return; |
146 | dicbuf=xmalloc(2*flen+1024);flen=fread(dicbuf,1,flen,dicf); |
146 | dicbuf=xmalloc(2*flen+1024);flen=fread(dicbuf,1,flen,dicf); |
147 | fclose(dicf); |
147 | fclose(dicf); |
148 | if(flen>0 && flen<diclim) dicbuf[flen]=0; |
148 | if(flen>0 && flen<diclim) dicbuf[flen]=0; |
149 | else return; |
149 | else return; |
150 | for(i=0,p1=dicbuf;p1!=NULL && *p1!=0 && i<entrylim;p1=p2) { |
150 | for(i=0,p1=dicbuf;p1!=NULL && *p1!=0 && i<entrylim;p1=p2) { |
151 |
|
151 | p2=strchr(p1+1,'\n'); if(p2>p1) *p2++=0; |
152 |
|
152 | pp=strchr(p1,sepchar); if(pp==NULL) continue; |
153 |
|
153 | *pp++=0; |
154 |
|
154 | strip_trailing_spaces(p1); strip_trailing_spaces(pp); |
155 |
|
155 | singlespace(p1); |
156 |
|
156 | p1=find_word_start(p1); pp=find_word_start(pp); |
157 |
|
157 | if(*p1==0) continue; |
158 |
|
158 | entry[i].original=p1; entry[i].replace=pp; i++; |
159 | } |
159 | } |
160 | entrycount=i; |
160 | entrycount=i; |
161 | } |
161 | } |
162 | 162 | ||
163 | #include "suffix.c" |
163 | #include "suffix.c" |
164 | 164 | ||
165 | void output(void) |
165 | void output(void) |
166 | { |
166 | { |
167 | int i; |
167 | int i; |
168 | FILE *f; |
168 | FILE *f; |
169 | 169 | ||
170 | ocount=0; |
170 | ocount=0; |
171 | strcat(dicname,".sorted"); |
171 | strcat(dicname,".sorted"); |
172 | f=fopen(dicname,"w"); if(f==NULL) return; |
172 | f=fopen(dicname,"w"); if(f==NULL) return; |
173 | for(i=0;i<entrycount;i++) { |
173 | for(i=0;i<entrycount;i++) { |
174 |
|
174 | if(i>0 && strcmp(entry[i].original,entry[i-1].original)==0 |
175 |
|
175 | && strcmp(entry[i].replace,entry[i-1].replace)==0) |
176 |
|
176 | continue; |
177 |
|
177 | if(grpchar!=0) { |
178 |
|
178 | if(i>0 && strcmp(entry[i].original,entry[i-1].original)==0) |
179 |
|
179 | fprintf(f,"%c%s",grpchar, entry[i].replace); |
180 |
|
180 | else { |
181 |
|
181 | if(i>0) fprintf(f,"\n"); |
182 |
|
182 | fprintf(f,"%s%c%s",entry[i].original,sepchar,entry[i].replace); |
183 |
|
183 | ocount++; |
184 |
|
184 | } |
- | 185 | ||
185 | 186 | } |
|
186 | } |
- | |
187 |
|
187 | else { |
188 |
|
188 | fprintf(f,"%s%c%s\n",entry[i].original,sepchar,entry[i].replace); |
189 |
|
189 | ocount++; |
190 |
|
190 | } |
191 | } |
191 | } |
192 | if(grpchar!=0) fprintf(f,"\n"); |
192 | if(grpchar!=0) fprintf(f,"\n"); |
193 | fclose(f); |
193 | fclose(f); |
194 | } |
194 | } |
195 | 195 | ||
196 | int main(int argc, char *argv[]) |
196 | int main(int argc, char *argv[]) |
197 | { |
197 | { |
198 | char *ss, *gr; |
198 | char *ss, *gr; |
199 | if(argc<2) return -1; |
199 | if(argc<2) return -1; |
200 | 200 | ||
201 | ss=getenv("dicsort_separator"); |
201 | ss=getenv("dicsort_separator"); |
202 | if(ss!=NULL && *ss!=0) sepchar=*ss; |
202 | if(ss!=NULL && *ss!=0) sepchar=*ss; |
203 | gr=getenv("dicsort_grouping"); |
203 | gr=getenv("dicsort_grouping"); |
204 | if(gr!=NULL && *gr!=0) grpchar=*gr; |
204 | if(gr!=NULL && *gr!=0) grpchar=*gr; |
205 | snprintf(dicname,sizeof(dicname)-128,"%s",argv[1]); prepare_dic(); |
205 | snprintf(dicname,sizeof(dicname)-128,"%s",argv[1]); prepare_dic(); |
206 | if(argc>2) { |
206 | if(argc>2) { |
207 |
|
207 | snprintf(suffixname,sizeof(suffixname),"%s",argv[2]); |
208 |
|
208 | suffix_dic(suffixname); hassuffix=1; |
209 | } |
209 | } |
210 | else suffixname[0]=hassuffix=0; |
210 | else suffixname[0]=hassuffix=0; |
211 | sortdic(); output(); |
211 | sortdic(); output(); |
212 | printf("%s: sorted %d entries.\n",dicname, ocount); |
212 | printf("%s: sorted %d entries.\n",dicname, ocount); |
213 | return 0; |
213 | return 0; |