Rev 6564 | Rev 6806 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
10 | reyssat | 1 | /* Copyright (C) 1998-2003 XIAO, Gang of Universite de Nice - Sophia Antipolis |
2 | * |
||
3 | * This program is free software; you can redistribute it and/or modify |
||
4 | * it under the terms of the GNU General Public License as published by |
||
5 | * the Free Software Foundation; either version 2 of the License, or |
||
6 | * (at your option) any later version. |
||
7 | * |
||
8 | * This program is distributed in the hope that it will be useful, |
||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
11 | * GNU General Public License for more details. |
||
12 | * |
||
13 | * You should have received a copy of the GNU General Public License |
||
14 | * along with this program; if not, write to the Free Software |
||
15 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
||
16 | */ |
||
17 | |||
18 | /* This is an internal program, |
||
19 | * used to index modules for search engine. */ |
||
20 | |||
21 | #include "../wims.h" |
||
3718 | reyssat | 22 | #include "../Lib/basicstr.c" |
10 | reyssat | 23 | |
24 | #define MAX_LANGS MAX_LANGUAGES |
||
25 | #define MAX_MODULES 65536 |
||
26 | char *moduledir= "public_html/modules"; |
||
27 | char *sheetdir= "public_html/bases/sheet"; |
||
28 | char *dicdir= "public_html/bases"; |
||
29 | char *outdir= "public_html/bases/site2"; |
||
30 | char *maindic= "sys/words"; |
||
31 | char *groupdic= "sys/wgrp/wgrp"; |
||
32 | char *suffixdic= "sys/suffix"; |
||
33 | char *ignoredic= "sys/indignore"; |
||
34 | char *conffile= "log/wims.conf"; |
||
35 | char *mlistbase= "list"; |
||
36 | |||
37 | char lang[MAX_LANGS][4]={ |
||
1792 | bpr | 38 | "en","fr","cn","es","it","nl","si","ca","pt" |
10 | reyssat | 39 | }; |
40 | #define DEFAULT_LANGCNT 6 |
||
41 | char allang[MAX_LANGS][4]={ |
||
6564 | bpr | 42 | "en","fr","cn","es","it","nl","de","si","ca","pt" |
10 | reyssat | 43 | }; |
44 | #define allangcnt 8 |
||
45 | char ignore[MAX_LANGS][MAX_LINELEN+1]; |
||
46 | char mlistfile[MAX_LANGS][256]; |
||
47 | int langcnt; |
||
48 | FILE *langf, *titf, *descf, *weightf, *robotf, *indf, *listf, *addrf, *serialf, *authorf, *versionf; |
||
49 | |||
50 | struct cat { |
||
51 | char *name; |
||
52 | char typ; |
||
53 | } cat[]={ |
||
54 | {"all_types", 'A'}, |
||
55 | {"exercise", 'X'}, |
||
56 | {"oef", 'O'}, |
||
57 | {"tool", 'T'}, |
||
58 | {"recreation", 'R'}, |
||
59 | {"reference", 'Y'}, |
||
60 | {"document", 'D'}, |
||
61 | {"popup", 'P'}, |
||
62 | {"datamodule", 'M'} |
||
63 | }; |
||
64 | #define catno (sizeof(cat)/sizeof(cat[0])) |
||
65 | |||
66 | struct mod { |
||
67 | char *name; |
||
68 | unsigned char langs[MAX_LANGS]; |
||
69 | int counts[MAX_LANGS]; |
||
70 | int langcnt; |
||
71 | } mod[MAX_MODULES]; |
||
72 | int modcnt; |
||
73 | |||
74 | char *mlist; |
||
75 | |||
/* Allocate n bytes or terminate the program.
 * Never returns NULL: when the allocation fails a message is written to
 * stderr and the process exits with status 1. */
void *xmalloc(size_t n)
{
    /* malloc(0) is allowed to return NULL; ask for one byte instead so that
     * a NULL result always means a real allocation failure. */
    void *p = malloc(n != 0 ? n : 1);

    if (p == NULL) {
        fprintf(stderr, "Malloc failure.\n");
        exit(1);
    }
    return p;
}
||
86 | |||
87 | char *acctab="çéèêëúùûüáàâäãóòôöõíìïîñýÇÉÈÊËÚÙÛÜÁÀÂÃÄÓÒÔÖÕÍÌÏÎÑÝ", |
||
88 | *deatab="ceeeeuuuuaaaaaoooooiiiinyCEEEEUUUUAAAAAOOOOOIIIINY"; |
||
89 | |||
90 | /* fold accented letters to unaccented */ |
||
91 | void deaccent(char *p) |
||
92 | { |
||
3247 | bpr | 93 | char *sp; |
10 | reyssat | 94 | char *v; |
95 | for(sp=p;*sp;sp++) { |
||
96 | if(*sp<0 && (v=strchr(acctab,*sp))!=NULL) |
||
97 | *sp=*(deatab+(v-acctab)); |
||
98 | if(!isalnum(*sp) && strchr(",.&$+*",*sp)==0) *sp=' '; |
||
99 | else *sp=tolower(*sp); |
||
100 | } |
||
101 | } |
||
102 | |||
/* Replace by a space, in place, every character that is neither
 * alphanumeric nor one of "&$+*". */
void towords(char *p)
{
    char *pp;

    for (pp = p; *pp; pp++) {
        /* cast: a negative char passed to isalnum() is undefined behavior */
        if (!isalnum((unsigned char)*pp) && strchr("&$+*", *pp) == NULL)
            *pp = ' ';
    }
}
||
109 | |||
110 | /* Points to the end of the word */ |
||
111 | char *find_word_end(char *p) |
||
112 | { |
||
113 | int i; |
||
114 | for(i=0;!isspace(*p) && *p!=0 && i<MAX_LINELEN; p++,i++); |
||
115 | return p; |
||
116 | } |
||
117 | |||
118 | /* Strips leading spaces */ |
||
119 | char *find_word_start(char *p) |
||
120 | { |
||
121 | int i; |
||
122 | for(i=0; isspace(*p) && i<MAX_LINELEN; p++,i++); |
||
123 | return p; |
||
124 | } |
||
125 | |||
/* Find the first occurrence of w in p that stands as a whole word, i.e.
 * that is preceded by whitespace or the start of p and followed by
 * whitespace or the end of p.
 * Returns a pointer to the occurrence, or NULL when there is none. */
char *wordchr(char *p, char *w)
{
    size_t n = strlen(w);
    char *r;

    for (r = strstr(p, w); r != NULL; r = strstr(r + 1, w)) {
        int left_ok = (r == p) || isspace((unsigned char)r[-1]);
        int right_ok = (r[n] == 0) || isspace((unsigned char)r[n]);

        if (left_ok && right_ok) return r;
        /* Only reachable with an empty w: r sits on the terminator, and
         * searching from r+1 would read past the end of the string. */
        if (*r == 0) return NULL;
    }
    return NULL;
}
||
136 | |||
/* Find a variable name in a string (math expression): the first occurrence
 * of v in p that is neither preceded nor followed by an alphanumeric
 * character.
 * Returns a pointer to the occurrence, or NULL when there is none. */
char *varchr(char *p, char *v)
{
    size_t n = strlen(v);
    char *pp;

    for (pp = strstr(p, v); pp != NULL; pp = strstr(pp + 1, v)) {
        /* isalnum(0) is false, so the end of string also ends a name */
        if ((pp == p || !isalnum((unsigned char)pp[-1])) &&
            !isalnum((unsigned char)pp[n]))
            return pp;
        /* Only reachable with an empty v: do not search past the
         * terminator. */
        if (*pp == 0) return NULL;
    }
    return NULL;
}
||
148 | |||
/* Remove trailing whitespace from p in place.
 * Returns a pointer to the last remaining character, or p itself when the
 * string is (or has become) empty. */
char *strip_trailing_spaces(char *p)
{
    char *end = p + strlen(p);

    /* Work with a one-past pointer so that no pointer before the start of
     * the buffer is ever formed, even when the whole string is blank. */
    while (end > p && isspace((unsigned char)end[-1])) *--end = 0;
    return end > p ? end - 1 : p;
}
||
157 | |||
/* Return a pointer just past the '>' that closes the tag starting at p
 * (a leading '<' is optional).  Nested '<...>' groups and single- or
 * double-quoted strings are skipped.  When the tag or a quoted string is
 * not closed, the end of the string is returned. */
char *find_tag_end(char *p)
{
    char *cur = p;
    char *closing;

    if (*cur == '<') cur++;
    while (*cur != 0 && *cur != '>') {
        switch (*cur) {
        case '<':
            /* nested tag: resume right behind it */
            cur = find_tag_end(cur);
            break;
        case '"':
        case '\'':
            /* quoted string: jump behind the matching quote */
            closing = strchr(cur + 1, *cur);
            if (closing == NULL) return p + strlen(p);
            cur = closing + 1;
            break;
        default:
            cur++;
            break;
        }
    }
    if (*cur == '>') cur++;
    return cur;
}
||
177 | |||
/* Find the first tag named `tag` (case-insensitive) in p: a '<' directly
 * followed by the name and then by a non-alphanumeric character.
 * Returns a pointer to the '<', or to the end of p when no such tag
 * exists. */
char *find_tag(char *p, char *tag)
{
    size_t len = strlen(tag);
    char *pp;

    for (pp = strchr(p, '<'); pp != NULL && *pp; pp = strchr(pp + 1, '<')) {
        /* cast: a negative char passed to isalnum() is undefined behavior */
        if (strncasecmp(pp + 1, tag, len) == 0 &&
            !isalnum((unsigned char)pp[1 + len]))
            return pp;
    }
    return p + strlen(p);
}
||
188 | |||
/* Remove all html tags from p, in place.  An unterminated tag truncates
 * the string at its '<'. */
void detag(char *p)
{
    char *open = strchr(p, '<');

    while (open != NULL) {
        char *after = find_tag_end(open);

        if (*after == 0) {
            /* the tag runs to the end of the string: cut it off */
            *open = 0;
            return;
        }
        ovlstrcpy(open, after);
        open = strchr(open, '<');
    }
}
||
199 | |||
200 | /* modify a string. Bufferlen must be ast least MAX_LINELEN */ |
||
201 | void string_modify(char *start, char *bad_beg, char *bad_end, char *good,...) |
||
202 | { |
||
203 | char buf[MAX_LINELEN+1]; |
||
204 | va_list vp; |
||
205 | |||
206 | va_start(vp,good); |
||
207 | vsnprintf(buf,sizeof(buf),good,vp); va_end(vp); |
||
208 | if(strlen(start)-(bad_end-bad_beg)+strlen(buf)>=MAX_LINELEN) |
||
209 | return; |
||
210 | strcat(buf,bad_end); |
||
3718 | reyssat | 211 | ovlstrcpy(bad_beg,buf); |
10 | reyssat | 212 | } |
213 | |||
214 | void _getdef(char buf[], char *name, char value[]) |
||
215 | { |
||
216 | char *p1, *p2, *p3; |
||
217 | |||
218 | value[0]=0; |
||
219 | for(p1=strstr(buf,name); p1!=NULL; p1=strstr(p1+1,name)) { |
||
220 | p2=find_word_start(p1+strlen(name)); |
||
221 | if((p1>buf && !isspace(*(p1-1))) || *p2!='=') continue; |
||
222 | p3=p1; while(p3>buf && isspace(*(p3-1)) && *(p3-1)!='\n') p3--; |
||
223 | if(p3>buf && *(p3-1)!='\n') continue; |
||
6564 | bpr | 224 | p3=strchr(p2,'\n'); |
10 | reyssat | 225 | p2=find_word_start(p2+1); |
6564 | bpr | 226 | if(p3 <= p2) continue; |
10 | reyssat | 227 | snprintf(value,MAX_LINELEN,"%s",p2); |
228 | if(p3!=NULL && p3-p2<MAX_LINELEN) value[p3-p2]=0; |
||
229 | strip_trailing_spaces(value); |
||
230 | break; |
||
231 | } |
||
232 | } |
||
233 | |||
/* Get a variable definition from a file.
 * The whole file fname is read and searched with _getdef(); the result is
 * stored in `value`, a buffer of length MAX_LINELEN.  `value` is the empty
 * string when the file cannot be read or holds no such definition. */
void getdef(char *fname, char *name, char value[])
{
    FILE *f;
    char *buf;
    long l;
    size_t got;

    value[0] = 0;
    f = fopen(fname, "r");
    if (f == NULL) return;
    /* ftell() returns -1 on failure; never use that as a read size */
    if (fseek(f, 0, SEEK_END) != 0 || (l = ftell(f)) <= 0 ||
        fseek(f, 0, SEEK_SET) != 0) {
        fclose(f);
        return;
    }
    buf = xmalloc((size_t)l + 256);
    got = fread(buf, 1, (size_t)l, f);
    fclose(f);
    if (got > 0) {
        buf[got] = 0;
        _getdef(buf, name, value);
    }
    free(buf);
}
||
251 | |||
252 | #include "translator_.c" |
||
253 | |||
254 | char *mdicbuf, *gdicbuf; |
||
255 | char gentry[sizeof(entry)], mentry[sizeof(entry)]; |
||
256 | int gentrycount, mentrycount; |
||
257 | |||
258 | /* Preparation of data */ |
||
259 | void prep(void) |
||
260 | { |
||
261 | char buf[MAX_LINELEN+1]; |
||
262 | char *p1,*p2,*s,*old; |
||
263 | int i,l,thislang,t; |
||
264 | FILE *f; |
||
265 | |||
266 | s=getenv("modind_outdir"); if(s!=NULL && *s!=0) outdir=s; |
||
267 | s=getenv("modind_sheetdir"); if(s!=NULL && *s!=0) sheetdir=s; |
||
268 | snprintf(buf,sizeof(buf),"%s/addr",outdir); |
||
269 | addrf=fopen(buf,"w"); |
||
270 | snprintf(buf,sizeof(buf),"%s/serial",outdir); |
||
271 | serialf=fopen(buf,"w"); |
||
272 | modcnt=langcnt=0; |
||
273 | getdef(conffile,"site_languages",buf); |
||
274 | for(p1=buf;*p1;p1++) if(!isalnum(*p1)) *p1=' '; |
||
275 | for(p1=find_word_start(buf); *p1 && langcnt<MAX_LANGS; p1=find_word_start(p2)) { |
||
276 | p2=find_word_end(p1); |
||
277 | if(p2!=p1+2 || !isalpha(*p1) || !isalpha(*(p1+1))) continue; |
||
278 | memmove(lang[langcnt],p1,2); lang[langcnt++][2]=0; |
||
279 | } |
||
280 | if(langcnt==0) { /* default languages */ |
||
281 | langcnt=DEFAULT_LANGCNT; |
||
282 | } |
||
283 | s=getenv("mlist"); if(s==NULL) exit(1); |
||
284 | l=strlen(s); if(l<0 || l>100*MAX_LINELEN) exit(1); |
||
3718 | reyssat | 285 | mlist=xmalloc(l+16); ovlstrcpy(mlist,s); old=""; |
10 | reyssat | 286 | for(i=0;i<langcnt;i++) { |
287 | snprintf(buf,sizeof(buf),"%s/%s.%s",dicdir,ignoredic,lang[i]); |
||
288 | f=fopen(buf,"r"); if(f==NULL) continue; |
||
289 | l=fread(ignore[i],1,MAX_LINELEN,f);fclose(f); |
||
290 | if(l<0 || l>=MAX_LINELEN) l=0; |
||
291 | ignore[i][l]=0; |
||
292 | } |
||
293 | for(t=0, p1=find_word_start(mlist); |
||
294 | *p1 && modcnt<MAX_MODULES; |
||
295 | p1=find_word_start(p2), t++) { |
||
296 | p2=find_word_end(p1); |
||
297 | l=p2-p1; if(*p2) *p2++=0; |
||
298 | fprintf(addrf,"%d:%s\n",t,p1); |
||
299 | fprintf(serialf,"%s:%d\n",p1,t); |
||
300 | thislang=-1; |
||
6564 | bpr | 301 | /* language is taken from the address */ |
10 | reyssat | 302 | if(l>3 && p1[l-3]=='.') { |
303 | for(i=0;i<langcnt;i++) if(strcasecmp(lang[i],p1+l-2)==0) break; |
||
304 | if(i<langcnt) {p1[l-3]=0; thislang=i;} |
||
305 | else { /* unknown language, not referenced */ |
||
306 | continue; |
||
307 | } |
||
308 | } |
||
309 | if(modcnt>0 && strcmp(old,p1)==0 && thislang>=0) { |
||
310 | if(mod[modcnt-1].langcnt<langcnt) { |
||
311 | mod[modcnt-1].langs[mod[modcnt-1].langcnt]=thislang; |
||
312 | mod[modcnt-1].counts[mod[modcnt-1].langcnt]=t; |
||
313 | (mod[modcnt-1].langcnt)++; |
||
314 | } |
||
315 | } |
||
316 | else { |
||
317 | mod[modcnt].name=old=p1; |
||
318 | if(thislang>=0) { |
||
319 | mod[modcnt].langs[0]=thislang; |
||
320 | mod[modcnt].langcnt=1; |
||
321 | } |
||
322 | else mod[modcnt].langcnt=0; |
||
323 | mod[modcnt].counts[0]=t; |
||
324 | modcnt++; |
||
325 | } |
||
326 | } |
||
327 | snprintf(buf,sizeof(buf),"%s/language",outdir); |
||
328 | langf=fopen(buf,"w"); |
||
329 | snprintf(buf,sizeof(buf),"%s/title",outdir); |
||
330 | titf=fopen(buf,"w"); |
||
331 | snprintf(buf,sizeof(buf),"%s/description",outdir); |
||
332 | descf=fopen(buf,"w"); |
||
333 | snprintf(buf,sizeof(buf),"%s/author",outdir); |
||
334 | authorf=fopen(buf,"w"); |
||
335 | snprintf(buf,sizeof(buf),"%s/version",outdir); |
||
336 | versionf=fopen(buf,"w"); |
||
337 | snprintf(buf,sizeof(buf),"%s/lists/robot.phtml",outdir); |
||
338 | robotf=fopen(buf,"w"); |
||
339 | fclose(addrf); fclose(serialf); |
||
340 | if(!robotf || !versionf || !authorf || !descf || !titf || !descf) { |
||
341 | fprintf(stderr,"modind: error creating output files.\n"); |
||
342 | exit(1); |
||
343 | } |
||
344 | } |
||
345 | |||
346 | void sprep(void) |
||
347 | { |
||
348 | char *p1,*p2,*s; |
||
349 | int i,l,thislang; |
||
350 | |||
351 | modcnt=0; |
||
352 | s=getenv("slist"); if(s==NULL) return; |
||
353 | l=strlen(s); if(l<0 || l>100*MAX_LINELEN) return; |
||
3718 | reyssat | 354 | mlist=xmalloc(l+16); ovlstrcpy(mlist,s); |
10 | reyssat | 355 | for(p1=find_word_start(mlist); *p1 && modcnt<MAX_MODULES; p1=find_word_start(p2)) { |
356 | p2=find_word_end(p1); |
||
357 | l=p2-p1; if(*p2) *p2++=0; |
||
358 | for(i=0;i<langcnt;i++) if(strncasecmp(lang[i],p1,2)==0) break; |
||
359 | if(i<langcnt) thislang=i; else continue; |
||
360 | mod[modcnt].name=p1; |
||
361 | mod[modcnt].langs[0]=thislang; |
||
362 | mod[modcnt].langcnt=1; |
||
363 | modcnt++; |
||
364 | } |
||
365 | } |
||
366 | |||
367 | void clean(void) |
||
368 | { |
||
369 | fclose(langf); fclose(titf); fclose(descf); fclose(robotf); |
||
370 | fclose(authorf); fclose(versionf); |
||
371 | } |
||
372 | |||
373 | char *sheetindex[]={ |
||
374 | "title", "description", |
||
375 | "duration", "severity", |
||
376 | "level", "domain", |
||
377 | "keywords", "reserved1", "reserved2", "remark" |
||
378 | }; |
||
379 | #define SHEETINDEX_NO (sizeof(sheetindex)/sizeof(sheetindex[0])) |
||
380 | char sindbuf[SHEETINDEX_NO][MAX_LINELEN+1]; |
||
381 | enum{s_title, s_description, |
||
382 | s_duration, s_severity, |
||
383 | s_level, s_domain, |
||
384 | s_keywords, s_reserved1, s_reserved2, |
||
385 | s_remark |
||
386 | }; |
||
387 | |||
388 | char *modindex[]={ |
||
389 | "title", "description", |
||
390 | "author", "address", "copyright", |
||
391 | "version", "wims_version", "language", |
||
6394 | bpr | 392 | "category", "level", "domain", "keywords", |
6799 | bpr | 393 | "keywords_ca", "keywords_en", "keywords_fr", "keywords_it", "keywords_nl", |
394 | "title_ca", "title_en", "title_fr", "title_it", "title_nl", |
||
10 | reyssat | 395 | "require" |
396 | }; |
||
397 | #define MODINDEX_NO (sizeof(modindex)/sizeof(modindex[0])) |
||
398 | char indbuf[MODINDEX_NO][MAX_LINELEN+1]; |
||
399 | enum{i_title, i_description, |
||
400 | i_author,i_address,i_copyright, |
||
401 | i_version,i_wims_version,i_language, |
||
402 | i_category,i_level,i_domain,i_keywords, |
||
6799 | bpr | 403 | i_keywords_ca,i_keywords_en,i_keywords_fr,i_keywords_it,i_keywords_nl, |
404 | i_title_ca,i_title_en,i_title_fr,i_title_it,i_title_nl, |
||
10 | reyssat | 405 | i_require |
406 | }; |
||
407 | |||
408 | char *module_special_file[]={ |
||
409 | "intro","help","about" |
||
410 | }; |
||
411 | #define MODSPEC_NO (sizeof(module_special_file)/sizeof(module_special_file[0])) |
||
412 | char module_language[4]; |
||
413 | |||
414 | /* read and treat module's INDEX file */ |
||
415 | int module_index(const char *name) |
||
416 | { |
||
417 | char *p, fbuf[MAX_LINELEN+1], ibuf[MAX_LINELEN+1]; |
||
418 | FILE *indf; |
||
419 | int i,l; |
||
420 | |||
421 | snprintf(fbuf,sizeof(fbuf),"%s/%s/INDEX",moduledir,name); |
||
422 | indf=fopen(fbuf,"r"); if(indf==NULL) return -1; |
||
423 | l=fread(ibuf,1,MAX_LINELEN,indf); fclose(indf); |
||
424 | if(l>0 && l<MAX_LINELEN) ibuf[l]=0; else return -1; |
||
425 | for(i=0;i<MODINDEX_NO;i++) { |
||
426 | _getdef(ibuf,modindex[i],indbuf[i]); |
||
427 | /* compatibility precaution */ |
||
428 | if(indbuf[i][0]==':') indbuf[i][0]='.'; |
||
429 | } |
||
430 | p=find_word_start(indbuf[i_language]); |
||
431 | if(isalpha(*p) && isalpha(*(p+1))) { |
||
432 | memmove(module_language,p,2); module_language[2]=0; |
||
433 | } |
||
3718 | reyssat | 434 | else ovlstrcpy(module_language,"en"); |
10 | reyssat | 435 | return 0; |
436 | } |
||
437 | |||
438 | int sheet_index(int serial) |
||
439 | { |
||
440 | char *p1, *p2, fbuf[MAX_LINELEN+1], ibuf[MAX_LINELEN+1]; |
||
441 | FILE *indf; |
||
442 | int i,l; |
||
443 | |||
444 | snprintf(fbuf,sizeof(fbuf),"%s/%s.def",sheetdir,mod[serial].name); |
||
445 | indf=fopen(fbuf,"r"); if(indf==NULL) return -1; |
||
446 | l=fread(ibuf,1,MAX_LINELEN,indf); fclose(indf); |
||
447 | if(l>0 && l<MAX_LINELEN) ibuf[l]=0; else return -1; |
||
448 | for(i=0;i<SHEETINDEX_NO;i++) sindbuf[i][0]=0; |
||
449 | for(i=0,p1=find_word_start(ibuf); |
||
450 | i<SHEETINDEX_NO-1 && *p1!=':' && *p1!=0; |
||
451 | i++,p1=p2) { |
||
452 | p2=strchr(p1,'\n'); |
||
453 | if(p2!=NULL) *p2++=0; else p2=p1+strlen(p1); |
||
454 | p1=find_word_start(p1); strip_trailing_spaces(p1); |
||
455 | snprintf(sindbuf[i],MAX_LINELEN,"%s",p1); |
||
456 | } |
||
457 | p2=strstr(p1,"\n:"); if(p2==NULL) p2=p1+strlen(p1); |
||
458 | else *p2=0; |
||
459 | p1=find_word_start(p1); strip_trailing_spaces(p1); |
||
460 | for(p2=p1;*p2;p2++) if(*p2=='\n') *p2=' '; |
||
3718 | reyssat | 461 | ovlstrcpy(sindbuf[s_remark],p1); |
10 | reyssat | 462 | return 0; |
463 | } |
||
464 | |||
465 | unsigned char categories[16]; |
||
466 | char taken[MAX_LINELEN+1]; |
||
467 | int catcnt, takenlen, tweight; |
||
468 | |||
469 | void appenditem(char *word, int lind, int serial, int weight, char *l) |
||
470 | { |
||
471 | char nbuf[MAX_LINELEN+1], buf[MAX_LINELEN+1]; |
||
472 | int i, ll; |
||
473 | char *p; |
||
474 | FILE *f; |
||
475 | |||
476 | if(!isalnum(*word) || (ll=strlen(word))<2 || |
||
477 | wordchr(taken,word)!=NULL || |
||
478 | wordchr(ignore[lind],word)!=NULL || |
||
479 | takenlen>=MAX_LINELEN-ll-16) |
||
480 | return; |
||
481 | if(ll==2 && (!isdigit(word[0]) || !isalpha(word[1]))) return; |
||
482 | for(p=word;*p;p++) if(!isalnum(*p) && *p!=' ') return; |
||
483 | taken[takenlen++]=' '; taken[takenlen++]=' '; |
||
3718 | reyssat | 484 | ovlstrcpy(taken+takenlen,word); |
10 | reyssat | 485 | takenlen+=ll; tweight+=weight; |
486 | snprintf(buf,sizeof(buf),"%s:%d?%d\n",word,serial,weight); |
||
487 | for(i=0;i<catcnt;i++) { |
||
488 | snprintf(nbuf,sizeof(nbuf),"%s/%c.%s", |
||
489 | outdir,categories[i],lang[lind]); |
||
490 | f=fopen(nbuf,"a"); |
||
491 | if(f!=NULL) {fputs(buf,f); fclose(f);} |
||
492 | } |
||
493 | } |
||
494 | |||
495 | void onemodule(const char *name, int serial, int lind) |
||
496 | { |
||
497 | int i; |
||
498 | unsigned char trlist[]={ |
||
499 | i_title,i_description,i_category,i_domain,i_keywords, |
||
6394 | bpr | 500 | i_require,i_author, |
6799 | bpr | 501 | i_keywords_ca,i_keywords_en,i_keywords_fr,i_keywords_it,i_keywords_nl, |
502 | i_title_ca,i_title_en,i_title_fr,i_title_it,i_title_nl |
||
10 | reyssat | 503 | }; |
504 | #define trcnt (sizeof(trlist)/sizeof(trlist[0])) |
||
6564 | bpr | 505 | char *p1, *p2, *pp, *q, buf[MAX_LINELEN+1], lbuf[16]; |
10 | reyssat | 506 | FILE *f; |
507 | |||
508 | if(module_index(name)) return; |
||
509 | towords(indbuf[i_category]); |
||
510 | for(i=catcnt=0;i<catno && catcnt<16;i++) { |
||
511 | if(wordchr(indbuf[i_category],cat[i].name)!=NULL) |
||
512 | categories[catcnt++]=cat[i].typ; |
||
513 | } |
||
514 | if(catcnt==0) return; |
||
515 | if(categories[0]!=cat[0].typ) |
||
516 | categories[catcnt++]=cat[0].typ; |
||
517 | for(i=0;i<catcnt;i++) { |
||
518 | snprintf(buf,sizeof(buf),"%s/lists/%c.%s", |
||
519 | outdir,categories[i],lang[lind]); |
||
520 | f=fopen(buf,"a"); |
||
521 | if(f!=NULL) {fprintf(f,"%s\n",name); fclose(f);} |
||
522 | } |
||
523 | fprintf(langf,"%d:%s\n",serial,module_language); |
||
524 | fprintf(titf,"%d:%s\n",serial,indbuf[i_title]); |
||
525 | fprintf(descf,"%d:%s\n",serial,indbuf[i_description]); |
||
526 | fprintf(authorf,"%d:%s\n",serial,indbuf[i_author]); |
||
527 | fprintf(versionf,"%d:%s\n",serial,indbuf[i_version]); |
||
528 | snprintf(buf,sizeof(buf),"%s",indbuf[i_description]); |
||
529 | for(pp=strchr(buf,','); pp; pp=strchr(pp,',')) |
||
530 | string_modify(buf,pp,pp+1,","); |
||
531 | if(strcmp(module_language,lang[lind])==0) |
||
532 | fprintf(robotf,"%s ,%s,%s,%s,%s\n",name,module_language,name, |
||
533 | indbuf[i_title], buf); |
||
534 | entrycount=mentrycount; dicbuf=mdicbuf; |
||
535 | memmove(entry,mentry,mentrycount*sizeof(entry[0])); |
||
536 | unknown_type=unk_leave; |
||
537 | for(i=0;i<trcnt;i++) { |
||
538 | detag(indbuf[trlist[i]]); |
||
539 | deaccent(indbuf[trlist[i]]); |
||
540 | singlespace(indbuf[trlist[i]]); |
||
541 | suffix_translate(indbuf[trlist[i]]); |
||
542 | translate(indbuf[trlist[i]]); |
||
543 | } |
||
544 | taken[0]=0; takenlen=tweight=0; |
||
3718 | reyssat | 545 | ovlstrcpy(buf,indbuf[i_title]); towords(buf); |
10 | reyssat | 546 | for(p1=find_word_start(buf);*p1; |
547 | p1=find_word_start(p2)) { |
||
548 | p2=find_word_end(p1); if(*p2) *p2++=0; |
||
549 | appenditem(p1,lind,serial,4,module_language); |
||
550 | } |
||
6799 | bpr | 551 | snprintf(buf,sizeof(buf),"%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s", |
10 | reyssat | 552 | indbuf[i_description],indbuf[i_keywords], |
6799 | bpr | 553 | indbuf[i_keywords_ca],indbuf[i_keywords_fr],indbuf[i_keywords_fr], |
6394 | bpr | 554 | indbuf[i_keywords_it],indbuf[i_keywords_nl], |
6799 | bpr | 555 | indbuf[i_title_ca],indbuf[i_title_fr],indbuf[i_title_fr], |
556 | indbuf[i_title_it],indbuf[i_title_nl], |
||
10 | reyssat | 557 | indbuf[i_domain],indbuf[i_require],indbuf[i_author]); |
558 | towords(buf); |
||
559 | for(p1=find_word_start(buf);*p1; |
||
560 | p1=find_word_start(p2)) { |
||
561 | p2=find_word_end(p1); if(*p2) *p2++=0; |
||
562 | appenditem(p1,lind,serial,2,module_language); |
||
563 | } |
||
564 | entrycount=gentrycount; dicbuf=gdicbuf; |
||
565 | memmove(entry,gentry,gentrycount*sizeof(entry[0])); |
||
566 | unknown_type=unk_delete; |
||
3718 | reyssat | 567 | ovlstrcpy(buf,indbuf[i_title]); translate(buf); |
10 | reyssat | 568 | for(p1=find_word_start(buf); *p1; |
569 | p1=find_word_start(p2)) { |
||
570 | p2=strchr(p1,','); |
||
571 | if(p2!=NULL) *p2++=0; else p2=p1+strlen(p1); |
||
572 | if(strlen(p1)<=0) continue; |
||
573 | appenditem(p1,lind,serial,4,module_language); |
||
574 | } |
||
6799 | bpr | 575 | snprintf(buf,sizeof(buf),"%s, %s, %s, %s, %s, %s, %s, %s", |
10 | reyssat | 576 | indbuf[i_description],indbuf[i_keywords], |
6799 | bpr | 577 | indbuf[i_keywords_ca], indbuf[i_keywords_en],indbuf[i_keywords_fr], |
6394 | bpr | 578 | indbuf[i_keywords_it], indbuf[i_keywords_nl], |
10 | reyssat | 579 | indbuf[i_domain]); |
580 | translate(buf); |
||
581 | for(p1=find_word_start(buf); *p1; |
||
582 | p1=find_word_start(p2)) { |
||
583 | p2=strchr(p1,','); |
||
584 | if(p2!=NULL) *p2++=0; else p2=p1+strlen(p1); |
||
585 | if(strlen(p1)<=0) continue; |
||
586 | appenditem(p1,lind,serial,2,module_language); |
||
587 | } |
||
588 | snprintf(buf,sizeof(buf),"%s",indbuf[i_level]); |
||
3718 | reyssat | 589 | ovlstrcpy(lbuf,"level"); |
10 | reyssat | 590 | for(p1=buf; *p1; p1++) if(!isalnum(*p1)) *p1=' '; |
6564 | bpr | 591 | q=buf+strlen(buf); |
592 | for(p1=find_word_start(buf); (*p1) && (p1 < q) ; |
||
10 | reyssat | 593 | p1=find_word_start(p2)) { |
594 | p2=find_word_end(p1); |
||
595 | if(p2!=NULL) *p2++=0; else p2=p1+strlen(p1); |
||
596 | if(!isalpha(*p1) || |
||
597 | (!isdigit(*(p1+1)) && *(p1+1)!=0) || |
||
598 | (*(p1+1)!=0 && *(p1+2)!=0)) |
||
599 | continue; |
||
600 | *p1=tolower(*p1); |
||
3718 | reyssat | 601 | ovlstrcpy(lbuf+strlen("level"),p1); |
10 | reyssat | 602 | appenditem(lbuf,lind,serial,2,module_language); |
603 | } |
||
604 | fprintf(weightf,"%d:%d\n",serial,tweight); |
||
605 | } |
||
606 | |||
607 | void modules(void) |
||
608 | { |
||
609 | int i,j,k,d; |
||
610 | char namebuf[MAX_LINELEN+1]; |
||
611 | char mdic[MAX_LINELEN+1], sdic[MAX_LINELEN+1], gdic[MAX_LINELEN+1]; |
||
612 | |||
613 | for(j=0;j<langcnt;j++) { |
||
614 | snprintf(namebuf,sizeof(namebuf),"%s/weight.%s",outdir,lang[j]); |
||
615 | weightf=fopen(namebuf,"w"); |
||
616 | snprintf(mdic,sizeof(mdic),"%s/%s.%s",dicdir,maindic,lang[j]); |
||
617 | snprintf(sdic,sizeof(sdic),"%s/%s.%s",dicdir,suffixdic,lang[j]); |
||
618 | snprintf(gdic,sizeof(gdic),"%s/%s.%s",dicdir,groupdic,lang[j]); |
||
619 | suffix_dic(sdic); prepare_dic(gdic); |
||
620 | gdicbuf=dicbuf; gentrycount=entrycount; |
||
621 | memmove(gentry,entry,gentrycount*sizeof(entry[0])); |
||
622 | prepare_dic(mdic); |
||
623 | mdicbuf=dicbuf; mentrycount=entrycount; |
||
624 | memmove(mentry,entry,mentrycount*sizeof(entry[0])); |
||
625 | unknown_type=unk_leave; translate(ignore[j]); |
||
626 | for(i=0;i<modcnt;i++) { |
||
627 | if(mod[i].langcnt>0) { |
||
628 | for(d=k=0;k<mod[i].langcnt;k++) |
||
629 | if(mod[i].langs[k]<mod[i].langs[d]) d=k; |
||
630 | for(k=0;k<mod[i].langcnt && mod[i].langs[k]!=j;k++); |
||
631 | if(k>=mod[i].langcnt) k=d; |
||
632 | snprintf(namebuf,MAX_LINELEN,"%s.%s",mod[i].name, |
||
633 | lang[mod[i].langs[k]]); |
||
634 | onemodule(namebuf,mod[i].counts[k],j); |
||
635 | } |
||
636 | else { |
||
637 | onemodule(mod[i].name,mod[i].counts[0],j); |
||
638 | } |
||
639 | } |
||
640 | if(mentrycount>0) free(mdicbuf); |
||
641 | if(gentrycount>0) free(gdicbuf); |
||
642 | if(suffixcnt>0) free(sufbuf); |
||
643 | if(weightf) fclose(weightf); |
||
644 | } |
||
645 | } |
||
646 | |||
647 | void sappenditem(char *word, int lind, int serial, int weight) |
||
648 | { |
||
649 | int ll; |
||
650 | char *p; |
||
651 | |||
652 | if(!isalnum(*word) || (ll=strlen(word))<2 || |
||
653 | wordchr(taken,word)!=NULL || |
||
654 | wordchr(ignore[lind],word)!=NULL || |
||
655 | takenlen>=MAX_LINELEN-ll-16) |
||
656 | return; |
||
657 | if(ll==2 && (!isdigit(word[0]) || !isalpha(word[1]))) return; |
||
658 | for(p=word;*p;p++) if(!isalnum(*p) && *p!=' ') return; |
||
659 | taken[takenlen++]=' ';taken[takenlen++]=' '; |
||
3718 | reyssat | 660 | ovlstrcpy(taken+takenlen,word); |
10 | reyssat | 661 | takenlen+=ll; tweight+=weight; |
662 | fprintf(indf,"%s:%d?%d\n",word,serial,weight); |
||
663 | } |
||
664 | |||
665 | void onesheet(int serial, int lind) |
||
666 | { |
||
667 | int i; |
||
668 | unsigned char trlist[]={ |
||
669 | s_title,s_description,s_domain,s_keywords,s_remark |
||
670 | }; |
||
671 | #define trcnt (sizeof(trlist)/sizeof(trlist[0])) |
||
672 | char *p1, *p2, buf[MAX_LINELEN+1]; |
||
673 | |||
674 | if(sheet_index(serial)) return; |
||
675 | fprintf(listf,"%s\n",mod[serial].name+3); |
||
676 | fprintf(titf,"%d:%s\n",serial,sindbuf[s_title]); |
||
677 | fprintf(descf,"%d:%s\n",serial,sindbuf[s_description]); |
||
678 | entrycount=mentrycount; dicbuf=mdicbuf; |
||
679 | memmove(entry,mentry,mentrycount*sizeof(entry[0])); |
||
680 | unknown_type=unk_leave; |
||
681 | for(i=0;i<trcnt;i++) { |
||
682 | detag(sindbuf[trlist[i]]); |
||
683 | deaccent(sindbuf[trlist[i]]); |
||
684 | singlespace(sindbuf[trlist[i]]); |
||
685 | suffix_translate(sindbuf[trlist[i]]); |
||
686 | translate(sindbuf[trlist[i]]); |
||
687 | } |
||
688 | taken[0]=0; takenlen=tweight=0; |
||
3718 | reyssat | 689 | ovlstrcpy(buf,sindbuf[s_title]); towords(buf); |
10 | reyssat | 690 | for(p1=find_word_start(buf);*p1; |
691 | p1=find_word_start(p2)) { |
||
692 | p2=find_word_end(p1); if(*p2) *p2++=0; |
||
693 | sappenditem(p1,lind,serial,4); |
||
694 | } |
||
695 | snprintf(buf,sizeof(buf),"%s %s %s %s", |
||
696 | sindbuf[s_description],sindbuf[s_keywords], |
||
697 | sindbuf[s_domain],sindbuf[s_remark]); |
||
698 | towords(buf); |
||
699 | for(p1=find_word_start(buf);*p1; |
||
700 | p1=find_word_start(p2)) { |
||
701 | p2=find_word_end(p1); if(*p2) *p2++=0; |
||
702 | sappenditem(p1,lind,serial,2); |
||
703 | } |
||
704 | entrycount=gentrycount; dicbuf=gdicbuf; |
||
705 | memmove(entry,gentry,gentrycount*sizeof(entry[0])); |
||
706 | unknown_type=unk_delete; |
||
3718 | reyssat | 707 | ovlstrcpy(buf,sindbuf[s_title]); translate(buf); |
10 | reyssat | 708 | for(p1=find_word_start(buf); *p1; |
709 | p1=find_word_start(p2)) { |
||
710 | p2=strchr(p1,','); |
||
711 | if(p2!=NULL) *p2++=0; else p2=p1+strlen(p1); |
||
712 | if(strlen(p1)<=0) continue; |
||
713 | sappenditem(p1,lind,serial,4); |
||
714 | } |
||
715 | snprintf(buf,sizeof(buf),"%s, %s, %s, %s", |
||
716 | sindbuf[s_description],sindbuf[s_keywords], |
||
717 | sindbuf[s_domain],sindbuf[s_remark]); |
||
718 | translate(buf); |
||
719 | for(p1=find_word_start(buf); *p1; |
||
720 | p1=find_word_start(p2)) { |
||
721 | p2=strchr(p1,','); |
||
722 | if(p2!=NULL) *p2++=0; else p2=p1+strlen(p1); |
||
723 | if(strlen(p1)<=0) continue; |
||
724 | sappenditem(p1,lind,serial,2); |
||
725 | } |
||
726 | fprintf(weightf,"%d:%d\n",serial,tweight); |
||
727 | } |
||
728 | |||
729 | void sheets(void) |
||
730 | { |
||
731 | int i,j; |
||
732 | char mdic[MAX_LINELEN+1], sdic[MAX_LINELEN+1], gdic[MAX_LINELEN+1]; |
||
733 | char buf[MAX_LINELEN+1]; |
||
734 | |||
735 | for(j=0;j<langcnt;j++) { |
||
736 | snprintf(buf,sizeof(buf),"%s/index/title.%s",sheetdir,lang[j]); |
||
737 | titf=fopen(buf,"w"); |
||
738 | snprintf(buf,sizeof(buf),"%s/index/description.%s",sheetdir,lang[j]); |
||
739 | descf=fopen(buf,"w"); |
||
740 | snprintf(buf,sizeof(buf),"%s/index/%s",sheetdir,lang[j]); |
||
741 | indf=fopen(buf,"w"); |
||
742 | snprintf(buf,sizeof(buf),"%s/index/list.%s",sheetdir,lang[j]); |
||
743 | listf=fopen(buf,"w"); |
||
744 | snprintf(buf,sizeof(buf),"%s/index/weight.%s",sheetdir,lang[j]); |
||
745 | weightf=fopen(buf,"w"); |
||
746 | snprintf(buf,sizeof(buf),"%s/index/addr.%s",sheetdir,lang[j]); |
||
747 | addrf=fopen(buf,"w"); |
||
748 | snprintf(buf,sizeof(buf),"%s/index/serial.%s",sheetdir,lang[j]); |
||
749 | serialf=fopen(buf,"w"); |
||
750 | snprintf(mdic,sizeof(mdic),"%s/%s.%s",dicdir,maindic,lang[j]); |
||
751 | snprintf(sdic,sizeof(sdic),"%s/%s.%s",dicdir,suffixdic,lang[j]); |
||
752 | snprintf(gdic,sizeof(gdic),"%s/%s.%s",dicdir,groupdic,lang[j]); |
||
753 | suffix_dic(sdic); prepare_dic(gdic); |
||
754 | gdicbuf=dicbuf; gentrycount=entrycount; |
||
755 | memmove(gentry,entry,gentrycount*sizeof(entry[0])); |
||
756 | prepare_dic(mdic); |
||
757 | mdicbuf=dicbuf; mentrycount=entrycount; |
||
758 | memmove(mentry,entry,mentrycount*sizeof(entry[0])); |
||
759 | unknown_type=unk_leave; translate(ignore[j]); |
||
760 | for(i=0;i<modcnt;i++) { |
||
761 | if(mod[i].langs[0]!=j) continue; |
||
762 | fprintf(addrf,"%d:%s\n",i,mod[i].name+3); |
||
763 | fprintf(serialf,"%s:%d\n",mod[i].name+3,i); |
||
764 | onesheet(i,j); |
||
765 | } |
||
766 | if(mentrycount>0) free(mdicbuf); |
||
767 | if(gentrycount>0) free(gdicbuf); |
||
768 | if(suffixcnt>0) free(sufbuf); |
||
769 | fclose(titf); fclose(descf); fclose(indf); fclose(listf); |
||
770 | fclose(weightf); fclose(addrf); fclose(serialf); |
||
771 | } |
||
772 | } |
||
773 | |||
774 | int main() |
||
775 | { |
||
776 | prep(); |
||
777 | if(modcnt>0) modules(); |
||
778 | clean(); |
||
779 | sprep(); |
||
780 | if(modcnt>0) sheets(); |
||
781 | return 0; |
||
782 | } |
||
783 |