Subversion Repositories wimsdev

Rev

Rev 15393 | Rev 15440 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed

  1. /*    Copyright (C) 1998-2003 XIAO, Gang of Universite de Nice - Sophia Antipolis
  2.  *
  3.  *  This program is free software; you can redistribute it and/or modify
  4.  *  it under the terms of the GNU General Public License as published by
  5.  *  the Free Software Foundation; either version 2 of the License, or
  6.  *  (at your option) any later version.
  7.  *
  8.  *  This program is distributed in the hope that it will be useful,
  9.  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  10.  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11.  *  GNU General Public License for more details.
  12.  *
  13.  *  You should have received a copy of the GNU General Public License
  14.  *  along with this program; if not, write to the Free Software
  15.  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  16.  */
  17.  
  18. /*  This is an internal program,
  19.  * used to index modules for search engine.
  20.  */
  21.  
  22. #include "../Lib/libwims.h"
  23. #include "translator_.h"
  24. #include "suffix.h"
  25.  
  /* Capacity limits. MAX_LANGUAGES is not defined in this file. */
  #define MAX_LANGS   MAX_LANGUAGES
  #define MAX_MODULES 65536

  /* Directories that are read from. */
  char *moduledir      = "public_html/modules";
  char *sheetdir       = "public_html/bases/sheet";   /* may be replaced from the environment in prep() */
  char *glossarydir    = "public_html/scripts/data/glossary";
  char *dicdir         = "public_html/bases";          /* prefix of every dictionary path below */

  /* Directories that are written to. */
  char *outdir         = "public_html/bases/site2";    /* may be replaced from the environment in prep() */
  char *sheetoutdir    = "public_html/bases/sheet/index"; /* may be replaced from the environment in prep() */
  char *glossaryoutdir = "public_html/scripts/data/glossary/index";

  /* Dictionary base names, relative to dicdir; ".<lang>" is appended when used. */
  char *maindic        = "sys/words";
  char *groupdic       = "sys/wgrp/wgrp";
  char *suffixdic      = "sys/suffix";
  char *domaindic      = "sys/domaindic";
  char *ignoredic      = "sys/indignore";

  /* Configuration file searched for the site_languages definition. */
  char *conffile       = "log/wims.conf";
  /* Sub-directory of outdir that receives the per-category module lists. */
  char *mlistbase      = "lists";
  42.  
  43. char lang[MAX_LANGS][4]={
  44.     "en","fr","cn","es","it","nl","si","ca","pt"
  45. };
  46. #define DEFAULT_LANGCNT    6
  47. char allang[MAX_LANGS][4]={
  48.     "en","fr","cn","es","it","nl","de","si","ca","pt"
  49. };
  50. #define allangcnt 8
  51. char ignore[MAX_LANGS][MAX_LINELEN+1];
  52. char mlistfile[MAX_LANGS][256];
  53. int langcnt;
  54. FILE *langf, *titf, *descf, *weightf, *robotf, *indf, *listf, *addrf, *serialf, *authorf, *versionf, *remf, *titlef;
  55.  
  /* Module categories: the word searched for in a module's category field
   * and the one-letter code used in output file names.
   */
  struct cat {
      char *name;   /* category word */
      char typ;     /* one-letter code */
  } cat[] = {
      { "all_types",  'A' },
      { "exercise",   'X' },
      { "oef",        'O' },
      { "tool",       'T' },
      { "recreation", 'R' },
      { "reference",  'Y' },
      { "document",   'D' },
      { "popup",      'P' },
      { "datamodule", 'M' }
  };
  /* Number of entries in cat[]. */
  #define catno (sizeof(cat)/sizeof(cat[0]))
  71.  
  72. struct mod {
  73.     char *name;
  74.     unsigned char langs[MAX_LANGS];
  75.     int counts[MAX_LANGS];
  76.     int  langcnt;
  77. } mod[MAX_MODULES];
  78. int modcnt;
  79.  
  80. char *mlist;
  81.  
  82. /*  fold known accented letters to unaccented, other strange characters to space
  83.  *  apostrophe is among the exceptions to be kept (important for multi-word expressions)
  84.  */
  85. void deaccent2(char *p)
  86. {
  87.   char *sp;
  88.   char *v;
  89.   for(sp=p;*sp;sp++) {
  90.   if(*sp<0 && (v=strchr(acctab,*sp))!=NULL)
  91.     *sp=*(deatab+(v-acctab));
  92.   if(!isalnum(*sp) && strchr(",.&$+*",*sp)==0) *sp=' ';
  93.   else *sp=tolower(*sp);
  94.   }
  95. }
  96.  
  /*  Replace, in place, every character that is neither alphanumeric nor one
   *  of  & $ + *  by a space.
   */
  void towords(char *p)
  {
    char *pp;

    for(pp=p;*pp;pp++) {
      /* cast: isalnum() is undefined for negative plain-char values */
      if(!isalnum((unsigned char)*pp) && strchr("&$+*",*pp)==NULL) *pp=' ';
    }
  }
  103.  
  /*  Find the first occurrence of w in p that stands as a whole word, i.e.
   *  is preceded by the start of p or a space character and followed by a
   *  space character or the end of the string.
   *  Returns a pointer into p, or NULL when there is no such occurrence.
   *  An empty w never matches.
   */
  char *wordchr2(char *p, char *w)
  {
    char *r;
    size_t len=strlen(w);

    /* with an empty word strstr() matches everywhere and the scan could
     * step past the terminator */
    if(len==0) return NULL;
    for(r=strstr(p,w); r!=NULL; r=strstr(r+1,w)) {
      if(r>p && !isspace((unsigned char)*(r-1))) continue;
      if(*(r+len)!=0 && !isspace((unsigned char)*(r+len))) continue;
      return r;
    }
    return NULL;
  }
  114.  
  /*  Return a pointer just past the '>' that closes the tag starting at p
   *  (p may or may not point at the opening '<').  Nested '<...>' groups and
   *  single- or double-quoted strings are stepped over as a whole.  When the
   *  tag or a quoted string is not closed, the end of the string is returned.
   */
  char *find_tag_end(char *p)
  {
    char *cur=p;
    char *closing;

    if(*cur=='<') cur++;
    while(*cur!=0 && *cur!='>') {
      switch(*cur) {
        case '<':
          /* nested group: resume right after it */
          cur=find_tag_end(cur);
          break;
        case '"':
        case '\'':
          closing=strchr(cur+1,*cur);
          if(closing==NULL) return p+strlen(p);
          cur=closing+1;
          break;
        default:
          cur++;
          break;
      }
    }
    return (*cur=='>') ? cur+1 : cur;
  }
  135.  
  /*  Find the first '<' in p that is immediately followed by tag (compared
   *  without regard to case) and then by a non-alphanumeric character.
   *  Returns a pointer to that '<', or to the terminating NUL of p when no
   *  such tag exists.
   */
  char *find_tag(char *p, char *tag)
  {
    char *pp;
    size_t len=strlen(tag);

    for(pp=strchr(p,'<'); pp!=NULL; pp=strchr(pp+1,'<')) {
      /* cast: isalnum() is undefined for negative plain-char values */
      if(strncasecmp(pp+1,tag,len)==0 &&
         !isalnum((unsigned char)*(pp+1+len))) return pp;
    }
    return p+strlen(p);
  }
  146.  
  /*  Strip every html tag from p, in place.  A tag that is never closed
   *  truncates the string at its opening '<'.
   */
  void detag(char *p)
  {
    char *open=strchr(p,'<');
    char *rest;

    while(open!=NULL) {
      rest=find_tag_end(open);
      if(*rest==0) {
        *open=0;
        return;
      }
      /* pull the remainder of the string over the tag */
      ovlstrcpy(open,rest);
      open=strchr(open,'<');
    }
  }
  157.  
  /*  Replace every ',' in p by ", " so that a comma always ends a word.
   *  The replacement is delegated to string_modify3(), defined outside
   *  this file.
   */
  void comma(char *p)
  {
    char *pp=strchr(p,',');

    while(pp!=NULL) {
      string_modify3(p,pp,pp+1,", ");
      pp=strchr(pp+1,',');
    }
  }
  166.  
  167. /* _getdef from lines.c except the error msg*/
  168. void _getdef(char buf[], char *name, char value[])
  169. {
  170.   char *p1, *p2, *p3, *p4;
  171.  
  172.   if(*name==0) goto nothing;      /* this would create segfault. */
  173.   for(p1=strstr(buf,name); p1!=NULL; p1=strstr(p1+1,name)) {
  174.     p2=find_word_start(p1+strlen(name));
  175.     if((p1>buf && !isspace(*(p1-1))) || *p2!='=') continue;
  176.     p3=p1; while(p3>buf && *(p3-1)!='\n') p3--;
  177.     p3=find_word_start(p3);
  178.     if(p3<p1 && *p3!='!') continue;
  179.     if(p3<p1) {
  180.       p3++; p4=find_word_end(p3);
  181.       if(find_word_start(p4)!=p1) continue;
  182.       if(p4-p3!=3 || (strncmp(p3,"set",3)!=0 &&
  183.            strncmp(p3,"let",3)!=0 &&
  184.            strncmp(p3,"def",3)!=0)) {
  185.         if(p4-p3!=6 || strncmp(p3,"define",6)!=0) continue;
  186.       }
  187.     }
  188.     p2++;p3=strchr(p2,'\n'); if(p3==NULL) p3=p2+strlen(p2);
  189.     p2=find_word_start(p2);
  190.     if(p2>p3) goto nothing;
  191.     /*if(p3-p2>=MAX_LINELEN) user_error("cmd_output_too_long");*/
  192.     memmove(value,p2,p3-p2); value[p3-p2]=0;
  193.     strip_trailing_spaces(value); return;
  194.   }
  195. nothing:
  196.   value[0]=0;
  197. }
  198.  
  /*  Get variable definition from a file.
   *  The whole file fname is read into memory and searched with _getdef().
   *  Result stored in buffer value of length MAX_LINELEN; value is left
   *  empty when the file cannot be opened or read, or is empty.
   */
  void getdef(char *fname, char *name, char value[])
  {
    FILE *f;
    char *buf;
    long size;
    size_t got;

    value[0]=0;
    f=fopen(fname,"r"); if(f==NULL) return;
    /* a failed ftell() returns -1, which must never reach fread() as a size */
    if(fseek(f,0,SEEK_END)!=0 || (size=ftell(f))<=0 ||
       fseek(f,0,SEEK_SET)!=0) {
      fclose(f); return;
    }
    buf=xmalloc(size+256);
    got=fread(buf,1,size,f);
    fclose(f);
    if(got>0) {
      buf[got]=0;
      _getdef(buf,name,value);
    }
    free(buf);   /* released on every path, including a failed read */
  }
  217.  
  /* Snapshots of the dictionaries loaded by modules(): the text buffer, the
   * destination of the copied entry table, and the entry count, for the main
   * (m), group (g) and domain (d) dictionaries.
   */
  char *mdicbuf;
  char *gdicbuf;
  char *ddicbuf;
  char *gentry;
  char *mentry;
  char *dentry;

  int gentrycount;
  int mentrycount;
  int dentrycount;
  221.  
  222. /*  Preparation of data */
  223. void prep(void)
  224. {
  225.   char buf[MAX_LINELEN+1];
  226.   char *p1,*p2,*s,*old;
  227.   int i,l,thislang,t;
  228.   FILE *f;
  229.  
  230.   s=getenv("modind_outdir"); if(s!=NULL && *s!=0) outdir=s;
  231.   s=getenv("modind_sheetdir"); if(s!=NULL && *s!=0) sheetdir=s;
  232.   s=getenv("modind_sheetoutdir"); if(s!=NULL && *s!=0) sheetoutdir=s;
  233.   snprintf(buf,sizeof(buf),"%s/addr",outdir);
  234.   addrf=fopen(buf,"w");
  235.   if(!addrf) { fprintf(stderr,"modind: error creating output files addr.\n"); exit(1);}
  236.   snprintf(buf,sizeof(buf),"%s/serial",outdir);
  237.   serialf=fopen(buf,"w");
  238.   if(!serialf) { fprintf(stderr,"modind: error creating output files serial.\n"); exit(1);}
  239.   modcnt=langcnt=0;
  240. /* take the langs declared in conffile */
  241.   getdef(conffile,"site_languages",buf);
  242.   for(p1=buf;*p1;p1++) if(!isalnum(*p1)) *p1=' ';
  243.   for(p1=find_word_start(buf); *p1 && langcnt<MAX_LANGS; p1=find_word_start(p2)) {
  244.     p2=find_word_end(p1);
  245.     if(p2!=p1+2 || !isalpha(*p1) || !isalpha(*(p1+1))) continue;
  246.     memmove(lang[langcnt],p1,2); lang[langcnt++][2]=0;
  247.   }
  248.   if(langcnt==0) {/*  default languages */
  249.     langcnt=DEFAULT_LANGCNT;
  250.   }
  251.   s=getenv("mlist"); if(s==NULL) exit(1);
  252.   l=strlen(s); if(l<0 || l>100*MAX_LINELEN) exit(1);
  253.   mlist=xmalloc(l+16); ovlstrcpy(mlist,s); old="";
  254.   for(i=0;i<langcnt;i++) {
  255.     snprintf(buf,sizeof(buf),"%s/%s.%s",dicdir,ignoredic,lang[i]);
  256.     f=fopen(buf,"r"); if(f==NULL) continue;
  257.     l=fread(ignore[i],1,MAX_LINELEN,f);fclose(f);
  258.     if(l<0 || l>=MAX_LINELEN) l=0;
  259.     ignore[i][l]=0;
  260.   }
  261.   for(t=0, p1=find_word_start(mlist); *p1 && modcnt<MAX_MODULES;
  262.         p1=find_word_start(p2), t++) {
  263.     p2=find_word_end(p1);
  264.     l=p2-p1; if(*p2) *p2++=0;
  265.     fprintf(addrf,"%d:%s\n",t,p1);
  266.     fprintf(serialf,"%s:%d\n",p1,t);
  267.     thislang=-1;
  268. /* language is taken from the address */
  269.     if(l>3 && p1[l-3]=='.') {
  270.       for(i=0;i<langcnt;i++) if(strcasecmp(lang[i],p1+l-2)==0) break;
  271.       if(i<langcnt) {p1[l-3]=0; thislang=i;}
  272.       else {/*  unknown language, not referenced */
  273.         continue;
  274.       }
  275.     }
  276.     if(modcnt>0 && strcmp(old,p1)==0 && thislang>=0) {
  277.       if(mod[modcnt-1].langcnt<langcnt) {
  278.         mod[modcnt-1].langs[mod[modcnt-1].langcnt]=thislang;
  279.         mod[modcnt-1].counts[mod[modcnt-1].langcnt]=t;
  280.         (mod[modcnt-1].langcnt)++;
  281.       }
  282.     }
  283.     else {
  284.       mod[modcnt].name=old=p1;
  285.       if(thislang>=0) {
  286.         mod[modcnt].langs[0]=thislang;
  287.         mod[modcnt].langcnt=1;
  288.       }
  289.       else mod[modcnt].langcnt=0;
  290.       mod[modcnt].counts[0]=t;
  291.       modcnt++;
  292.     }
  293.   }
  294.   snprintf(buf,sizeof(buf),"%s/language",outdir);
  295.   langf=fopen(buf,"w");
  296.   snprintf(buf,sizeof(buf),"%s/title",outdir);
  297.   titf=fopen(buf,"w");
  298.   snprintf(buf,sizeof(buf),"%s/description",outdir);
  299.   descf=fopen(buf,"w");
  300.   snprintf(buf,sizeof(buf),"%s/author",outdir);
  301.   authorf=fopen(buf,"w");
  302.   snprintf(buf,sizeof(buf),"%s/version",outdir);
  303.   versionf=fopen(buf,"w");
  304.   snprintf(buf,sizeof(buf),"%s/%s/robot.phtml",outdir,mlistbase);
  305.   robotf=fopen(buf,"w");
  306.   fclose(addrf); fclose(serialf);
  307.   if(!robotf || !versionf || !authorf || !descf || !titf || !langf) {
  308.     fprintf(stderr,"modind: error creating output files.\n");
  309.     exit(1);
  310.   }
  311. }
  312.  
  313. void sprep(void)
  314. {
  315.   char *p1,*p2,*s;
  316.   int i,l,thislang;
  317.  
  318.   modcnt=0;
  319.   s=getenv("slist"); if(s==NULL) return;
  320.   l=strlen(s); if(l<0 || l>100*MAX_LINELEN) return;
  321.   mlist=xmalloc(l+16); ovlstrcpy(mlist,s);
  322.   for(p1=find_word_start(mlist); *p1 && modcnt<MAX_MODULES; p1=find_word_start(p2)) {
  323.     p2=find_word_end(p1);
  324.     l=p2-p1; if(*p2) *p2++=0;
  325.     for(i=0;i<langcnt;i++) if(strncasecmp(lang[i],p1,2)==0) break;
  326.     if(i<langcnt) thislang=i; else continue;
  327.     mod[modcnt].name=p1;
  328.     mod[modcnt].langs[0]=thislang;
  329.     mod[modcnt].langcnt=1;
  330.     modcnt++;
  331.   }
  332. }
  333.  
  334. void gprep(void)
  335. {
  336.   char *p1,*p2,*s;
  337.   int l,i;
  338.   modcnt=0;
  339.   s=getenv("glist"); if(s==NULL) return;
  340.   l=strlen(s); if(l<0 || l>100*MAX_LINELEN) return;
  341.   mlist=xmalloc(l+16); ovlstrcpy(mlist,s);
  342.   for(p1=find_word_start(mlist); *p1 && modcnt<MAX_MODULES; p1=find_word_start(p2)) {
  343.     p2=find_word_end(p1);
  344.     if(*p2) *p2++=0;
  345.     s=strchr(p1,'/');
  346.     if(s != NULL) s=strchr(s+1,'/');
  347.     if(s==NULL) {
  348.       fprintf(stderr,"modind: no language %s\n",p1); exit(1);
  349.     }
  350.     s++;
  351.     for(i=0;i<langcnt;i++) if(strncasecmp(lang[i],s,2)==0) break;
  352.     if(i==langcnt) continue;
  353.     mod[modcnt].name=p1;
  354.     mod[modcnt].langs[0]=i;
  355.     mod[modcnt].langcnt=1;
  356.     modcnt++;
  357.   }
  358. }
  359.  
  360. char *sheetindex[]={
  361.   "title", "description",
  362.   "duration", "severity",
  363.   "level", "domain",
  364.   "keywords", "reserved1", "reserved2", "information"
  365. };
  366. /* correspond to the order of sheetindex */
  367. char *glindex[]={
  368.   "gl_title", "gl_description",
  369.   "", "",
  370.   "gl_level", "gl_domain",
  371.   "gl_keywords","","",""};
  372.  
  373. #define SHEETINDEX_NO (sizeof(sheetindex)/sizeof(sheetindex[0]))
  374. char gsindbuf[SHEETINDEX_NO][MAX_LINELEN+1];
  375.  
  376. /* do not modify the order, correspond to the order in the sheet file */
  377. enum{s_title, s_description,
  378.       s_duration, s_severity,
  379.       s_level, s_domain,
  380.       s_keywords, s_reserved1, s_reserved2,
  381.       s_information
  382. };
  383.  
  384. char *modindex[]={
  385.   "title", "description",
  386.   "author", "address", "copyright",
  387.   "version", "wims_version", "language",
  388.   "category", "level", "domain", "keywords",
  389.   "keywords_ca", "keywords_en", "keywords_fr", "keywords_it", "keywords_nl",
  390.   "title_ca", "title_en", "title_fr", "title_it", "title_nl",
  391.   "require"
  392. };
  393. #define MODINDEX_NO (sizeof(modindex)/sizeof(modindex[0]))
  394. char indbuf[MODINDEX_NO][MAX_LINELEN+1];
  395. enum{i_title, i_description,
  396.   i_author,i_address,i_copyright,
  397.   i_version,i_wims_version,i_language,
  398.   i_category,i_level,i_domain,i_keywords,
  399.   i_keywords_ca,i_keywords_en,i_keywords_fr,i_keywords_it,i_keywords_nl,
  400.   i_title_ca,i_title_en,i_title_fr,i_title_it,i_title_nl,
  401.   i_require
  402. };
  403.  
  404. char *module_special_file[]={
  405.   "intro","help","about"
  406. };
  407. #define MODSPEC_NO (sizeof(module_special_file)/sizeof(module_special_file[0]))
  408. char module_language[4];
  409.  
  410. /*  read and treat module's INDEX file */
  411. int module_index(const char *name)
  412. {
  413.   char *p, fbuf[MAX_LINELEN+1], ibuf[MAX_LINELEN+1];
  414.   FILE *indf;
  415.   int i,l;
  416.  
  417.   snprintf(fbuf,sizeof(fbuf),"%s/%s/INDEX",moduledir,name);
  418.   indf=fopen(fbuf,"r");
  419.   if(indf==NULL) {
  420.     fprintf(stderr,"modind: INDEX of %s not found\n",fbuf); return -1;
  421.   }
  422.   l=fread(ibuf,1,MAX_LINELEN,indf); fclose(indf);
  423.   if(l>0 && l<MAX_LINELEN) ibuf[l]=0; else return -1;
  424. /* treate all fields in *modindex */
  425.   for(i=0;i<MODINDEX_NO;i++) {
  426.     _getdef(ibuf,modindex[i],indbuf[i]);
  427. /*  compatibility precaution */
  428.     if(indbuf[i][0]==':') indbuf[i][0]='.';
  429.   }
  430.   p=find_word_start(indbuf[i_language]);
  431.   if(isalpha(*p) && isalpha(*(p+1))) {
  432.     memmove(module_language,p,2); module_language[2]=0;
  433.   }
  434.   else ovlstrcpy(module_language,"en");
  435.   return 0;
  436. }
  437.  
  438. int sheet_index(int serial)
  439. {
  440.   char *p1, *p2, fbuf[MAX_LINELEN+1], ibuf[MAX_LINELEN+1];
  441.   FILE *indf;
  442.   int i,l;
  443.  
  444.   snprintf(fbuf,sizeof(fbuf),"%s/%s.def",sheetdir,mod[serial].name);
  445.   indf=fopen(fbuf,"r"); if(indf==NULL) return -1;
  446.   l=fread(ibuf,1,MAX_LINELEN,indf); fclose(indf);
  447.   if(l>0 && l<MAX_LINELEN) ibuf[l]=0; else return -1;
  448.   for(i=0;i<SHEETINDEX_NO;i++) gsindbuf[i][0]=0;
  449.   for(i=0,p1=find_word_start(ibuf);
  450.       i<SHEETINDEX_NO-1 && *p1!=':' && *p1!=0;
  451.       i++,p1=p2) {
  452.     p2=strchr(p1,'\n');
  453.     if(p2!=NULL) *p2++=0; else p2=p1+strlen(p1);
  454.     p1=find_word_start(p1); strip_trailing_spaces2(p1);
  455.     snprintf(gsindbuf[i],MAX_LINELEN,"%s",p1);
  456.   }
  457.   p2=strstr(p1,"\n:"); if(p2==NULL) p2=p1+strlen(p1);
  458.   else *p2=0;
  459.   p1=find_word_start(p1); strip_trailing_spaces2(p1);
  460.   for(p2=p1;*p2;p2++) if(*p2=='\n') *p2=' ';
  461.   ovlstrcpy(gsindbuf[s_information],p1);
  462.   return 0;
  463. }
  464.  
  465. int glossary_index(int serial)
  466. {
  467.   char fbuf[MAX_LINELEN+1], ibuf[MAX_LINELEN+1];
  468.   FILE *indf;
  469.   int i,l;
  470.   snprintf(fbuf,sizeof(fbuf),"%s/%s",glossarydir,mod[serial].name);
  471.   indf=fopen(fbuf,"r");
  472.   l=fread(ibuf,1,MAX_LINELEN,indf); fclose(indf);
  473.   if(l>0 && l<MAX_LINELEN) ibuf[l]=0; else return -1;
  474.   for(i=0;i<SHEETINDEX_NO;i++) {
  475.     _getdef(ibuf,glindex[i],gsindbuf[i]);
  476.   }
  477.   return 0;
  478. }
  479.  
  480. unsigned char categories[16];
  481. char taken[MAX_LINELEN+1];
  482. int catcnt, takenlen, tweight;
  483.  
  484. /* file management for appenditem */
  485. #define MAX_FILES (MAX_LANGS*catno)
  486.  
  487. char *fnames[MAX_FILES];
  488. FILE *files[MAX_FILES];
  489. int open_files;
  490.  
  491. FILE * file_from_list(char *name){
  492.   int i, l = 0, r = open_files;
  493.   while (r>l){
  494.     int m = (l+r)/2;
  495.     int cmp = strcmp(name,fnames[m]);
  496.     if (!cmp) return files[m];
  497.     if (cmp < 0) r = m; else l = m+1;
  498.   }
  499.   for (i=open_files; i > l; i--) {files[i]=files[i-1]; fnames[i]=fnames[i-1];}
  500.   fnames[l] = xmalloc(MAX_FNAME);
  501.   ovlstrcpy(fnames[l],name);
  502.   open_files++;
  503.   return files[l]=fopen(name,"a");
  504. }
  505.  
  506. void appenditem(char *word, int lind, int serial, int weight, char *l)
  507. {
  508.   char nbuf[MAX_LINELEN+1], buf[MAX_LINELEN+1];
  509.   int i, ll;
  510.   char *p;
  511.   FILE *f;
  512.  
  513.   if(!isalnum(*word) || (ll=strlen(word))<2 ||
  514.      wordchr2(taken,word)!=NULL ||
  515.      wordchr2(ignore[lind],word)!=NULL ||
  516.      takenlen>=MAX_LINELEN-ll-16)
  517.     return;
  518.   if(ll==2 && (!isdigit(word[0]) || !isalpha(word[1]))) return;
  519.   for(p=word;*p;p++) if(!isalnum(*p) && *p!=' ') return;
  520.   taken[takenlen++]=' '; taken[takenlen++]=' ';
  521.   ovlstrcpy(taken+takenlen,word);
  522.   takenlen+=ll; tweight+=weight;
  523.   snprintf(buf,sizeof(buf),"%s:%d?%d\n",word,serial,weight);
  524.   for(i=0;i<catcnt;i++) {
  525.     snprintf(nbuf,sizeof(nbuf),"%s/%c.%s",
  526.        outdir,categories[i],lang[lind]);
  527.     f = file_from_list(nbuf);
  528.     if(f!=NULL) {fputs(buf,f);}
  529.   }
  530. }
  531.  
  /*  Split buf at commas and pass every non-empty item (leading spaces
   *  skipped) to appenditem().  buf is modified: the commas are overwritten
   *  with NUL.  l is forwarded to appenditem() instead of being replaced by
   *  the global module_language.
   */
  void appenditem1 (char *buf, int lind, int serial, int weight, char *l )
  {
    char *p1, *p2;

    for(p1=find_word_start(buf); *p1; p1=find_word_start(p2)) {
      p2=strchr(p1,',');
      if(p2!=NULL) *p2++=0; else p2=p1+strlen(p1);
      if(*p1==0) continue;
      appenditem(p1,lind,serial,weight,l);
    }
  }
  /*  Split buf into space-separated words and pass each of them to
   *  appenditem().  buf is modified: the separator after each word is
   *  overwritten with NUL.  l is forwarded to appenditem() instead of being
   *  replaced by the global module_language.
   */
  void appenditem2 (char *buf, int lind, int serial, int weight, char *l )
  {
    char *p1, *p2;

    for(p1=find_word_start(buf); *p1; p1=find_word_start(p2)) {
      p2=find_word_end(p1);
      if(*p2) *p2++=0;
      appenditem(p1,lind,serial,weight,l);
    }
  }
  552. void onemodule(const char *name, int serial, int lind)
  553. {
  554.   int i;
  555.   unsigned char trlist[]={
  556.   i_title,i_description,i_category,i_domain,i_keywords,
  557.   i_require,i_author,
  558.   i_keywords_ca,i_keywords_en,i_keywords_fr,i_keywords_it,i_keywords_nl,
  559.   i_title_ca,i_title_en,i_title_fr,i_title_it,i_title_nl
  560.   };
  561.   int trcnt=sizeof(trlist)/sizeof(trlist[0]);
  562.   char *p1, *p2, *pp, *q, buf[MAX_LINELEN+1], lbuf[16];
  563.   FILE *f;
  564.  
  565.   if(module_index(name)) return;
  566.   towords(indbuf[i_category]);
  567. /*   list the categories (among A=all,X=eXercise,O,D,...) corresponding
  568.  *   to this module
  569.  */
  570.   for(i=catcnt=0;i<catno && catcnt<16;i++) {
  571.     if(wordchr2(indbuf[i_category],cat[i].name)!=NULL)
  572.       categories[catcnt++]=cat[i].typ;
  573.   }
  574.   if(catcnt==0) return;
  575.   if(categories[0]!=cat[0].typ)
  576.     categories[catcnt++]=cat[0].typ;
  577. /*  write module's name in the category.language files, for instance lists/X.fr
  578.  * for french exercises
  579.  */
  580.   for(i=0;i<catcnt;i++) {
  581.     snprintf(buf,sizeof(buf),"%s/%s/%c.%s",
  582.        outdir,mlistbase,categories[i],lang[lind]);
  583.     f=fopen(buf,"a");
  584.     if(f!=NULL) {fprintf(f,"%s\n",name); fclose(f);}
  585.   }
  586. /*   add serial number and language (resp.title, ...) to corresponding file  */
  587.   fprintf(langf,"%d:%s\n",serial,module_language);
  588.   fprintf(titf,"%d:%s\n",serial,indbuf[i_title]);
  589.   fprintf(descf,"%d:%s\n",serial,indbuf[i_description]);
  590.   fprintf(authorf,"%d:%s\n",serial,indbuf[i_author]);
  591.   fprintf(versionf,"%d:%s\n",serial,indbuf[i_version]);
  592.  
  593. /*   add module's information in html page for robots  */
  594.   snprintf(buf,sizeof(buf),"%s",indbuf[i_description]);
  595.   for(pp=strchr(buf,','); pp; pp=strchr(pp,','))
  596.     string_modify3(buf,pp,pp+1,"&#44;");
  597.   if(strcmp(module_language,lang[lind])==0)
  598.     fprintf(robotf,"%s ,%s,%s,%s,%s\n",name,module_language,name,
  599.         indbuf[i_title], buf);
  600.  
  601. /*   Normalize the information of trlist, using dictionary
  602.  *  -- bases/sys/domain.xx without suffix translation (--> english version)
  603.  */
  604.   entrycount=dentrycount; dicbuf=ddicbuf;
  605.   memmove(entry,dentry,dentrycount*sizeof(entry[0]));
  606.   unknown_type=unk_leave;
  607.   for(i=0;i<trcnt;i++) {
  608.     detag(indbuf[trlist[i]]);
  609.     deaccent2(indbuf[trlist[i]]);
  610.     comma(indbuf[trlist[i]]);
  611.     singlespace2(indbuf[trlist[i]]);
  612.     translate(indbuf[trlist[i]]);
  613.   }
  614. /*   Normalize the information, using dictionary
  615.  *   bases/sys/words.xx with suffix translation
  616.  */
  617.   entrycount=mentrycount; dicbuf=mdicbuf;
  618.   memmove(entry,mentry,mentrycount*sizeof(entry[0]));
  619.   unknown_type=unk_leave;/*  used in translator_.c */
  620.   for(i=0;i<trcnt;i++) {
  621.   suffix_translate(indbuf[trlist[i]]);
  622.   translate(indbuf[trlist[i]]);
  623.   }
  624.  
  625. /* taken contains all words already seen in the module index */
  626.   taken[0]=0; takenlen=tweight=0;
  627. /*  append words of title  */
  628.   ovlstrcpy(buf,indbuf[i_title]); towords(buf);
  629.   appenditem2(buf,lind,serial,4,module_language);
  630.  
  631. /*  extract words of every other information except level */
  632.   snprintf(buf,sizeof(buf),"%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s",
  633.     indbuf[i_description],indbuf[i_keywords],
  634.     indbuf[i_keywords_ca],indbuf[i_keywords_en],indbuf[i_keywords_fr],
  635.     indbuf[i_keywords_it],indbuf[i_keywords_nl],
  636.     indbuf[i_title_ca],indbuf[i_title_en],indbuf[i_title_fr],
  637.     indbuf[i_title_it],indbuf[i_title_nl],
  638.     indbuf[i_domain],indbuf[i_require],indbuf[i_author]);
  639.   towords(buf);
  640.   appenditem2(buf,lind,serial,2,module_language);
  641.  
  642. /*   this time the dictionary is the group dictionary  sys/wgrp/wgrp
  643.  *   with a g (groupdic), not an m (maindic) . see below main, suffix, group.
  644.  *   and delete unknown ?? and translate
  645.  */
  646.   entrycount=gentrycount; dicbuf=gdicbuf;
  647.   memmove(entry,gentry,gentrycount*sizeof(entry[0]));
  648.  
  649. /* append words of every title information  */
  650.   ovlstrcpy(buf,indbuf[i_title]);
  651.   unknown_type=unk_delete;
  652.   translate(buf);
  653.   appenditem1(buf,lind,serial,2,module_language);
  654.  
  655. /* append words of information of description except level  */
  656.   snprintf(buf,sizeof(buf),"%s", indbuf[i_description]);
  657.   unknown_type=unk_delete;
  658.   translate(buf);
  659.   appenditem1(buf,lind,serial,4,module_language);
  660.  
  661. /* append words (or group of words) of keywords and domain  */
  662.   snprintf(buf,sizeof(buf),"%s, %s, %s, %s, %s, %s, %s",
  663.     indbuf[i_domain],indbuf[i_keywords],
  664.     indbuf[i_keywords_ca], indbuf[i_keywords_en],indbuf[i_keywords_fr],
  665.     indbuf[i_keywords_it], indbuf[i_keywords_nl]);
  666.   unknown_type=unk_leave;
  667.   translate(buf);
  668.   appenditem1(buf,lind,serial,2,module_language);
  669.  
  670. /* append level information, with weight 2 */
  671.   snprintf(buf,sizeof(buf),"%s",indbuf[i_level]);
  672.   ovlstrcpy(lbuf,"level");
  673.   for(p1=buf; *p1; p1++) if(!isalnum(*p1)) *p1=' ';
  674.   q=buf+strlen(buf);
  675.   for(p1=find_word_start(buf); (*p1) && (p1 < q) ; p1=find_word_start(p2)) {
  676.     p2=find_word_end(p1);
  677.     if(p2!=NULL) *p2++=0; else p2=p1+strlen(p1);
  678.     if(strncmp(p1, "Lang" , p2-p1) &&
  679.      (!isalpha(*p1) ||
  680.      (!isdigit(*(p1+1)) && *(p1+1)!=0) ||
  681.      (*(p1+1)!=0 && *(p1+2)!=0)))
  682.        continue;
  683.     *p1=tolower(*p1);
  684.     ovlstrcpy(lbuf+strlen("level"),p1);
  685.     appenditem(lbuf,lind,serial,2,module_language);
  686.   }
  687. /*   append total weight of module to weight file site2/weight.xx  */
  688.   fprintf(weightf,"%d:%d\n",serial,tweight);
  689. }
  690.  
  691. void modules(void)
  692. {
  693.   int i,j,k,d;
  694.   char namebuf[MAX_LINELEN+1];
  695.   char mdic[MAX_LINELEN+1], sdic[MAX_LINELEN+1], gdic[MAX_LINELEN+1], ddic[MAX_LINELEN+1];
  696.  
  697.   for(j=0;j<langcnt;j++) {
  698.     snprintf(namebuf,sizeof(namebuf),"%s/weight.%s",outdir,lang[j]);
  699.     weightf=fopen(namebuf,"w");
  700.     snprintf(mdic,sizeof(mdic),"%s/%s.%s",dicdir,maindic,lang[j]);
  701.     snprintf(sdic,sizeof(sdic),"%s/%s.%s",dicdir,suffixdic,lang[j]);
  702.     snprintf(gdic,sizeof(gdic),"%s/%s.%s",dicdir,groupdic,lang[j]);
  703.     snprintf(ddic,sizeof(ddic),"%s/%s.%s",dicdir,domaindic,lang[j]);
  704.     suffix_dic(sdic); prepare_dic(gdic);
  705.     gdicbuf=dicbuf; gentrycount=entrycount;
  706.     memmove(gentry,entry,gentrycount*sizeof(entry[0]));
  707.     prepare_dic(mdic);
  708.     mdicbuf=dicbuf; mentrycount=entrycount;
  709.     memmove(mentry,entry,mentrycount*sizeof(entry[0]));
  710.     prepare_dic(ddic);
  711.     ddicbuf=dicbuf; dentrycount=entrycount;
  712.     memmove(dentry,entry,dentrycount*sizeof(entry[0]));
  713.     unknown_type=unk_leave; translate(ignore[j]);
  714.     for(i=0;i<modcnt;i++) {
  715.       if(mod[i].langcnt>0) {
  716.       /* look for another language */
  717.         for(d=k=0;k<mod[i].langcnt;k++)
  718.           if(mod[i].langs[k]<mod[i].langs[d]) d=k;
  719.         for(k=0;k<mod[i].langcnt && mod[i].langs[k]!=j;k++);
  720.         if(k>=mod[i].langcnt) k=d;
  721.         snprintf(namebuf,MAX_LINELEN,"%s.%s",mod[i].name,
  722.            lang[mod[i].langs[k]]);
  723.         onemodule(namebuf,mod[i].counts[k],j);
  724.       }
  725.       else {
  726.         onemodule(mod[i].name,mod[i].counts[0],j);
  727.       }
  728.     }
  729.     if(mentrycount>0) free(mdicbuf);
  730.     if(gentrycount>0) free(gdicbuf);
  731.     if(suffixcnt>0) free(sufbuf);
  732.     if(dentrycount>0) free(ddicbuf);
  733.     if(weightf) fclose(weightf);
  734.   }
  735. }
  736. void clean(void)
  737. {
  738.   int i;
  739.   for (i = 0; i < open_files; i++) fclose(files[i]);
  740.   fclose(langf); fclose(titf); fclose(descf); fclose(robotf);
  741.   fclose(authorf); fclose(versionf);
  742. }
  743.  
  744. /* FIXME ? differences with appenditem - use fprintf instead of  snprintf */
  745. void sappenditem(char *word, int lind, int serial, int weight)
  746. {
  747.   int ll;
  748.   char *p;
  749.  
  750.   if(!isalnum(*word) || (ll=strlen(word))<2 ||
  751.      wordchr2(taken,word)!=NULL ||
  752.      wordchr2(ignore[lind],word)!=NULL ||
  753.      takenlen>=MAX_LINELEN-ll-16)
  754.     return;
  755.   if(ll==2 && (!isdigit(word[0]) || !isalpha(word[1]))) return;
  756.   for(p=word;*p;p++) if(!isalnum(*p) && *p!=' ') return;
  757.   taken[takenlen++]=' ';taken[takenlen++]=' ';
  758.   ovlstrcpy(taken+takenlen,word);
  759.   takenlen+=ll; tweight+=weight;
  760.   fprintf(indf,"%s:%d?%d\n",word,serial,weight);
  761. }
  762. /* onesg / onemodule are similar */
  763. void onesg(int serial, int lind, int index(int))
  764. {
  765.   int i;
  766.   unsigned char trlist[]={
  767.     s_title,s_description,s_domain,s_keywords,s_information
  768.   };
  769.   int trcnt=sizeof(trlist)/sizeof(trlist[0]);
  770.   char *p1, *p2, *q, buf[MAX_LINELEN+1], lbuf[16];
  771.  
  772.   if(index(serial)) return;
  773.   fprintf(listf,"%s\n",mod[serial].name);
  774.   fprintf(titf,"%d:%s\n",serial,gsindbuf[s_title]);
  775.   fprintf(descf,"%d:%s\n",serial,gsindbuf[s_description]);
  776.   fprintf(remf,"%d:%s\n",serial,gsindbuf[s_information]);
  777.   fprintf(titlef,"%s:%s\n",mod[serial].name,gsindbuf[s_title]);
  778.  
  779. /*   Normalize the information of trlist, using dictionary
  780.  *  -- bases/sys/domain.xx without suffix translation (--> english version)
  781.  */
  782.   entrycount=dentrycount; dicbuf=ddicbuf;
  783.   memmove(entry,dentry,dentrycount*sizeof(entry[0]));
  784.   unknown_type=unk_leave;
  785.   for(i=0;i<trcnt;i++) {
  786.     detag(gsindbuf[trlist[i]]);
  787.     deaccent2(gsindbuf[trlist[i]]);
  788.     comma(gsindbuf[trlist[i]]);
  789.     singlespace2(gsindbuf[trlist[i]]);
  790.     translate(gsindbuf[trlist[i]]);
  791.   }
  792. /*   Normalize the information, using dictionary
  793.  *   bases/sys/words.xx with suffix translation
  794.  */
  795.   entrycount=mentrycount; dicbuf=mdicbuf;
  796.   memmove(entry,mentry,mentrycount*sizeof(entry[0]));
  797.   unknown_type=unk_leave;/*  used in translator_.c */
  798.   for(i=0;i<trcnt;i++) {
  799.     suffix_translate(gsindbuf[trlist[i]]);
  800.     translate(gsindbuf[trlist[i]]);
  801.   }
  802.  
  803. /* taken contains all words already seen in the module index */
  804.   taken[0]=0; takenlen=tweight=0;
  805. /*  append words of title  */
  806.   ovlstrcpy(buf,gsindbuf[s_title]); towords(buf);
  807.   for(p1=find_word_start(buf);*p1;
  808.       p1=find_word_start(p2)) {
  809.     p2=find_word_end(p1); if(*p2) *p2++=0;
  810.     sappenditem(p1,lind,serial,4);
  811.   }
  812.  
  813. /*  extract words of every other information except level */
  814.   snprintf(buf,sizeof(buf),"%s %s %s %s",
  815.          gsindbuf[s_description],gsindbuf[s_keywords],
  816.          gsindbuf[s_domain],gsindbuf[s_information]);
  817.   towords(buf);
  818.   for(p1=find_word_start(buf);*p1;p1=find_word_start(p2)) {
  819.     p2=find_word_end(p1); if(*p2) *p2++=0;
  820.     sappenditem(p1,lind,serial,2);
  821.   }
  822. /*   this time the dictionary is the group dictionary  sys/wgrp/wgrp
  823.  *   with a g (groupdic), not an m (maindic) . see below main, suffix, group.
  824.  *   and delete unknown ?? and translate
  825.  */
  826.   entrycount=gentrycount; dicbuf=gdicbuf;
  827.   memmove(entry,gentry,gentrycount*sizeof(entry[0]));
  828.  
  829. /*  append words of every title information  */
  830.   ovlstrcpy(buf,gsindbuf[s_title]);
  831.   unknown_type=unk_delete;
  832.   translate(buf);
  833.   for(p1=find_word_start(buf); *p1; p1=find_word_start(p2)) {
  834.     p2=strchr(p1,',');
  835.     if(p2!=NULL) *p2++=0; else p2=p1+strlen(p1);
  836.     if(strlen(p1)<=0) continue;
  837.     sappenditem(p1,lind,serial,4);
  838.   }
  839.  
  840. /*  append words (or group of words) of keywords and domain  */
  841.   snprintf(buf,sizeof(buf),"%s, %s",
  842.        gsindbuf[s_keywords],
  843.        gsindbuf[s_domain]);
  844.   unknown_type=unk_leave;
  845.   translate(buf);
  846.   for(p1=find_word_start(buf); *p1; p1=find_word_start(p2)) {
  847.     p2=strchr(p1,',');
  848.     if(p2!=NULL) *p2++=0; else p2=p1+strlen(p1);
  849.     if(strlen(p1)<=0) continue;
  850.     sappenditem(p1,lind,serial,2);
  851.   }
  852.  
  853. /*   append level information, with weight 2 */
  854.   snprintf(buf,sizeof(buf),"%s",gsindbuf[s_level]);
  855.   ovlstrcpy(lbuf,"level");
  856.   for(p1=buf; *p1; p1++) if(!isalnum(*p1)) *p1=' ';
  857.   q=buf+strlen(buf);
  858.   for(p1=find_word_start(buf); (*p1) && (p1 < q) ;
  859.   p1=find_word_start(p2)) {
  860.     p2=find_word_end(p1);
  861.     if(p2!=NULL) *p2++=0; else p2=p1+strlen(p1);
  862.     if(strncmp(p1, "Lang" , p2-p1) &&
  863.         (!isalpha(*p1) || (!isdigit(*(p1+1))) ||
  864.         (*(p1+1)!=0 && *(p1+2)!=0)))
  865.       continue;
  866.     *p1=tolower(*p1);
  867.     ovlstrcpy(lbuf+strlen("level"),p1);
  868.     sappenditem(lbuf,lind,serial,2);
  869.   }
  870. /*   append total weight of module to weight file site2/weight.xx  */
  871.   fprintf(weightf,"%d:%d\n",serial,tweight);
  872. }
  873.  
  874. void sgs(char *outdir, int index(int))
  875. {
  876.   int i,j;
  877.   char mdic[MAX_LINELEN+1], sdic[MAX_LINELEN+1], gdic[MAX_LINELEN+1], ddic[MAX_LINELEN+1];
  878.   char buf[MAX_LINELEN+1];
  879.  
  880.   for(j=0;j<langcnt;j++) {
  881.     snprintf(buf,sizeof(buf),"%s/title.%s",outdir,lang[j]);
  882.     titf=fopen(buf,"w");
  883.     snprintf(buf,sizeof(buf),"%s/description.%s",outdir,lang[j]);
  884.     descf=fopen(buf,"w");
  885.     snprintf(buf,sizeof(buf),"%s/%s",outdir,lang[j]);
  886.     indf=fopen(buf,"w");
  887.     snprintf(buf,sizeof(buf),"%s/list.%s",outdir,lang[j]);
  888.     listf=fopen(buf,"w");
  889.     snprintf(buf,sizeof(buf),"%s/weight.%s",outdir,lang[j]);
  890.     weightf=fopen(buf,"w");
  891.     snprintf(buf,sizeof(buf),"%s/addr.%s",outdir,lang[j]);
  892.     addrf=fopen(buf,"w");
  893.     snprintf(buf,sizeof(buf),"%s/information.%s",outdir,lang[j]);
  894.     remf=fopen(buf,"w");
  895.     snprintf(buf,sizeof(buf),"%s/serial.%s",outdir,lang[j]);
  896.     serialf=fopen(buf,"w");
  897.     snprintf(buf,sizeof(buf),"%s/tit.%s",outdir,lang[j]);
  898.     titlef=fopen(buf,"w");
  899.     if(!titlef || !serialf || !remf || !addrf || !weightf || !listf
  900.       || !indf || !descf || !titf ) {
  901.     fprintf(stderr,"modind: error creating output files for %s.\n",outdir); exit(1);
  902.     }
  903.     snprintf(mdic,sizeof(mdic),"%s/%s.%s",dicdir,maindic,lang[j]);
  904.     snprintf(sdic,sizeof(sdic),"%s/%s.%s",dicdir,suffixdic,lang[j]);
  905.     snprintf(gdic,sizeof(gdic),"%s/%s.%s",dicdir,groupdic,lang[j]);
  906.     snprintf(ddic,sizeof(ddic),"%s/%s.%s",dicdir,domaindic,lang[j]);
  907.     suffix_dic(sdic); prepare_dic(gdic);
  908.     gdicbuf=dicbuf; gentrycount=entrycount;
  909.     memmove(gentry,entry,gentrycount*sizeof(entry[0]));
  910.     prepare_dic(mdic);
  911.     mdicbuf=dicbuf; mentrycount=entrycount;
  912.     memmove(mentry,entry,mentrycount*sizeof(entry[0]));
  913.     prepare_dic(ddic);
  914.     ddicbuf=dicbuf; dentrycount=entrycount;
  915.     memmove(dentry,entry,dentrycount*sizeof(entry[0]));
  916.     unknown_type=unk_leave; translate(ignore[j]);
  917.     for(i=0;i<modcnt;i++) {
  918.       if(mod[i].langs[0]!=j) continue;
  919.       fprintf(addrf,"%d:%s\n",i,mod[i].name);
  920.       fprintf(serialf,"%s:%d\n",mod[i].name,i);
  921.       onesg(i,j,index);
  922.     }
  923.     if(mentrycount>0) free(mdicbuf);
  924.     if(gentrycount>0) free(gdicbuf);
  925.     if(suffixcnt>0) free(sufbuf);
  926.     if(dentrycount>0) free(ddicbuf);
  927.     fclose(titf); fclose(descf); fclose(indf); fclose(listf);
  928.     fclose(weightf); fclose(addrf); fclose(serialf);
  929.   }
  930. }
  931.  
  932. int main()
  933. {
  934.   gentry=xmalloc(entry_size);
  935.   dentry=xmalloc(entry_size);
  936.   mentry=xmalloc(entry_size);
  937.   prep();
  938.   if(modcnt>0) modules();
  939.   clean();
  940.   sprep();
  941.   if(modcnt>0) sgs(sheetoutdir,sheet_index);
  942.   gprep();
  943.   if(modcnt>0) sgs(glossaryoutdir,glossary_index);
  944.   return 0;
  945. }
  946.