Subversion Repositories wimsdev

Rev

Rev 11700 | Rev 15375 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed

  1. /*    Copyright (C) 1998-2003 XIAO, Gang of Universite de Nice - Sophia Antipolis
  2.  *
  3.  *  This program is free software; you can redistribute it and/or modify
  4.  *  it under the terms of the GNU General Public License as published by
  5.  *  the Free Software Foundation; either version 2 of the License, or
  6.  *  (at your option) any later version.
  7.  *
  8.  *  This program is distributed in the hope that it will be useful,
  9.  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  10.  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11.  *  GNU General Public License for more details.
  12.  *
  13.  *  You should have received a copy of the GNU General Public License
  14.  *  along with this program; if not, write to the Free Software
  15.  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  16.  */
  17.  
  18. /*  This is an internal program,
  19.  * used to index modules for search engine.
  20.  */
  21.  
  22. #include "../Lib/libwims.h"
  23. #include "translator_.h"
  24. #include "suffix.h"
  25.  
  26. #define MAX_LANGS    MAX_LANGUAGES
  27. #define MAX_MODULES    65536
  28. char *moduledir=    "public_html/modules";
  29. char *sheetdir=     "public_html/bases/sheet";
  30. char *dicdir=       "public_html/bases";
  31. char *outdir=       "public_html/bases/site2";
  32. char *sheetoutdir=  "public_html/bases/sheet/index";
  33. char *maindic=      "sys/words";
  34. char *groupdic=     "sys/wgrp/wgrp";
  35. char *suffixdic=    "sys/suffix";
  36. char *domaindic=    "sys/domaindic";
  37. char *ignoredic=    "sys/indignore";
  38. char *conffile=     "log/wims.conf";
  39. char *mlistbase=    "lists";
  40.  
  41. char lang[MAX_LANGS][4]={
  42.     "en","fr","cn","es","it","nl","si","ca","pt"
  43. };
  44. #define DEFAULT_LANGCNT    6
  45. char allang[MAX_LANGS][4]={
  46.     "en","fr","cn","es","it","nl","de","si","ca","pt"
  47. };
  48. #define allangcnt 8
  49. char ignore[MAX_LANGS][MAX_LINELEN+1];
  50. char mlistfile[MAX_LANGS][256];
  51. int langcnt;
  52. FILE *langf, *titf, *descf, *weightf, *robotf, *indf, *listf, *addrf, *serialf, *authorf, *versionf, *remf, *titlef;
  53.  
  54. struct cat {
  55.     char *name;
  56.     char typ;
  57. } cat[]={
  58.     {"all_types", 'A'},
  59.     {"exercise",  'X'},
  60.     {"oef",       'O'},
  61.     {"tool",      'T'},
  62.     {"recreation",'R'},
  63.     {"reference", 'Y'},
  64.     {"document",  'D'},
  65.     {"popup",     'P'},
  66.     {"datamodule",'M'}
  67. };
  68. #define catno (sizeof(cat)/sizeof(cat[0]))
  69.  
  70. struct mod {
  71.     char *name;
  72.     unsigned char langs[MAX_LANGS];
  73.     int counts[MAX_LANGS];
  74.     int  langcnt;
  75. } mod[MAX_MODULES];
  76. int modcnt;
  77.  
  78. char *mlist;
  79.  
  80. /*  fold known accented letters to unaccented, other strange characters to space
  81.  *  apostrophe is among the exceptions to be kept (important for multi-word expressions)
  82.  */
  83. void deaccent2(char *p)
  84. {
  85.   char *sp;
  86.   char *v;
  87.   for(sp=p;*sp;sp++) {
  88.   if(*sp<0 && (v=strchr(acctab,*sp))!=NULL)
  89.     *sp=*(deatab+(v-acctab));
  90.   if(!isalnum(*sp) && strchr(",.&$+*",*sp)==0) *sp=' ';
  91.   else *sp=tolower(*sp);
  92.   }
  93. }
  94.  
  95. /*  translate everything non-alphanumeric into space */
  96. void towords(char *p)
  97. {
  98.   char *pp;
  99.   for(pp=p;*pp;pp++) if(!isalnum(*pp) && strchr("&$+*",*pp)==0) *pp=' ';
  100. }
  101.  
  102. /*  Find first occurrence of word */
  103. char *wordchr2(char *p, char *w)
  104. {
  105.   char *r;
  106.  
  107.   for(r=strstr(p,w);r!=NULL &&
  108.     ( (r>p && !isspace(*(r-1))) || (!isspace(*(r+strlen(w))) && *(r+strlen(w))!=0) );
  109.   r=strstr(r+1,w));
  110.   return r;
  111. }
  112.  
  113. char *find_tag_end(char *p)
  114. {
  115.   char *pp;
  116.   pp=p; if(*pp=='<') pp++;
  117.   for(; *pp && *pp!='>'; pp++) {
  118.     if(*pp=='<') {
  119.       pp=find_tag_end(pp)-1; continue;
  120.     }
  121.     if(*pp=='"') {
  122.       pp=strchr(pp+1,'"');
  123.       if(pp==NULL) return p+strlen(p); else continue;
  124.     }
  125.     if(*pp=='\'') {
  126.       pp=strchr(pp+1,'\'');
  127.       if(pp==NULL) return p+strlen(p); else continue;
  128.     }
  129.   }
  130.   if(*pp=='>') pp++;
  131.   return pp;
  132. }
  133.  
  134. char *find_tag(char *p, char *tag)
  135. {
  136.   char *pp;
  137.   int len;
  138.   len=strlen(tag);
  139.   for(pp=strchr(p,'<'); pp!=NULL && *pp; pp=strchr(pp+1,'<')) {
  140.     if(strncasecmp(pp+1,tag,len)==0 && !isalnum(*(pp+1+len))) return pp;
  141.   }
  142.   return p+strlen(p);
  143. }
  144.  
  145. /*  remove all html tags */
  146. void detag(char *p)
  147. {
  148.   char *pp, *p2;
  149.   for(pp=strchr(p,'<'); pp!=NULL; pp=strchr(pp,'<')) {
  150.     p2=find_tag_end(pp);
  151.     if(*p2==0) {*pp=0; return; }
  152.     ovlstrcpy(pp,p2);
  153.   }
  154. }
  155.  
  156. /* add a space after comma to see end of words */
  157.  
  158. void comma(char *p)
  159. {
  160.   char *pp;
  161.   for(pp=strchr(p,','); pp; pp=strchr(pp+1,','))
  162.     string_modify3(p,pp,pp+1,", ");
  163. }
  164.  
  165. void _getdef(char buf[], char *name, char value[])
  166. {
  167.   char *p1, *p2, *p3;
  168.  
  169.   value[0]=0;
  170.   for(p1=strstr(buf,name); p1!=NULL; p1=strstr(p1+1,name)) {
  171.     p2=find_word_start(p1+strlen(name));
  172.     if((p1>buf && !isspace(*(p1-1))) || *p2!='=') continue;
  173.     p3=p1; while(p3>buf && isspace(*(p3-1)) && *(p3-1)!='\n') p3--;
  174.     if(p3>buf && *(p3-1)!='\n') continue;
  175.     p3=strchr(p2,'\n');
  176.     p2=find_word_start(p2+1);
  177.     if(p3 <= p2) continue;
  178.     snprintf(value,MAX_LINELEN,"%s",p2);
  179.     if(p3!=NULL && p3-p2<MAX_LINELEN) value[p3-p2]=0;
  180.     strip_trailing_spaces2(value);
  181.     break;
  182.   }
  183. }
  184.  
  185. /*  Get variable definition from a file.
  186.  * Result stored in buffer value of length MAX_LINELEN.
  187.  */
  188. void getdef(char *fname, char *name, char value[])
  189. {
  190.   FILE *f;
  191.   char *buf;
  192.   int l;
  193.  
  194.   value[0]=0;
  195.   f=fopen(fname,"r"); if(f==NULL) return;
  196.   fseek(f,0,SEEK_END); l=ftell(f); fseek(f,0,SEEK_SET);
  197.   buf=xmalloc(l+256); l=fread(buf,1,l,f);
  198.   fclose(f);
  199.   if(l<=0) return; else buf[l]=0;
  200.   _getdef(buf,name,value);
  201.   free(buf);
  202. }
  203.  
  204. char *mdicbuf, *gdicbuf, *ddicbuf, *gentry, *mentry, *dentry;
  205.  
  206. int gentrycount, mentrycount, dentrycount;
  207.  
  208. /*  Preparation of data */
  209. void prep(void)
  210. {
  211.   char buf[MAX_LINELEN+1];
  212.   char *p1,*p2,*s,*old;
  213.   int i,l,thislang,t;
  214.   FILE *f;
  215.  
  216.   s=getenv("modind_outdir"); if(s!=NULL && *s!=0) outdir=s;
  217.   s=getenv("modind_sheetdir"); if(s!=NULL && *s!=0) sheetdir=s;
  218.   s=getenv("modind_sheetoutdir"); if(s!=NULL && *s!=0) sheetoutdir=s;
  219.   snprintf(buf,sizeof(buf),"%s/addr",outdir);
  220.   addrf=fopen(buf,"w");
  221.   if(!addrf) { fprintf(stderr,"modind: error creating output files addr.\n"); exit(1);}
  222.   snprintf(buf,sizeof(buf),"%s/serial",outdir);
  223.   serialf=fopen(buf,"w");
  224.   if(!serialf) { fprintf(stderr,"modind: error creating output files serial.\n"); exit(1);}
  225.   modcnt=langcnt=0;
  226. /* take the langs declared in conffile */
  227.   getdef(conffile,"site_languages",buf);
  228.   for(p1=buf;*p1;p1++) if(!isalnum(*p1)) *p1=' ';
  229.   for(p1=find_word_start(buf); *p1 && langcnt<MAX_LANGS; p1=find_word_start(p2)) {
  230.     p2=find_word_end(p1);
  231.     if(p2!=p1+2 || !isalpha(*p1) || !isalpha(*(p1+1))) continue;
  232.     memmove(lang[langcnt],p1,2); lang[langcnt++][2]=0;
  233.   }
  234.   if(langcnt==0) {/*  default languages */
  235.     langcnt=DEFAULT_LANGCNT;
  236.   }
  237.   s=getenv("mlist"); if(s==NULL) exit(1);
  238.   l=strlen(s); if(l<0 || l>100*MAX_LINELEN) exit(1);
  239.   mlist=xmalloc(l+16); ovlstrcpy(mlist,s); old="";
  240.   for(i=0;i<langcnt;i++) {
  241.     snprintf(buf,sizeof(buf),"%s/%s.%s",dicdir,ignoredic,lang[i]);
  242.     f=fopen(buf,"r"); if(f==NULL) continue;
  243.     l=fread(ignore[i],1,MAX_LINELEN,f);fclose(f);
  244.     if(l<0 || l>=MAX_LINELEN) l=0;
  245.     ignore[i][l]=0;
  246.   }
  247.   for(t=0, p1=find_word_start(mlist); *p1 && modcnt<MAX_MODULES;
  248.         p1=find_word_start(p2), t++) {
  249.     p2=find_word_end(p1);
  250.     l=p2-p1; if(*p2) *p2++=0;
  251.     fprintf(addrf,"%d:%s\n",t,p1);
  252.     fprintf(serialf,"%s:%d\n",p1,t);
  253.     thislang=-1;
  254. /* language is taken from the address */
  255.     if(l>3 && p1[l-3]=='.') {
  256.       for(i=0;i<langcnt;i++) if(strcasecmp(lang[i],p1+l-2)==0) break;
  257.       if(i<langcnt) {p1[l-3]=0; thislang=i;}
  258.       else {/*  unknown language, not referenced */
  259.         continue;
  260.       }
  261.     }
  262.     if(modcnt>0 && strcmp(old,p1)==0 && thislang>=0) {
  263.       if(mod[modcnt-1].langcnt<langcnt) {
  264.         mod[modcnt-1].langs[mod[modcnt-1].langcnt]=thislang;
  265.         mod[modcnt-1].counts[mod[modcnt-1].langcnt]=t;
  266.         (mod[modcnt-1].langcnt)++;
  267.       }
  268.     }
  269.     else {
  270.       mod[modcnt].name=old=p1;
  271.       if(thislang>=0) {
  272.         mod[modcnt].langs[0]=thislang;
  273.         mod[modcnt].langcnt=1;
  274.       }
  275.       else mod[modcnt].langcnt=0;
  276.       mod[modcnt].counts[0]=t;
  277.       modcnt++;
  278.     }
  279.   }
  280.   snprintf(buf,sizeof(buf),"%s/language",outdir);
  281.   langf=fopen(buf,"w");
  282.   snprintf(buf,sizeof(buf),"%s/title",outdir);
  283.   titf=fopen(buf,"w");
  284.   snprintf(buf,sizeof(buf),"%s/description",outdir);
  285.   descf=fopen(buf,"w");
  286.   snprintf(buf,sizeof(buf),"%s/author",outdir);
  287.   authorf=fopen(buf,"w");
  288.   snprintf(buf,sizeof(buf),"%s/version",outdir);
  289.   versionf=fopen(buf,"w");
  290.   snprintf(buf,sizeof(buf),"%s/%s/robot.phtml",outdir,mlistbase);
  291.   robotf=fopen(buf,"w");
  292.   fclose(addrf); fclose(serialf);
  293.   if(!robotf || !versionf || !authorf || !descf || !titf || !langf) {
  294.     fprintf(stderr,"modind: error creating output files.\n");
  295.     exit(1);
  296.   }
  297. }
  298.  
  299. void sprep(void)
  300. {
  301.   char *p1,*p2,*s;
  302.   int i,l,thislang;
  303.  
  304.   modcnt=0;
  305.   s=getenv("slist"); if(s==NULL) return;
  306.   l=strlen(s); if(l<0 || l>100*MAX_LINELEN) return;
  307.   mlist=xmalloc(l+16); ovlstrcpy(mlist,s);
  308.   for(p1=find_word_start(mlist); *p1 && modcnt<MAX_MODULES; p1=find_word_start(p2)) {
  309.     p2=find_word_end(p1);
  310.     l=p2-p1; if(*p2) *p2++=0;
  311.     for(i=0;i<langcnt;i++) if(strncasecmp(lang[i],p1,2)==0) break;
  312.     if(i<langcnt) thislang=i; else continue;
  313.     mod[modcnt].name=p1;
  314.     mod[modcnt].langs[0]=thislang;
  315.     mod[modcnt].langcnt=1;
  316.     modcnt++;
  317.   }
  318. }
  319.  
  320. void clean(void)
  321. {
  322.   fclose(langf); fclose(titf); fclose(descf); fclose(robotf);
  323.   fclose(authorf); fclose(versionf);
  324. }
  325.  
  326. char *sheetindex[]={
  327.   "title", "description",
  328.   "duration", "severity",
  329.   "level", "domain",
  330.   "keywords", "reserved1", "reserved2", "information"
  331. };
  332. #define SHEETINDEX_NO (sizeof(sheetindex)/sizeof(sheetindex[0]))
  333. char sindbuf[SHEETINDEX_NO][MAX_LINELEN+1];
  334. enum{s_title, s_description,
  335.       s_duration, s_severity,
  336.       s_level, s_domain,
  337.       s_keywords, s_reserved1, s_reserved2,
  338.       s_information
  339. };
  340.  
  341. char *modindex[]={
  342.   "title", "description",
  343.   "author", "address", "copyright",
  344.   "version", "wims_version", "language",
  345.   "category", "level", "domain", "keywords",
  346.   "keywords_ca", "keywords_en", "keywords_fr", "keywords_it", "keywords_nl",
  347.   "title_ca", "title_en", "title_fr", "title_it", "title_nl",
  348.   "require"
  349. };
  350. #define MODINDEX_NO (sizeof(modindex)/sizeof(modindex[0]))
  351. char indbuf[MODINDEX_NO][MAX_LINELEN+1];
  352. enum{i_title, i_description,
  353.   i_author,i_address,i_copyright,
  354.   i_version,i_wims_version,i_language,
  355.   i_category,i_level,i_domain,i_keywords,
  356.   i_keywords_ca,i_keywords_en,i_keywords_fr,i_keywords_it,i_keywords_nl,
  357.   i_title_ca,i_title_en,i_title_fr,i_title_it,i_title_nl,
  358.   i_require
  359. };
  360.  
  361. char *module_special_file[]={
  362.   "intro","help","about"
  363. };
  364. #define MODSPEC_NO (sizeof(module_special_file)/sizeof(module_special_file[0]))
  365. char module_language[4];
  366.  
  367. /*  read and treat module's INDEX file */
  368. int module_index(const char *name)
  369. {
  370.   char *p, fbuf[MAX_LINELEN+1], ibuf[MAX_LINELEN+1];
  371.   FILE *indf;
  372.   int i,l;
  373.  
  374.   snprintf(fbuf,sizeof(fbuf),"%s/%s/INDEX",moduledir,name);
  375.   indf=fopen(fbuf,"r");
  376.   if(indf==NULL) {
  377.     fprintf(stderr,"modind: INDEX of %s not found\n",fbuf); return -1;
  378.   }
  379.   l=fread(ibuf,1,MAX_LINELEN,indf); fclose(indf);
  380.   if(l>0 && l<MAX_LINELEN) ibuf[l]=0; else return -1;
  381. /* treate all fields in *modindex */
  382.   for(i=0;i<MODINDEX_NO;i++) {
  383.     _getdef(ibuf,modindex[i],indbuf[i]);
  384. /*  compatibility precaution */
  385.     if(indbuf[i][0]==':') indbuf[i][0]='.';
  386.   }
  387.   p=find_word_start(indbuf[i_language]);
  388.   if(isalpha(*p) && isalpha(*(p+1))) {
  389.     memmove(module_language,p,2); module_language[2]=0;
  390.   }
  391.   else ovlstrcpy(module_language,"en");
  392.   return 0;
  393. }
  394.  
  395. int sheet_index(int serial)
  396. {
  397.   char *p1, *p2, fbuf[MAX_LINELEN+1], ibuf[MAX_LINELEN+1];
  398.   FILE *indf;
  399.   int i,l;
  400.  
  401.   snprintf(fbuf,sizeof(fbuf),"%s/%s.def",sheetdir,mod[serial].name);
  402.   indf=fopen(fbuf,"r"); if(indf==NULL) return -1;
  403.   l=fread(ibuf,1,MAX_LINELEN,indf); fclose(indf);
  404.   if(l>0 && l<MAX_LINELEN) ibuf[l]=0; else return -1;
  405.   for(i=0;i<SHEETINDEX_NO;i++) sindbuf[i][0]=0;
  406.   for(i=0,p1=find_word_start(ibuf);
  407.       i<SHEETINDEX_NO-1 && *p1!=':' && *p1!=0;
  408.       i++,p1=p2) {
  409.     p2=strchr(p1,'\n');
  410.     if(p2!=NULL) *p2++=0; else p2=p1+strlen(p1);
  411.     p1=find_word_start(p1); strip_trailing_spaces2(p1);
  412.     snprintf(sindbuf[i],MAX_LINELEN,"%s",p1);
  413.   }
  414.   p2=strstr(p1,"\n:"); if(p2==NULL) p2=p1+strlen(p1);
  415.   else *p2=0;
  416.   p1=find_word_start(p1); strip_trailing_spaces2(p1);
  417.   for(p2=p1;*p2;p2++) if(*p2=='\n') *p2=' ';
  418.   ovlstrcpy(sindbuf[s_information],p1);
  419.   return 0;
  420. }
  421.  
  422. unsigned char categories[16];
  423. char taken[MAX_LINELEN+1];
  424. int catcnt, takenlen, tweight;
  425.  
  426. void appenditem(char *word, int lind, int serial, int weight, char *l)
  427. {
  428.   char nbuf[MAX_LINELEN+1], buf[MAX_LINELEN+1];
  429.   int i, ll;
  430.   char *p;
  431.   FILE *f;
  432.  
  433.   if(!isalnum(*word) || (ll=strlen(word))<2 ||
  434.      wordchr2(taken,word)!=NULL ||
  435.      wordchr2(ignore[lind],word)!=NULL ||
  436.      takenlen>=MAX_LINELEN-ll-16)
  437.     return;
  438.   if(ll==2 && (!isdigit(word[0]) || !isalpha(word[1]))) return;
  439.   for(p=word;*p;p++) if(!isalnum(*p) && *p!=' ') return;
  440.   taken[takenlen++]=' '; taken[takenlen++]=' ';
  441.   ovlstrcpy(taken+takenlen,word);
  442.   takenlen+=ll; tweight+=weight;
  443.   snprintf(buf,sizeof(buf),"%s:%d?%d\n",word,serial,weight);
  444.   for(i=0;i<catcnt;i++) {
  445.     snprintf(nbuf,sizeof(nbuf),"%s/%c.%s",
  446.        outdir,categories[i],lang[lind]);
  447.     f=fopen(nbuf,"a");
  448.     if(f!=NULL) {fputs(buf,f); fclose(f);}
  449.   }
  450. }
  451.  
  452. void appenditem1 (char *buf, int lind, int serial, int weight, char *l )
  453. {
  454.   char *p1, *p2 ;
  455.   for(p1=find_word_start(buf); *p1;
  456.     p1=find_word_start(p2)) {
  457.     p2=strchr(p1,',');
  458.     if(p2!=NULL) *p2++=0; else p2=p1+strlen(p1);
  459.     if(strlen(p1)<=0) continue;
  460.     appenditem(p1,lind,serial,weight,module_language);
  461.   }
  462. }
  463. void appenditem2 (char *buf, int lind, int serial, int weight, char *l )
  464. {
  465.   char *p1, *p2 ;
  466.   for(p1=find_word_start(buf);*p1;
  467.       p1=find_word_start(p2)) {
  468.     p2=find_word_end(p1); if(*p2) *p2++=0;
  469.     appenditem(p1,lind,serial,weight,module_language);
  470.   }
  471. }
  472. void onemodule(const char *name, int serial, int lind)
  473. {
  474.   int i;
  475.   unsigned char trlist[]={
  476.   i_title,i_description,i_category,i_domain,i_keywords,
  477.   i_require,i_author,
  478.   i_keywords_ca,i_keywords_en,i_keywords_fr,i_keywords_it,i_keywords_nl,
  479.   i_title_ca,i_title_en,i_title_fr,i_title_it,i_title_nl
  480.   };
  481.   #define trcnt (sizeof(trlist)/sizeof(trlist[0]))
  482.   char *p1, *p2, *pp, *q, buf[MAX_LINELEN+1], lbuf[16];
  483.   FILE *f;
  484.  
  485.   if(module_index(name)) return;
  486.   towords(indbuf[i_category]);
  487. /*   list the categories (among A=all,X=eXercise,O,D,...) corresponding
  488.  *   to this module
  489.  */
  490.   for(i=catcnt=0;i<catno && catcnt<16;i++) {
  491.   if(wordchr2(indbuf[i_category],cat[i].name)!=NULL)
  492.     categories[catcnt++]=cat[i].typ;
  493.   }
  494.   if(catcnt==0) return;
  495.   if(categories[0]!=cat[0].typ)
  496.     categories[catcnt++]=cat[0].typ;
  497. /*  write module's name in the category.language files, for instance lists/X.fr
  498.  * for french exercises
  499.  */
  500.   for(i=0;i<catcnt;i++) {
  501.     snprintf(buf,sizeof(buf),"%s/%s/%c.%s",
  502.        outdir,mlistbase,categories[i],lang[lind]);
  503.     f=fopen(buf,"a");
  504.     if(f!=NULL) {fprintf(f,"%s\n",name); fclose(f);}
  505.   }
  506. /*   add serial number and language (resp.title, ...) to corresponding file  */
  507.   fprintf(langf,"%d:%s\n",serial,module_language);
  508.   fprintf(titf,"%d:%s\n",serial,indbuf[i_title]);
  509.   fprintf(descf,"%d:%s\n",serial,indbuf[i_description]);
  510.   fprintf(authorf,"%d:%s\n",serial,indbuf[i_author]);
  511.   fprintf(versionf,"%d:%s\n",serial,indbuf[i_version]);
  512.  
  513. /*   add module's information in html page for robots  */
  514.   snprintf(buf,sizeof(buf),"%s",indbuf[i_description]);
  515.   for(pp=strchr(buf,','); pp; pp=strchr(pp,','))
  516.     string_modify3(buf,pp,pp+1,"&#44;");
  517.   if(strcmp(module_language,lang[lind])==0)
  518.     fprintf(robotf,"%s ,%s,%s,%s,%s\n",name,module_language,name,
  519.         indbuf[i_title], buf);
  520.  
  521. /*   Normalize the information of trlist, using dictionary
  522.  *  -- bases/sys/domain.xx without suffix translation (--> english version)
  523.  */
  524.     entrycount=dentrycount; dicbuf=ddicbuf;
  525.     memmove(entry,dentry,dentrycount*sizeof(entry[0]));
  526.     unknown_type=unk_leave;
  527.     for(i=0;i<trcnt;i++) {
  528.       detag(indbuf[trlist[i]]);
  529.       deaccent2(indbuf[trlist[i]]);
  530.       comma(indbuf[trlist[i]]);
  531.       singlespace2(indbuf[trlist[i]]);
  532.       translate(indbuf[trlist[i]]);
  533.     }
  534. /*   Normalize the information, using dictionary
  535.  *   bases/sys/words.xx with suffix translation
  536.  */
  537.     entrycount=mentrycount; dicbuf=mdicbuf;
  538.     memmove(entry,mentry,mentrycount*sizeof(entry[0]));
  539.     unknown_type=unk_leave;/*  used in translator_.c */
  540.     for(i=0;i<trcnt;i++) {
  541.     suffix_translate(indbuf[trlist[i]]);
  542.     translate(indbuf[trlist[i]]);
  543.     }
  544.  
  545. /* taken contains all words already seen in the module index */
  546.     taken[0]=0; takenlen=tweight=0;
  547. /*  append words of title  */
  548.     ovlstrcpy(buf,indbuf[i_title]); towords(buf);
  549.     appenditem2(buf,lind,serial,4,module_language);
  550.  
  551. /*  extract words of every other information except level */
  552.     snprintf(buf,sizeof(buf),"%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s",
  553.          indbuf[i_description],indbuf[i_keywords],
  554.          indbuf[i_keywords_ca],indbuf[i_keywords_en],indbuf[i_keywords_fr],
  555.          indbuf[i_keywords_it],indbuf[i_keywords_nl],
  556.          indbuf[i_title_ca],indbuf[i_title_en],indbuf[i_title_fr],
  557.          indbuf[i_title_it],indbuf[i_title_nl],
  558.          indbuf[i_domain],indbuf[i_require],indbuf[i_author]);
  559.     towords(buf);
  560.     appenditem2(buf,lind,serial,2,module_language);
  561.  
  562. /*   this time the dictionary is the group dictionary  sys/wgrp/wgrp
  563.  *   with a g (groupdic), not an m (maindic) . see below main, suffix, group.
  564.  *   and delete unknown ?? and translate
  565.  */
  566.   entrycount=gentrycount; dicbuf=gdicbuf;
  567.   memmove(entry,gentry,gentrycount*sizeof(entry[0]));
  568.  
  569. /*  append words of every title information  */
  570.   ovlstrcpy(buf,indbuf[i_title]);
  571.   unknown_type=unk_delete;
  572.   translate(buf);
  573.   appenditem1(buf,lind,serial,2,module_language);
  574.  
  575. /*  append words of information of description except level  */
  576.   snprintf(buf,sizeof(buf),"%s", indbuf[i_description]);
  577.   unknown_type=unk_delete;
  578.   translate(buf);
  579.   appenditem1(buf,lind,serial,4,module_language);
  580.  
  581. /*  append words (or group of words) of keywords and domain  */
  582.   snprintf(buf,sizeof(buf),"%s, %s, %s, %s, %s, %s, %s",
  583.          indbuf[i_domain],indbuf[i_keywords],
  584.          indbuf[i_keywords_ca], indbuf[i_keywords_en],indbuf[i_keywords_fr],
  585.          indbuf[i_keywords_it], indbuf[i_keywords_nl]);
  586.   unknown_type=unk_leave;
  587.   translate(buf);
  588.   appenditem1(buf,lind,serial,2,module_language);
  589.  
  590. /*   append level information, with weight 2 */
  591.   snprintf(buf,sizeof(buf),"%s",indbuf[i_level]);
  592.   ovlstrcpy(lbuf,"level");
  593.   for(p1=buf; *p1; p1++) if(!isalnum(*p1)) *p1=' ';
  594.   q=buf+strlen(buf);
  595.   for(p1=find_word_start(buf); (*p1) && (p1 < q) ;
  596.   p1=find_word_start(p2)) {
  597.     p2=find_word_end(p1);
  598.     if(p2!=NULL) *p2++=0; else p2=p1+strlen(p1);
  599.     if(strncmp(p1, "Lang" , p2-p1) &&
  600.      (!isalpha(*p1) ||
  601.      (!isdigit(*(p1+1)) && *(p1+1)!=0) ||
  602.      (*(p1+1)!=0 && *(p1+2)!=0)))
  603.        continue;
  604.     *p1=tolower(*p1);
  605.     ovlstrcpy(lbuf+strlen("level"),p1);
  606.     appenditem(lbuf,lind,serial,2,module_language);
  607.   }
  608. /*   append total weight of module to weight file site2/weight.xx  */
  609.   fprintf(weightf,"%d:%d\n",serial,tweight);
  610. }
  611.  
  612. void modules(void)
  613. {
  614.   int i,j,k,d;
  615.   char namebuf[MAX_LINELEN+1];
  616.   char mdic[MAX_LINELEN+1], sdic[MAX_LINELEN+1], gdic[MAX_LINELEN+1], ddic[MAX_LINELEN+1];
  617.  
  618.   for(j=0;j<langcnt;j++) {
  619.     snprintf(namebuf,sizeof(namebuf),"%s/weight.%s",outdir,lang[j]);
  620.     weightf=fopen(namebuf,"w");
  621.     snprintf(mdic,sizeof(mdic),"%s/%s.%s",dicdir,maindic,lang[j]);
  622.     snprintf(sdic,sizeof(sdic),"%s/%s.%s",dicdir,suffixdic,lang[j]);
  623.     snprintf(gdic,sizeof(gdic),"%s/%s.%s",dicdir,groupdic,lang[j]);
  624.     snprintf(ddic,sizeof(ddic),"%s/%s.%s",dicdir,domaindic,lang[j]);
  625.     suffix_dic(sdic); prepare_dic(gdic);
  626.     gdicbuf=dicbuf; gentrycount=entrycount;
  627.     memmove(gentry,entry,gentrycount*sizeof(entry[0]));
  628.     prepare_dic(mdic);
  629.     mdicbuf=dicbuf; mentrycount=entrycount;
  630.     memmove(mentry,entry,mentrycount*sizeof(entry[0]));
  631.     prepare_dic(ddic);
  632.     ddicbuf=dicbuf; dentrycount=entrycount;
  633.     memmove(dentry,entry,dentrycount*sizeof(entry[0]));
  634.     unknown_type=unk_leave; translate(ignore[j]);
  635.     for(i=0;i<modcnt;i++) {
  636.       if(mod[i].langcnt>0) {
  637.         for(d=k=0;k<mod[i].langcnt;k++)
  638.         if(mod[i].langs[k]<mod[i].langs[d]) d=k;
  639.         for(k=0;k<mod[i].langcnt && mod[i].langs[k]!=j;k++);
  640.         if(k>=mod[i].langcnt) k=d;
  641.         snprintf(namebuf,MAX_LINELEN,"%s.%s",mod[i].name,
  642.            lang[mod[i].langs[k]]);
  643.         onemodule(namebuf,mod[i].counts[k],j);
  644.       }
  645.       else {
  646.         onemodule(mod[i].name,mod[i].counts[0],j);
  647.       }
  648.     }
  649.     if(mentrycount>0) free(mdicbuf);
  650.     if(gentrycount>0) free(gdicbuf);
  651.     if(suffixcnt>0) free(sufbuf);
  652.     if(dentrycount>0) free(ddicbuf);
  653.     if(weightf) fclose(weightf);
  654.   }
  655. }
  656.  
  657. /* FIXME ? differences with appenditem - use fprintf instead of  snprintf */
  658. void sappenditem(char *word, int lind, int serial, int weight)
  659. {
  660.   int ll;
  661.   char *p;
  662.  
  663.   if(!isalnum(*word) || (ll=strlen(word))<2 ||
  664.      wordchr2(taken,word)!=NULL ||
  665.      wordchr2(ignore[lind],word)!=NULL ||
  666.      takenlen>=MAX_LINELEN-ll-16)
  667.     return;
  668.   if(ll==2 && (!isdigit(word[0]) || !isalpha(word[1]))) return;
  669.   for(p=word;*p;p++) if(!isalnum(*p) && *p!=' ') return;
  670.   taken[takenlen++]=' ';taken[takenlen++]=' ';
  671.   ovlstrcpy(taken+takenlen,word);
  672.   takenlen+=ll; tweight+=weight;
  673.   fprintf(indf,"%s:%d?%d\n",word,serial,weight);
  674. }
  675.  
  676. void onesheet(int serial, int lind)
  677. {
  678.   int i;
  679.   unsigned char trlist[]={
  680.   s_title,s_description,s_domain,s_keywords,s_information
  681.   };
  682.   #define trcnt (sizeof(trlist)/sizeof(trlist[0]))
  683.   char *p1, *p2, buf[MAX_LINELEN+1];
  684.  
  685.   if(sheet_index(serial)) return;
  686.   fprintf(listf,"%s\n",mod[serial].name+3);
  687.   fprintf(titf,"%d:%s\n",serial,sindbuf[s_title]);
  688.   fprintf(descf,"%d:%s\n",serial,sindbuf[s_description]);
  689.   fprintf(remf,"%d:%s\n",serial,sindbuf[s_information]);
  690.   fprintf(titlef,"%s:%s\n",mod[serial].name,sindbuf[s_title]);
  691.  
  692.   entrycount=dentrycount; dicbuf=ddicbuf;
  693.   memmove(entry,dentry,dentrycount*sizeof(entry[0]));
  694.   unknown_type=unk_leave;
  695.   for(i=0;i<trcnt;i++) {
  696.     detag(sindbuf[trlist[i]]);
  697.     deaccent2(sindbuf[trlist[i]]);
  698.     comma(sindbuf[trlist[i]]);
  699.     singlespace2(sindbuf[trlist[i]]);
  700.     translate(sindbuf[trlist[i]]);
  701.   }
  702.  
  703.   entrycount=mentrycount; dicbuf=mdicbuf;
  704.   memmove(entry,mentry,mentrycount*sizeof(entry[0]));
  705.   unknown_type=unk_leave;
  706.   for(i=0;i<trcnt;i++) {
  707.     suffix_translate(sindbuf[trlist[i]]);
  708.     translate(sindbuf[trlist[i]]);
  709.   }
  710.   taken[0]=0; takenlen=tweight=0;
  711.   ovlstrcpy(buf,sindbuf[s_title]); towords(buf);
  712.   for(p1=find_word_start(buf);*p1;
  713.       p1=find_word_start(p2)) {
  714.     p2=find_word_end(p1); if(*p2) *p2++=0;
  715.     sappenditem(p1,lind,serial,4);
  716.   }
  717.   snprintf(buf,sizeof(buf),"%s %s %s %s",
  718.          sindbuf[s_description],sindbuf[s_keywords],
  719.          sindbuf[s_domain],sindbuf[s_information]);
  720.   towords(buf);
  721.   for(p1=find_word_start(buf);*p1;
  722.   p1=find_word_start(p2)) {
  723.   p2=find_word_end(p1); if(*p2) *p2++=0;
  724.   sappenditem(p1,lind,serial,2);
  725.   }
  726.   entrycount=gentrycount; dicbuf=gdicbuf;
  727.   memmove(entry,gentry,gentrycount*sizeof(entry[0]));
  728.   unknown_type=unk_delete;
  729.   ovlstrcpy(buf,sindbuf[s_title]); translate(buf);
  730.   for(p1=find_word_start(buf); *p1;
  731.   p1=find_word_start(p2)) {
  732.   p2=strchr(p1,',');
  733.   if(p2!=NULL) *p2++=0; else p2=p1+strlen(p1);
  734.   if(strlen(p1)<=0) continue;
  735.   sappenditem(p1,lind,serial,4);
  736.   }
  737.   unknown_type=unk_leave;
  738.   snprintf(buf,sizeof(buf),"%s, %s",
  739.        sindbuf[s_keywords],
  740.        sindbuf[s_domain]);
  741.   translate(buf);
  742.   for(p1=find_word_start(buf); *p1;
  743.   p1=find_word_start(p2)) {
  744.     p2=strchr(p1,',');
  745.     if(p2!=NULL) *p2++=0; else p2=p1+strlen(p1);
  746.     if(strlen(p1)<=0) continue;
  747.     sappenditem(p1,lind,serial,2);
  748.   }
  749.   fprintf(weightf,"%d:%d\n",serial,tweight);
  750. }
  751.  
  752. void sheets(void)
  753. {
  754.   int i,j;
  755.   char mdic[MAX_LINELEN+1], sdic[MAX_LINELEN+1], gdic[MAX_LINELEN+1], ddic[MAX_LINELEN+1];
  756.   char buf[MAX_LINELEN+1];
  757.  
  758.   for(j=0;j<langcnt;j++) {
  759.     snprintf(buf,sizeof(buf),"%s/title.%s",sheetoutdir,lang[j]);
  760.     titf=fopen(buf,"w");
  761.     snprintf(buf,sizeof(buf),"%s/description.%s",sheetoutdir,lang[j]);
  762.     descf=fopen(buf,"w");
  763.     snprintf(buf,sizeof(buf),"%s/%s",sheetoutdir,lang[j]);
  764.     indf=fopen(buf,"w");
  765.     snprintf(buf,sizeof(buf),"%s/list.%s",sheetoutdir,lang[j]);
  766.     listf=fopen(buf,"w");
  767.     snprintf(buf,sizeof(buf),"%s/weight.%s",sheetoutdir,lang[j]);
  768.     weightf=fopen(buf,"w");
  769.     snprintf(buf,sizeof(buf),"%s/addr.%s",sheetoutdir,lang[j]);
  770.     addrf=fopen(buf,"w");
  771.     snprintf(buf,sizeof(buf),"%s/information.%s",sheetoutdir,lang[j]);
  772.     remf=fopen(buf,"w");
  773.     snprintf(buf,sizeof(buf),"%s/serial.%s",sheetoutdir,lang[j]);
  774.     serialf=fopen(buf,"w");
  775.     snprintf(buf,sizeof(buf),"%s/tit.%s",sheetoutdir,lang[j]);
  776.     titlef=fopen(buf,"w");
  777.     if(!titlef || !serialf || !remf || !addrf || !weightf || !listf
  778.       || !indf || !descf || !titf ) {
  779.     fprintf(stderr,"modind: error creating output files for sheet %s.\n",sheetoutdir); exit(1);
  780.     }
  781.     snprintf(mdic,sizeof(mdic),"%s/%s.%s",dicdir,maindic,lang[j]);
  782.     snprintf(sdic,sizeof(sdic),"%s/%s.%s",dicdir,suffixdic,lang[j]);
  783.     snprintf(gdic,sizeof(gdic),"%s/%s.%s",dicdir,groupdic,lang[j]);
  784.     snprintf(ddic,sizeof(ddic),"%s/%s.%s",dicdir,domaindic,lang[j]);
  785.     suffix_dic(sdic); prepare_dic(gdic);
  786.     gdicbuf=dicbuf; gentrycount=entrycount;
  787.     memmove(gentry,entry,gentrycount*sizeof(entry[0]));
  788.     prepare_dic(mdic);
  789.     mdicbuf=dicbuf; mentrycount=entrycount;
  790.     memmove(mentry,entry,mentrycount*sizeof(entry[0]));
  791.     prepare_dic(ddic);
  792.     ddicbuf=dicbuf; dentrycount=entrycount;
  793.     memmove(dentry,entry,dentrycount*sizeof(entry[0]));
  794.     unknown_type=unk_leave; translate(ignore[j]);
  795.     for(i=0;i<modcnt;i++) {
  796.       if(mod[i].langs[0]!=j) continue;
  797.       fprintf(addrf,"%d:%s\n",i,mod[i].name+3);
  798.       fprintf(serialf,"%s:%d\n",mod[i].name+3,i);
  799.       onesheet(i,j);
  800.     }
  801.     if(mentrycount>0) free(mdicbuf);
  802.     if(gentrycount>0) free(gdicbuf);
  803.     if(suffixcnt>0) free(sufbuf);
  804.     if(dentrycount>0) free(ddicbuf);
  805.     fclose(titf); fclose(descf); fclose(indf); fclose(listf);
  806.     fclose(weightf); fclose(addrf); fclose(serialf);
  807.   }
  808. }
  809.  
  810. int main()
  811. {
  812.   gentry=xmalloc(entry_size);
  813.   dentry=xmalloc(entry_size);
  814.   mentry=xmalloc(entry_size);
  815.   prep();
  816.   if(modcnt>0) modules();
  817.   clean();
  818.   sprep();
  819.   if(modcnt>0) sheets();
  820.   return 0;
  821. }
  822.  
  823.