Subversion Repositories wimsdev

Rev

Rev 3808 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed

  1. /*    Copyright (C) 1998-2003 XIAO, Gang of Universite de Nice - Sophia Antipolis
  2.  *
  3.  *  This program is free software; you can redistribute it and/or modify
  4.  *  it under the terms of the GNU General Public License as published by
  5.  *  the Free Software Foundation; either version 2 of the License, or
  6.  *  (at your option) any later version.
  7.  *
  8.  *  This program is distributed in the hope that it will be useful,
  9.  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  10.  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11.  *  GNU General Public License for more details.
  12.  *
  13.  *  You should have received a copy of the GNU General Public License
  14.  *  along with this program; if not, write to the Free Software
  15.  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  16.  */
  17.  
  18. #define suflim  256
  19. #define sufbuflim 102400
  20.  
  21. int suffixcnt;
  22. struct {
  23.     unsigned char *original;
  24.     int olen;
  25.     unsigned char *replace;
  26. }
  27. suf[suflim];
  28. char *sufbuf;
  29. int sufwordlen, sufminlen;
  30.  
  31.         /* Suffix translation, to be used within translator. */
  32.  
  33. int sufcomp(int t, const unsigned char *s2)
  34. {
  35.     int k;
  36.    
  37.     for(k=0;k<suf[t].olen && k<sufwordlen
  38.         && suf[t].original[k]==s2[sufwordlen-k-1];k++);
  39.     if(k>=suf[t].olen) {
  40.         if(sufwordlen>k) return -1; else return 0;
  41.     }
  42.     else return suf[t].original[k]-s2[sufwordlen-k-1];
  43. }
  44.  
  45.         /* searches a list. Returns index if found, -1 if nomatch.
  46.          * This routine is faster than naive one by one comparisons,
  47.          * and is especially suited for large lists. */
  48. int suffix_list(void *list, int items, size_t item_size, const char *str)
  49. {
  50.     int i1,i2,j,k,t,v;
  51.     char c,d;
  52.    
  53.     if(items<=0) return -1;
  54.     k=sufcomp(0,str);
  55.     if(k==0) return 0; if(k>0) return -1;
  56.     j=items-1; k=sufcomp(j,str);
  57.     if(k==0) return j;
  58.     if(k>0) for(i1=0,i2=j;i2>i1+1;) {
  59.         j=i1+(i2-i1)/2; k=sufcomp(j,str);
  60.         if(k==0) return j;
  61.         if(k>0) {i2=j; continue;}
  62.         if(k<0) {i1=j; continue;}      
  63.     }
  64.     if(k>0 && j>0) j--;
  65.     backcheck:
  66.     v=j;for(t=0;t<suf[j].olen && t<sufwordlen
  67.         && suf[j].original[t]==str[sufwordlen-t-1];t++);
  68.     if(t<sufminlen) return -1; if(t>=suf[j].olen) return j;
  69.     for(j--,c=str[sufwordlen-1],d=str[sufwordlen-t];
  70.         j>=0 && suf[j].original[0]==c && suf[j].olen>t
  71.         && suf[j].original[t-1]==d;j--);
  72.     if(j>=0 && suf[j].original[0]==c &&
  73.        strncmp(suf[j].original,suf[v].original,suf[j].olen)==0)
  74.       return j;
  75.     else goto backcheck;
  76. }
  77.  
  78.         /* Prepare dictionary.  */
  79. void suffix_dic(char *sdicname)
  80. {
  81.     int i,k,l;
  82.     FILE *suff;
  83.     char *p1, *p2, *pp;
  84.     long int flen;
  85.  
  86.     suffixcnt=0; sufminlen=100000;
  87.     suff=fopen(sdicname,"r"); if(suff==NULL) return;
  88.     fseek(suff,0,SEEK_END);flen=ftell(suff); fseek(suff,0,SEEK_SET);
  89.     if(flen>sufbuflim) return;
  90.     sufbuf=xmalloc(flen+16);flen=fread(sufbuf,1,flen,suff);
  91.     fclose(suff);
  92.     if(flen>0 && flen<sufbuflim) sufbuf[flen]=0;
  93.     else return;
  94.     for(i=0,p1=sufbuf;p1!=NULL && *p1!=0 && i<suflim;p1=p2) {
  95.         p2=strchr(p1+1,'\n'); if(p2>p1) *p2++=0;
  96.         pp=strchr(p1,':'); if(pp==NULL) continue;
  97.         *pp++=0;
  98.         strip_trailing_spaces(p1); strip_trailing_spaces(pp);
  99.         p1=find_word_start(p1); pp=find_word_start(pp);
  100.         if(*p1==0) continue;
  101.         if(i>0) {
  102.             k=strcmp(suf[i-1].original,p1);
  103.             if(k>0) {
  104.                 pp=strrchr(sdicname,'/'); if(pp==NULL) pp=sdicname; else pp++;
  105.                 error("unsorted_dictionary %s: %s > %s.\n",
  106.                       pp,suf[i-1].original,p1);
  107.             }
  108.             if(k==0) {
  109.                 pp=strrchr(sdicname,'/'); if(pp==NULL) pp=sdicname; else pp++;
  110.                 error("duplication_in_dictionary %s: %s.\n",pp,p1);
  111.             }
  112.         }
  113.         suf[i].original=p1; suf[i].olen=l=strlen(p1);
  114.         if(l<sufminlen) sufminlen=l;
  115.         suf[i].replace=pp; i++;
  116.     }
  117.     suffixcnt=i;
  118. }
  119.  
  120.         /* Suffix translation. */
  121. void suffix_translate(char *p)
  122. {
  123.     char *p1, *p2;
  124.     int t;
  125.  
  126.     for(p1=find_word_start(p);
  127.         p1!=NULL && p1-p<MAX_LINELEN && *p1!=0;
  128.         p1=p2) {
  129.         if(!isalpha(*p1)) {p2=p1+1; continue;}
  130.         for(p2=p1;isalpha(*p2);p2++);
  131.         if(*p2!=0 && strchr(" ,.?!'\"\n`:;()[]{}<>",*p2)==NULL) continue;
  132.         sufwordlen=p2-p1;
  133.         t=suffix_list(suf,suffixcnt,sizeof(suf[0]),p1);
  134.         if(t<0) continue;
  135.         string_modify(p,p2-suf[t].olen,p2,suf[t].replace);
  136.         p2=p2-suf[t].olen+strlen(suf[t].replace);
  137.     }
  138.     p[MAX_LINELEN]=0;
  139. }
  140.  
  141. void suffix(char *p, char *sdicname)
  142. {
  143.     suffix_dic(sdicname); if(suffixcnt>0) suffix_translate(p);
  144. }
  145.  
  146.