Subversion Repositories wimsdev

Rev

Rev 12248 | Rev 18183 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed

  1. /*    Copyright (C) 1998-2003 XIAO, Gang of Universite de Nice - Sophia Antipolis
  2.  *
  3.  *  This program is free software; you can redistribute it and/or modify
  4.  *  it under the terms of the GNU General Public License as published by
  5.  *  the Free Software Foundation; either version 2 of the License, or
  6.  *  (at your option) any later version.
  7.  *
  8.  *  This program is distributed in the hope that it will be useful,
  9.  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  10.  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11.  *  GNU General Public License for more details.
  12.  *
  13.  *  You should have received a copy of the GNU General Public License
  14.  *  along with this program; if not, write to the Free Software
  15.  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  16.  */
  17.  
  18. #include "../Lib/libwims.h"
  19. #include "suffix.h"
  20.  
  21. #define suflim    256
  22. #define sufbuflim 102400
  23.  
  24. int suffixcnt;
  25. struct {
  26.   unsigned char *original;
  27.   int olen;
  28.   unsigned char *replace;
  29. }
  30. suf[suflim];
  31. char *sufbuf;
  32. int sufwordlen, sufminlen;
  33.  
  34. /* Suffix translation, to be used within translator. */
  35.  
  36. int sufcomp(int t, const unsigned char *s2)
  37. {
  38.   int k;
  39.  
  40.   for(k=0;k<suf[t].olen && k<sufwordlen
  41.     && suf[t].original[k]==s2[sufwordlen-k-1];k++);
  42.   if(k>=suf[t].olen) {
  43.     if(sufwordlen>k) return -1; else return 0;
  44.   }
  45.   else return suf[t].original[k]-s2[sufwordlen-k-1];
  46. }
  47.  
  48. /* searches a list. Returns index if found, -1 if nomatch.
  49.  * This routine is faster than naive one by one comparisons,
  50.  * and is especially suited for large lists.
  51.  */
  52. int suffix_list(void *list, int items, size_t item_size, const unsigned char *str)
  53. {
  54.   int i1,i2,j,k,t,v;
  55.   unsigned char c,d;
  56.  
  57.   if(items<=0) return -1;
  58.   k=sufcomp(0,str);
  59.   if(k==0) return 0;
  60.   if(k>0) return -1;
  61.   j=items-1; k=sufcomp(j,str);
  62.   if(k==0) return j;
  63.   if(k>0) for(i1=0,i2=j;i2>i1+1;) {
  64.     j=i1+(i2-i1)/2; k=sufcomp(j,str);
  65.     if(k==0) return j;
  66.     if(k>0) {i2=j; continue;}
  67.     if(k<0) {i1=j; continue;}
  68.   }
  69.   if(k>0 && j>0) j--;
  70.   backcheck:
  71.   v=j;for(t=0;t<suf[j].olen && t<sufwordlen
  72.     && suf[j].original[t]==str[sufwordlen-t-1];t++);
  73.   if(t<sufminlen) return -1;
  74.   if(t>=suf[j].olen) return j;
  75.   for(j--,c=str[sufwordlen-1],d=str[sufwordlen-t];
  76.     j>=0 && suf[j].original[0]==c && suf[j].olen>t
  77.     && suf[j].original[t-1]==d;j--);
  78.   if(j>=0 && suf[j].original[0]==c &&
  79.        strncmp((char*)suf[j].original,(char*)suf[v].original,suf[j].olen)==0)
  80. #if defined(__aarch64__) || defined(_M_ARM64)
  81. /*
  82.  exclude goto backcheck
  83. segfault on ARM64 / aarch64 GNU/Linux Debian 12.2.0-14 (gcc 12.2.0)
  84. */
  85.     return j;
  86. #else
  87.   return j;
  88.   else goto backcheck;
  89. #endif
  90. }
  91.  
  92. /* Prepare dictionary.  */
  93. void suffix_dic(char *sdicname)
  94. {
  95.   int i,l;
  96.   FILE *suff;
  97.   char *p1, *p2, *pp;
  98.   long int flen;
  99.  
  100.   suffixcnt=0; sufminlen=100000;
  101.   suff=fopen(sdicname,"r"); if(suff==NULL) return;
  102.   fseek(suff,0,SEEK_END);flen=ftell(suff); fseek(suff,0,SEEK_SET);
  103.   if(flen>sufbuflim) return;
  104.   sufbuf=xmalloc(flen+16);flen=fread(sufbuf,1,flen,suff);
  105.   fclose(suff);
  106.   if(flen>0 && flen<sufbuflim) sufbuf[flen]=0;
  107.   else return;
  108.   for(i=0,p1=sufbuf;p1!=NULL && *p1!=0 && i<suflim;p1=p2) {
  109.   p2=strchr(p1+1,'\n'); if(p2>p1) *p2++=0;
  110.   pp=strchr(p1,':'); if(pp==NULL) continue;
  111.   *pp++=0;
  112.   strip_trailing_spaces2(p1); strip_trailing_spaces2(pp);
  113.   singlespace2(p1);
  114.   p1=find_word_start(p1); pp=find_word_start(pp);
  115.   if(*p1==0) continue;
  116.   suf[i].original=(unsigned char*)p1; suf[i].olen=l=strlen(p1);
  117.   if(l<sufminlen) sufminlen=l;
  118.   suf[i].replace=(unsigned char*)pp; i++;
  119.   }
  120.   suffixcnt=i;
  121. }
  122.  
  123. /* Suffix translation. */
  124. /* FIXME : ne rien faire si le résultat est de longueur inferieur à 2
  125.  * car ensuite cela sera neglige.
  126.  */
  127.  
  128. void suffix_translate(char *p)
  129. {
  130.   char *p1, *p2;
  131.   int t;
  132.  
  133.   for(p1=find_word_start(p);
  134.     p1!=NULL && p1-p<MAX_LINELEN && *p1!=0;
  135.     p1=p2) {
  136.       if(!isalpha(*p1)) {p2=p1+1; continue;}
  137.       for(p2=p1;isalpha(*p2);p2++);
  138.       if(*p2!=0 && strchr(" ,.?!'\"\n`:;()[]{}<>",*p2)==NULL) continue;
  139.       sufwordlen=p2-p1;
  140.       t=suffix_list(suf,suffixcnt,sizeof(suf[0]),(unsigned char*)p1);
  141.       if(t<0) continue;
  142.       string_modify3(p,p2-suf[t].olen,p2,(char*)suf[t].replace);
  143.       p2=p2-suf[t].olen+strlen((char*)suf[t].replace);
  144.    }
  145.    p[MAX_LINELEN]=0;
  146. }
  147.  
  148. void suffix(char *p, char *sdicname)
  149. {
  150.   suffix_dic(sdicname); if(suffixcnt>0) suffix_translate(p);
  151. }
  152.  
  153.