Subversion Repositories wimsdev

Rev

Rev 8161 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
10 reyssat 1
/*    Copyright (C) 1998-2003 XIAO, Gang of Universite de Nice - Sophia Antipolis
2
 *
3
 *  This program is free software; you can redistribute it and/or modify
4
 *  it under the terms of the GNU General Public License as published by
5
 *  the Free Software Foundation; either version 2 of the License, or
6
 *  (at your option) any later version.
7
 *
8
 *  This program is distributed in the hope that it will be useful,
9
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
10
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
 *  GNU General Public License for more details.
12
 *
13
 *  You should have received a copy of the GNU General Public License
14
 *  along with this program; if not, write to the Free Software
15
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
16
 */
8161 bpr 17
#include "symtext.h"
10 reyssat 18
 
8113 bpr 19
/* Capacity of the suffix table suf[]. */
#define suflim    256
/* Size limit, in bytes, applied to a suffix dictionary file. */
#define sufbuflim 102400

/* Number of entries of suf[] currently filled in by suffix_dic(). */
int suffixcnt;

/* Suffix table. The string pointers refer to text held in sufbuf. */
struct {
    /* Suffix as written in the dictionary; it is matched against the
     * end of a word read backwards (last letter first). */
    unsigned char *original;
    /* strlen() of original. */
    int olen;
    /* Text substituted for the matched suffix. */
    unsigned char *replace;
}
suf[suflim];

/* Buffer receiving the raw content of the dictionary file. */
char *sufbuf;

/* Length of the word currently being looked up. */
int sufwordlen;
/* Length of the shortest entry stored in suf[]. */
int sufminlen;
31
 
8113 bpr 32
/* Suffix translation, to be used within translator. */
10 reyssat 33
 
34
int sufcomp(int t, const unsigned char *s2)
35
{
36
    int k;
8113 bpr 37
 
10 reyssat 38
    for(k=0;k<suf[t].olen && k<sufwordlen
8113 bpr 39
      && suf[t].original[k]==s2[sufwordlen-k-1];k++);
10 reyssat 40
    if(k>=suf[t].olen) {
8113 bpr 41
      if(sufwordlen>k) return -1; else return 0;
10 reyssat 42
    }
43
    else return suf[t].original[k]-s2[sufwordlen-k-1];
44
}
45
 
8113 bpr 46
/* searches a list. Returns index if found, -1 if nomatch.
47
 * This routine is faster than naive one by one comparisons,
48
 * and is especially suited for large lists.
49
 */
3808 kbelabas 50
int suffix_list(void *list, int items, size_t item_size, const unsigned char *str)
10 reyssat 51
{
52
    int i1,i2,j,k,t,v;
3808 kbelabas 53
    unsigned char c,d;
8113 bpr 54
 
10 reyssat 55
    if(items<=0) return -1;
56
    k=sufcomp(0,str);
57
    if(k==0) return 0; if(k>0) return -1;
58
    j=items-1; k=sufcomp(j,str);
59
    if(k==0) return j;
60
    if(k>0) for(i1=0,i2=j;i2>i1+1;) {
8113 bpr 61
      j=i1+(i2-i1)/2; k=sufcomp(j,str);
62
      if(k==0) return j;
63
      if(k>0) {i2=j; continue;}
64
      if(k<0) {i1=j; continue;}
10 reyssat 65
    }
66
    if(k>0 && j>0) j--;
67
    backcheck:
68
    v=j;for(t=0;t<suf[j].olen && t<sufwordlen
8113 bpr 69
      && suf[j].original[t]==str[sufwordlen-t-1];t++);
10 reyssat 70
    if(t<sufminlen) return -1; if(t>=suf[j].olen) return j;
71
    for(j--,c=str[sufwordlen-1],d=str[sufwordlen-t];
8113 bpr 72
      j>=0 && suf[j].original[0]==c && suf[j].olen>t
73
      && suf[j].original[t-1]==d;j--);
74
    if(j>=0 && suf[j].original[0]==c &&
3808 kbelabas 75
       strncmp((char*)suf[j].original,(char*)suf[v].original,suf[j].olen)==0)
10 reyssat 76
      return j;
77
    else goto backcheck;
78
}
79
 
8113 bpr 80
/* Prepare dictionary.  */
10 reyssat 81
void suffix_dic(char *sdicname)
82
{
83
    int i,k,l;
84
    FILE *suff;
85
    char *p1, *p2, *pp;
86
    long int flen;
87
 
88
    suffixcnt=0; sufminlen=100000;
89
    suff=fopen(sdicname,"r"); if(suff==NULL) return;
90
    fseek(suff,0,SEEK_END);flen=ftell(suff); fseek(suff,0,SEEK_SET);
91
    if(flen>sufbuflim) return;
92
    sufbuf=xmalloc(flen+16);flen=fread(sufbuf,1,flen,suff);
93
    fclose(suff);
94
    if(flen>0 && flen<sufbuflim) sufbuf[flen]=0;
95
    else return;
96
    for(i=0,p1=sufbuf;p1!=NULL && *p1!=0 && i<suflim;p1=p2) {
8113 bpr 97
      p2=strchr(p1+1,'\n'); if(p2>p1) *p2++=0;
98
      pp=strchr(p1,':'); if(pp==NULL) continue;
99
      *pp++=0;
100
      strip_trailing_spaces(p1); strip_trailing_spaces(pp);
101
      p1=find_word_start(p1); pp=find_word_start(pp);
102
      if(*p1==0) continue;
103
      if(i>0) {
104
          k=strcmp((char*)suf[i-1].original,p1);
105
          if(k>0) {
106
            pp=strrchr(sdicname,'/'); if(pp==NULL) pp=sdicname; else pp++;
8195 bpr 107
            sym_error("unsorted_dictionary %s: %s > %s.\n",
8113 bpr 108
                  pp,suf[i-1].original,p1);
109
          }
110
          if(k==0) {
111
            pp=strrchr(sdicname,'/'); if(pp==NULL) pp=sdicname; else pp++;
8195 bpr 112
            sym_error("duplication_in_dictionary %s: %s.\n",pp,p1);
8113 bpr 113
          }
114
      }
115
      suf[i].original=(unsigned char*)p1; suf[i].olen=l=strlen(p1);
116
      if(l<sufminlen) sufminlen=l;
117
      suf[i].replace=(unsigned char*)pp; i++;
10 reyssat 118
    }
119
    suffixcnt=i;
120
}
121
 
8113 bpr 122
/* Suffix translation. */
10 reyssat 123
void suffix_translate(char *p)
124
{
125
    char *p1, *p2;
126
    int t;
127
 
128
    for(p1=find_word_start(p);
8113 bpr 129
      p1!=NULL && p1-p<MAX_LINELEN && *p1!=0;
130
      p1=p2) {
131
       if(!isalpha(*p1)) {p2=p1+1; continue;}
132
       for(p2=p1;isalpha(*p2);p2++);
133
       if(*p2!=0 && strchr(" ,.?!'\"\n`:;()[]{}<>",*p2)==NULL) continue;
134
       sufwordlen=p2-p1;
135
       t=suffix_list(suf,suffixcnt,sizeof(suf[0]),(unsigned char*)p1);
136
       if(t<0) continue;
137
       string_modify(p,p2-suf[t].olen,p2,(char*)suf[t].replace);
138
       p2=p2-suf[t].olen+strlen((char*)suf[t].replace);
139
     }
140
     p[MAX_LINELEN]=0;
10 reyssat 141
}
142
 
143
void suffix(char *p, char *sdicname)
144
{
145
    suffix_dic(sdicname); if(suffixcnt>0) suffix_translate(p);
146
}
147