Subversion Repositories wimsdev

Rev

Rev 12248 | Rev 18183 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
10 reyssat 1
/*    Copyright (C) 1998-2003 XIAO, Gang of Universite de Nice - Sophia Antipolis
2
 *
3
 *  This program is free software; you can redistribute it and/or modify
4
 *  it under the terms of the GNU General Public License as published by
5
 *  the Free Software Foundation; either version 2 of the License, or
6
 *  (at your option) any later version.
7
 *
8
 *  This program is distributed in the hope that it will be useful,
9
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
10
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11
 *  GNU General Public License for more details.
12
 *
13
 *  You should have received a copy of the GNU General Public License
14
 *  along with this program; if not, write to the Free Software
15
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
16
 */
17
 
8123 bpr 18
#include "../Lib/libwims.h"
19
#include "suffix.h"
20
 
6895 bpr 21
/* Maximum number of entries stored in the suf[] table. */
#define suflim    256
10 reyssat 22
/* Size limit, in bytes, that suffix_dic() checks the dictionary file against. */
#define sufbuflim 102400
23
 
24
/* Number of entries of suf[] filled in by suffix_dic(). */
int suffixcnt;
25
/* Suffix table: suffix_dic() fills one entry per accepted dictionary line. */
struct {
12248 bpr 26
  /* Text left of the ':' on a dictionary line.  sufcomp() matches
   * original[0] against the LAST character of the word, original[1]
   * against the one before it, and so on. */
  unsigned char *original;
27
  /* strlen(original), cached by suffix_dic(). */
  int olen;
28
  /* Text right of the ':' on the same dictionary line; suffix_translate()
   * substitutes it for the matched word ending. */
  unsigned char *replace;
6895 bpr 29
}
10 reyssat 30
suf[suflim];
31
/* Buffer holding the dictionary file contents; the original/replace
 * pointers of suf[] point into it. */
char *sufbuf;
32
/* sufwordlen: length of the word currently examined, set by
 * suffix_translate() before each lookup.
 * sufminlen: smallest olen in the table, computed by suffix_dic(). */
int sufwordlen, sufminlen;
33
 
6895 bpr 34
/* Suffix translation, to be used within translator. */
10 reyssat 35
 
36
int sufcomp(int t, const unsigned char *s2)
37
{
12248 bpr 38
  int k;
6895 bpr 39
 
12248 bpr 40
  for(k=0;k<suf[t].olen && k<sufwordlen
41
    && suf[t].original[k]==s2[sufwordlen-k-1];k++);
42
  if(k>=suf[t].olen) {
43
    if(sufwordlen>k) return -1; else return 0;
44
  }
45
  else return suf[t].original[k]-s2[sufwordlen-k-1];
10 reyssat 46
}
47
 
6895 bpr 48
/* searches a list. Returns index if found, -1 if nomatch.
49
 * This routine is faster than naive one by one comparisons,
8100 bpr 50
 * and is especially suited for large lists.
51
 */
3247 bpr 52
int suffix_list(void *list, int items, size_t item_size, const unsigned char *str)
10 reyssat 53
{
12248 bpr 54
  int i1,i2,j,k,t,v;
55
  unsigned char c,d;
6895 bpr 56
 
12248 bpr 57
  if(items<=0) return -1;
58
  k=sufcomp(0,str);
59
  if(k==0) return 0;
60
  if(k>0) return -1;
61
  j=items-1; k=sufcomp(j,str);
62
  if(k==0) return j;
63
  if(k>0) for(i1=0,i2=j;i2>i1+1;) {
64
    j=i1+(i2-i1)/2; k=sufcomp(j,str);
10 reyssat 65
    if(k==0) return j;
12248 bpr 66
    if(k>0) {i2=j; continue;}
67
    if(k<0) {i1=j; continue;}
68
  }
69
  if(k>0 && j>0) j--;
70
  backcheck:
71
  v=j;for(t=0;t<suf[j].olen && t<sufwordlen
72
    && suf[j].original[t]==str[sufwordlen-t-1];t++);
73
  if(t<sufminlen) return -1;
74
  if(t>=suf[j].olen) return j;
75
  for(j--,c=str[sufwordlen-1],d=str[sufwordlen-t];
76
    j>=0 && suf[j].original[0]==c && suf[j].olen>t
77
    && suf[j].original[t-1]==d;j--);
78
  if(j>=0 && suf[j].original[0]==c &&
3247 bpr 79
       strncmp((char*)suf[j].original,(char*)suf[v].original,suf[j].olen)==0)
18181 schaersvoo 80
#if defined(__aarch64__) || defined(_M_ARM64)
81
/*
82
 exclude goto backcheck
83
segfault on ARM64 / aarch64 GNU/Linux Debian 12.2.0-14 (gcc 12.2.0)
84
*/
12248 bpr 85
    return j;
18181 schaersvoo 86
#else
87
  return j;
12248 bpr 88
  else goto backcheck;
18181 schaersvoo 89
#endif
10 reyssat 90
}
91
 
6895 bpr 92
/* Prepare dictionary.  */
10 reyssat 93
void suffix_dic(char *sdicname)
94
{
12248 bpr 95
  int i,l;
96
  FILE *suff;
97
  char *p1, *p2, *pp;
98
  long int flen;
10 reyssat 99
 
12248 bpr 100
  suffixcnt=0; sufminlen=100000;
101
  suff=fopen(sdicname,"r"); if(suff==NULL) return;
102
  fseek(suff,0,SEEK_END);flen=ftell(suff); fseek(suff,0,SEEK_SET);
103
  if(flen>sufbuflim) return;
104
  sufbuf=xmalloc(flen+16);flen=fread(sufbuf,1,flen,suff);
105
  fclose(suff);
106
  if(flen>0 && flen<sufbuflim) sufbuf[flen]=0;
107
  else return;
108
  for(i=0,p1=sufbuf;p1!=NULL && *p1!=0 && i<suflim;p1=p2) {
109
  p2=strchr(p1+1,'\n'); if(p2>p1) *p2++=0;
110
  pp=strchr(p1,':'); if(pp==NULL) continue;
111
  *pp++=0;
112
  strip_trailing_spaces2(p1); strip_trailing_spaces2(pp);
113
  singlespace2(p1);
114
  p1=find_word_start(p1); pp=find_word_start(pp);
115
  if(*p1==0) continue;
116
  suf[i].original=(unsigned char*)p1; suf[i].olen=l=strlen(p1);
117
  if(l<sufminlen) sufminlen=l;
118
  suf[i].replace=(unsigned char*)pp; i++;
119
  }
120
  suffixcnt=i;
10 reyssat 121
}
122
 
6895 bpr 123
/* Suffix translation. */
124
/* FIXME: do nothing when the result would be shorter than 2 characters,
125
 * because it will be discarded afterwards anyway.
126
 */
127
 
10 reyssat 128
void suffix_translate(char *p)
129
{
12248 bpr 130
  char *p1, *p2;
131
  int t;
10 reyssat 132
 
12248 bpr 133
  for(p1=find_word_start(p);
134
    p1!=NULL && p1-p<MAX_LINELEN && *p1!=0;
135
    p1=p2) {
136
      if(!isalpha(*p1)) {p2=p1+1; continue;}
137
      for(p2=p1;isalpha(*p2);p2++);
138
      if(*p2!=0 && strchr(" ,.?!'\"\n`:;()[]{}<>",*p2)==NULL) continue;
139
      sufwordlen=p2-p1;
140
      t=suffix_list(suf,suffixcnt,sizeof(suf[0]),(unsigned char*)p1);
141
      if(t<0) continue;
142
      string_modify3(p,p2-suf[t].olen,p2,(char*)suf[t].replace);
143
      p2=p2-suf[t].olen+strlen((char*)suf[t].replace);
144
   }
145
   p[MAX_LINELEN]=0;
10 reyssat 146
}
147
 
148
void suffix(char *p, char *sdicname)
149
{
12248 bpr 150
  suffix_dic(sdicname); if(suffixcnt>0) suffix_translate(p);
10 reyssat 151
}
152