Rev 11124 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
10 | reyssat | 1 | /* Copyright (C) 1998-2003 XIAO, Gang of Universite de Nice - Sophia Antipolis |
2 | * |
||
3 | * This program is free software; you can redistribute it and/or modify |
||
4 | * it under the terms of the GNU General Public License as published by |
||
5 | * the Free Software Foundation; either version 2 of the License, or |
||
6 | * (at your option) any later version. |
||
7 | * |
||
8 | * This program is distributed in the hope that it will be useful, |
||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
11 | * GNU General Public License for more details. |
||
12 | * |
||
13 | * You should have received a copy of the GNU General Public License |
||
14 | * along with this program; if not, write to the Free Software |
||
15 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
||
16 | */ |
||
17 | |||
7676 | bpr | 18 | /* Extract text from phtml file */ |
10 | reyssat | 19 | |
20 | /*************** Customization: change values hereafter ****************/ |
||
21 | |||
/* Limit of data buffers, in bytes (16 MiB).
 * The expansion is parenthesized so the macro behaves as a single value
 * inside larger expressions such as `x % buflim` or `x / buflim`. */
#define buflim (1024*1024*16)
24 | |||
25 | /***************** Nothing should need change hereafter *****************/ |
||
26 | |||
27 | #include "../Lib/libwims.h" |
||
/* Path of the input file; filled by main() from the first command-line
 * argument. */
char filename[1024]="";
/* Content of the input file once loaded by prepare_file(); NULL until then. */
char *filebuf=NULL;
/* Number of bytes held in filebuf; 0 when nothing usable was loaded. */
int filelen=0;
||
31 | |||
7676 | bpr | 32 | /* get the file */ |
10 | reyssat | 33 | void prepare_file(void) |
34 | { |
||
35 | FILE *f; |
||
36 | long int flen; |
||
37 | |||
38 | filelen=0; |
||
39 | f=fopen(filename,"r"); if(f==NULL) return; |
||
40 | fseek(f,0,SEEK_END);flen=ftell(f); fseek(f,0,SEEK_SET); |
||
41 | if(flen>buflim) return; |
||
42 | filebuf=xmalloc(2*flen+1024);flen=fread(filebuf,1,flen,f); |
||
43 | fclose(f); |
||
44 | if(flen>0 && flen<buflim) filebuf[flen]=0; else flen=0; |
||
45 | filelen=flen; |
||
46 | } |
||
47 | |||
48 | void processbuf(void) |
||
49 | { |
||
50 | char *p; |
||
51 | deaccent(filebuf); |
||
52 | for(p=filebuf; *p; p++) *p=tolower(*p); |
||
53 | for(p=strpbrk(filebuf,"'-"); p!=NULL; p=strpbrk(p+1,"'-")) *p=' '; |
||
54 | for(p=strstr(filebuf," "); p!=NULL; p=strstr(p+1," ")) { |
||
7676 | bpr | 55 | *p=' '; ovlstrcpy(p+1,p+6); |
10 | reyssat | 56 | } |
57 | } |
||
58 | |||
59 | void output(void) |
||
60 | { |
||
61 | char *p, *pp, lastc; |
||
62 | p=find_word_start(filebuf); lastc=0; |
||
63 | if(*p=='!' || *p==':') goto cont1; |
||
64 | for(;*p;p++) { |
||
7676 | bpr | 65 | if(*p=='\n') { |
66 | if(!isspace(lastc)) {printf(" "); lastc=' ';} |
||
67 | cont2: p=find_word_start(p); |
||
68 | if(*p=='!' || *p==':') { |
||
11124 | georgesk | 69 | if(lastc!=' ') printf(". "); |
11132 | bpr | 70 | lastc=' '; |
7676 | bpr | 71 | cont1: p=strchr(p,'\n'); |
72 | if(p==NULL) return; |
||
73 | if(*(p-1)=='\\') {p++; goto cont1;} |
||
74 | goto cont2; |
||
75 | } |
||
76 | for(pp=p; isalnum(*pp) || *pp=='_' || *pp=='$'; pp++); |
||
77 | pp=find_word_start(pp); |
||
78 | if(*pp=='=') goto cont1; |
||
79 | } |
||
80 | if(*p=='\\' && *(p+1)=='\n') { |
||
81 | printf("\n"); p++; continue; |
||
82 | } |
||
83 | if(*p=='<' && (isalpha(*(p+1)) || *(p+1)=='/')) { |
||
84 | p=strchr(p,'>'); goto nextp; |
||
85 | } |
||
86 | if(*p=='$') { |
||
87 | if(lastc != ' ') { |
||
88 | if(!isspace(lastc)) printf(" "); |
||
89 | printf(". "); lastc=' '; |
||
90 | } |
||
91 | p++; |
||
92 | if(*p=='(') {p=find_matching(p+1,')'); goto nextp;} |
||
93 | if(*p=='[') {p=find_matching(p+1,']'); goto nextp;} |
||
94 | while(isalnum(*p) || *p=='_') p++; |
||
95 | p--; continue; |
||
96 | } |
||
97 | if(*p=='&') { |
||
98 | char *p2; |
||
99 | for(p2=p+1; isalnum(*p2) || *p2=='#'; p2++); |
||
100 | if(*p2==';') { |
||
101 | p++; if(isalpha(*p)) {printf("%c",*p); lastc=*p;} |
||
102 | p=p2; continue; |
||
103 | } |
||
104 | } |
||
105 | if(!isspace(*p) && strchr(":!?.;,\"()[]{}=/\\+*^%@~`<>|",*p)==NULL) |
||
106 | {printf("%c",*p); lastc=*p;} |
||
107 | else { |
||
108 | if(isspace(*p) && !isspace(lastc)) { |
||
109 | printf(" "); lastc=' '; |
||
110 | } |
||
111 | if(!isspace(*p)) { |
||
112 | switch(lastc) { |
||
113 | case ' ': printf(". "); lastc=' '; break; |
||
114 | case ' ': break; |
||
115 | default: printf(" . "); lastc=' '; break; |
||
116 | } |
||
117 | } |
||
118 | } |
||
119 | nextp: if(p==NULL || *p==0) break; |
||
10 | reyssat | 120 | } |
121 | } |
||
122 | |||
123 | int main(int argc, char *argv[]) |
||
124 | { |
||
125 | if(argc<=1) return 0; |
||
126 | snprintf(filename,sizeof(filename)-128,"%s",argv[1]); |
||
127 | prepare_file(); |
||
128 | processbuf(); |
||
129 | output(); |
||
130 | return 0; |
||
131 | } |
||
132 |