Rev 16933 | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
11480 | schaersvoo | 1 | #!/usr/bin/perl |
6395 | bpr | 2 | |
3 | use warnings; |
||
4 | use strict; |
||
6407 | bpr | 5 | use Encode qw(encode decode); |
18106 | georgesk | 6 | use lib '/var/lib/wims/public_html/bases/sys'; |
16933 | czzmrn | 7 | use lib "."; |
7016 | bpr | 8 | use search ('out', 'sortuniq', 'treate_accent', 'treate_language', 'treate_dict'); |
6395 | bpr | 9 | |
6519 | bpr | 10 | #use Text::Balanced qw (extract_bracketed extract_tagged); |
7016 | bpr | 11 | my @site_lang=treate_language (); |
6406 | bpr | 12 | my $dir='../../modules'; |
13 | my $site='../site/lists'; |
||
6879 | bpr | 14 | my $ddir='domain'; |
6935 | bpr | 15 | my $outputkeywords='tmp'; |
6410 | bpr | 16 | my $dom_templ='domain/domain.template'; |
6453 | bpr | 17 | my $dom_reverse='domain/reversedomain'; |
6460 | bpr | 18 | my $dom_json='domain/domain.json'; |
6395 | bpr | 19 | #$/ = undef; |
6460 | bpr | 20 | ###'arts','earth_sciences','history' |
21 | my @DOMAIN=('biology','chemistry','history','informatics', |
||
22 | 'language','mathematics','physics'); |
||
6521 | bpr | 23 | |
6460 | bpr | 24 | if (-e $dom_json) { |
25 | open LI, $dom_json; my $text; |
||
6879 | bpr | 26 | while(<LI>) { |
6488 | bpr | 27 | next if (/^#|<--/) ; $text .= $_; |
6460 | bpr | 28 | }; |
29 | $text=~ s/(\[|\])//g; |
||
30 | $text=~ s/\'//g; |
||
31 | push @DOMAIN, split(",\n", $text); |
||
32 | } |
||
6395 | bpr | 33 | |
6460 | bpr | 34 | my %trad = (); |
6453 | bpr | 35 | ## should put all perl programm together |
36 | ## read reversedomain |
||
37 | if (-e $dom_reverse) { |
||
6879 | bpr | 38 | open LI, $dom_reverse; |
6453 | bpr | 39 | while(<LI>) { |
40 | my @s= split(':', $_); |
||
41 | my $k= $s[0]; $k =~ s/\s//g if ($k); |
||
42 | my $v= $s[1]; $v =~ s/\s//g if ($v); |
||
43 | $trad{$k}=$v if (($k) && ($v)); |
||
44 | } |
||
45 | close LI |
||
46 | } |
||
6395 | bpr | 47 | |
6879 | bpr | 48 | for my $lang (@site_lang) { |
6395 | bpr | 49 | my %Domain = ( ) ; |
50 | my $Domain=\%Domain; |
||
51 | my @KEYWORDS=(); |
||
6406 | bpr | 52 | if (-e "$site/A.$lang") { |
53 | open LI, "$site/A.$lang"; |
||
6879 | bpr | 54 | while(<LI>){ my $F=$_ ; chomp $F; |
6406 | bpr | 55 | my $file= "$dir/$F/INDEX"; |
56 | next if ($file=~/(adm|devel)\//) ; |
||
6519 | bpr | 57 | my @keywords = treate_index ($file, $lang, \%Domain) ; |
6406 | bpr | 58 | push @KEYWORDS, @keywords; |
59 | }; |
||
60 | close LI; |
||
61 | } else { |
||
6879 | bpr | 62 | for my $file (glob("$dir/*/*/*")) { |
6395 | bpr | 63 | next if ($file=~/(adm|devel)\//) ; |
6519 | bpr | 64 | my @keywords = treate_index ($file, $lang, \%Domain) ; |
6395 | bpr | 65 | push @KEYWORDS, @keywords; |
6406 | bpr | 66 | } |
67 | } |
||
6879 | bpr | 68 | |
6460 | bpr | 69 | ### traite les groupes de mots cles |
6410 | bpr | 70 | if (-e $dom_templ) { |
71 | open LI, $dom_templ; |
||
72 | while (<LI>) { my @m=split(":\n",$_); |
||
73 | for my $a (@m) { $a =~ s/_/ /g; |
||
8425 | bpr | 74 | push @KEYWORDS, split(",", treate_group($a)) }; |
6410 | bpr | 75 | } |
76 | close LI; |
||
6417 | bpr | 77 | }; |
6460 | bpr | 78 | ### traite les groupes de mots domaine |
6417 | bpr | 79 | my $file="domain/domain.$lang"; |
80 | if (-e $file) { |
||
81 | open LI, $file; |
||
6879 | bpr | 82 | while (<LI>) { s /,/\n/g; s /:/\n/g; s /\n+/\n/g; |
6417 | bpr | 83 | my @m=split("\n",$_); |
84 | for my $a (@m) { $a =~ s/_/ /g; |
||
8425 | bpr | 85 | push @KEYWORDS, split(",", treate_group($a)) }; |
6410 | bpr | 86 | } |
6417 | bpr | 87 | close LI; |
88 | }; |
||
6460 | bpr | 89 | |
6395 | bpr | 90 | out("wgrp/wgrp.$lang", join("\n", sortuniq( @KEYWORDS ))) if (@KEYWORDS); |
6879 | bpr | 91 | |
92 | |||
6488 | bpr | 93 | my @list=(); |
6521 | bpr | 94 | my @ALL=(); |
8717 | bpr | 95 | my @ALL_WIMS=(); |
6460 | bpr | 96 | for my $d (@DOMAIN) { |
6488 | bpr | 97 | if ($Domain{$d}) { |
6521 | bpr | 98 | push @ALL, split("\n",$Domain{$d}); |
6488 | bpr | 99 | out("keywords/$d.$lang.tmp", |
6879 | bpr | 100 | "\"" . |
101 | join("\",\n\"",sortuniq(split("\n",$Domain{$d})) ) |
||
6488 | bpr | 102 | . "'" |
6521 | bpr | 103 | ); |
6488 | bpr | 104 | push @list, $d ; |
105 | }; |
||
6460 | bpr | 106 | } |
6879 | bpr | 107 | next if !(-e "$ddir/domain.$lang"); |
7016 | bpr | 108 | my %dom = treate_dict ("$ddir/domain.$lang"); |
6894 | bpr | 109 | while ( my ($key, $value) = each(%dom) ) { |
8717 | bpr | 110 | push @ALL, split(",", lc(treate_accent($value))); |
111 | push @ALL_WIMS, split(",", lc(treate_accent($value))); |
||
112 | } |
||
6488 | bpr | 113 | out("keywords/list.$lang", join(",",sortuniq(@list))); |
8717 | bpr | 114 | ##only official keywords |
6935 | bpr | 115 | out("$outputkeywords/keywords.$lang.json", |
8717 | bpr | 116 | "[\"" . join("\",\n\"",sortuniq(@ALL_WIMS) ) . "\"]" |
117 | ); |
||
118 | ##all keywords |
||
119 | out("$outputkeywords/all.$lang.json", |
||
6935 | bpr | 120 | "[\"" . join("\",\n\"",sortuniq(@ALL) ) . "\"]" |
6521 | bpr | 121 | ); |
122 | |||
6395 | bpr | 123 | } |
6488 | bpr | 124 | |
6519 | bpr | 125 | ###################################################### |
6488 | bpr | 126 | |
6519 | bpr | 127 | sub treate_index { my ($file, $lang, $ref) = @_; |
6395 | bpr | 128 | my @res = (); my @lu = (); my @l = (); my @dom = (); |
6406 | bpr | 129 | my $keyl ='' ; my $keyw=''; my $keyu; |
6395 | bpr | 130 | open(IN, $file) ; |
6879 | bpr | 131 | while (<IN>) { my $line = $_; |
6395 | bpr | 132 | next if !($line =~ /keywords|domain/) ; |
133 | $line =~ s/,\s+/,/g; $line =~ s/\s+,/,/g; |
||
134 | if ($line =~ /domain=/){ $line =~ s/domain=// ; |
||
135 | $line =~ s/( +\n|\n)//g; |
||
6879 | bpr | 136 | $line =~ s/ +/_/g; |
6395 | bpr | 137 | next if( $line =~ /[^[:ascii:]]/); |
138 | $line = lc($line); |
||
6489 | bpr | 139 | for my $a (split(",", $line)) { |
6395 | bpr | 140 | next if ($a =~ /tevila/) ; |
141 | push @dom, $a ; |
||
142 | }; |
||
143 | } |
||
6406 | bpr | 144 | if ($line =~ /keywords_$lang\s*=/) { |
6395 | bpr | 145 | $keyl=treate_keyword($line); |
146 | next if (!($line =~ /,/)); |
||
6406 | bpr | 147 | @l = treate_group($line); |
6395 | bpr | 148 | }; |
6406 | bpr | 149 | if ($line =~ /keywords\s*=/) { |
6395 | bpr | 150 | $keyu=treate_keyword($line); |
151 | next if (!($line =~ /,/)); |
||
6406 | bpr | 152 | @lu = treate_group($line); |
6395 | bpr | 153 | } |
154 | } |
||
155 | close IN; |
||
156 | if (@l) { push @res, @l } else { push @res, @lu if (@lu) }; |
||
6406 | bpr | 157 | if ($keyl) { $keyw = $keyl } else { $keyw = $keyu if ($keyu); } |
6395 | bpr | 158 | if( @dom ) { |
159 | for my $a (@dom) { |
||
6491 | bpr | 160 | if ($a) {$ref->{$a} .= $keyw . "\n" if ($keyw) ; |
6395 | bpr | 161 | } |
162 | } |
||
163 | } |
||
164 | @res; |
||
165 | } |
||
166 | |||
6406 | bpr | 167 | sub treate_group { my ($line) = @_ ; |
168 | $line=~ s/keywords_(\w+)\s*=\s*//g; |
||
169 | $line=~ s/keywords\s*=\s*//g; |
||
6935 | bpr | 170 | $line=treate_accent($line); |
171 | $line =~ s/\./,/g; |
||
6395 | bpr | 172 | my @k = split(',', $line); |
173 | my @tmp; |
||
174 | for my $la (@k) { $la =~ s/^\s+//g; $la =~ s/\s+$//g; $la=lc($la); |
||
6406 | bpr | 175 | next if !($la); |
6395 | bpr | 176 | next if !($la =~ / /); |
177 | push @tmp, "$la:$la," if ($la); |
||
178 | } |
||
179 | @tmp; |
||
180 | } |
||
181 | |||
6406 | bpr | 182 | sub treate_keyword { my ($line) = @_ ; |
183 | $line=~ s/keywords_(\w+)\s*=\s*//g; |
||
184 | $line=~ s/keywords\s*=\s*//g; |
||
185 | $line=treate_accent($line); |
||
6395 | bpr | 186 | if (!($line =~ /,/)) {$line =~ tr / /,/ ;}; |
187 | my @k = split(',', $line); |
||
188 | my $tmp; |
||
6521 | bpr | 189 | for my $la (@k) { |
6491 | bpr | 190 | $la =~ s/^\s+//g; $la =~ s/\s+$//g; $la=lc($la); |
6521 | bpr | 191 | ##$la=~ s/($nokeyword)//g; |
6935 | bpr | 192 | if ($la && length($la) > 2 ) { |
6879 | bpr | 193 | if ($tmp) { $tmp .= "\n" . join("\n",split(',', $la))} |
6491 | bpr | 194 | else |
195 | {$tmp = join("\n",split(',', $la))} |
||
6395 | bpr | 196 | } |
6491 | bpr | 197 | } |
6727 | bpr | 198 | #$tmp=~ s/'/\\'/g if ($tmp); |
6395 | bpr | 199 | $tmp; |
200 | } |
||
201 |