Rev 18174 | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
6519 | bpr | 1 | package search; |
2 | |||
3 | use strict; |
||
4 | use warnings; |
||
6935 | bpr | 5 | use Encode; |
6519 | bpr | 6 | |
7 | use Exporter; |
||
8 | our @ISA = 'Exporter'; |
||
8618 | bpr | 9 | our @EXPORT = ('hashdomain', 'listdomain', 'out', 'sortuniq', 'treate_accent', |
8633 | bpr | 10 | 'treate_dict', 'treate_language', 'dictionnary', 'reverse_dic', 'canonify'); |
6519 | bpr | 11 | |
7693 | bpr | 12 | sub canonify { my ($w)=@_; treate_accent(lc($w)) } |
13 | |||
7016 | bpr | 14 | sub treate_dict { my ($file) = @_; |
8618 | bpr | 15 | my %ref = ( ) ; my $ref=\%ref; |
16 | my $text; |
||
6519 | bpr | 17 | open IN, "$file"; |
6935 | bpr | 18 | while (<IN>) { |
6519 | bpr | 19 | $text = $_ ; |
20 | $text =~ s/\n +/\n/g; |
||
21 | $text =~ s/\\//g; |
||
22 | $text =~ s/^(\s+)//g; |
||
23 | $text=~ s/\n+/\n/g; |
||
24 | my @L = sortuniq(split("\n",$text)); |
||
25 | for my $l (@L) { |
||
26 | my @la=split(":", $l) ; |
||
8636 | bpr | 27 | $ref{canonify($la[0])}=join(':',@la[1..@la-1]) if ($la[1]); |
6519 | bpr | 28 | } |
29 | } |
||
30 | close IN; |
||
31 | %ref; |
||
32 | } |
||
33 | |||
7693 | bpr | 34 | sub dictionnary { my ($file, @words)=@_; |
35 | my %dic=treate_dict($file) ; |
||
36 | my $dic=\%dic; |
||
37 | my @W=(); |
||
38 | if (@words) { |
||
39 | for my $w (@words) { |
||
40 | next if !($w); |
||
41 | for my $ww (split(',', $w)) { |
||
42 | if ( $dic{canonify($ww)} ) { push @W, $dic{canonify($ww)} ; }; |
||
43 | } |
||
44 | } |
||
45 | sortuniq ( @W ) |
||
46 | } |
||
47 | @W; |
||
48 | } |
||
49 | |||
18175 | czzmrn | 50 | # use same list as in ~/src/Lib/liblines.c |
6935 | bpr | 51 | sub treate_accent {my ($txt) = @_; |
6519 | bpr | 52 | $txt=decode('iso-8859-1',$txt); |
18175 | czzmrn | 53 | $txt =~ tr/éèêëàáâãäåúùûüìíîïóòôöõçñýÿ/eeeeaaaaaauuuuiiiiooooocnyy/; |
6519 | bpr | 54 | $txt =~ s/[ÀÁÂÃÄÅ]/A/g; |
18175 | czzmrn | 55 | $txt =~ s/[ÓÒÔÖÕ]/O/g; |
6519 | bpr | 56 | $txt =~ s/Ç/C/g; |
18175 | czzmrn | 57 | $txt =~ s/Ñ/N/g; |
58 | $txt =~ s/Ý/Y/g; |
||
6519 | bpr | 59 | $txt =~ s/[ÈÉÊË]/E/g; |
60 | $txt =~ s/[ÌÏÎÍ]/I/g; |
||
8937 | bpr | 61 | $txt =~ s/'/ /g; |
6519 | bpr | 62 | $txt= encode("iso-8859-1",$txt); |
63 | $txt |
||
64 | } |
||
65 | |||
66 | sub out { my ($bloc, $text) = @_; |
||
67 | open (OUT, ">$bloc") ; |
||
68 | print OUT $text ; close OUT; |
||
69 | } |
||
70 | |||
71 | sub sortuniq { |
||
6935 | bpr | 72 | return if !(@_) ; |
6519 | bpr | 73 | my $prev = "not $_[0]"; |
74 | grep { $_ ne $prev && ($prev = $_, 1) } sort @_; |
||
75 | } |
||
76 | |||
6935 | bpr | 77 | sub listdomain { my ($file)=@_ ; |
6519 | bpr | 78 | my $text=''; |
79 | open IN, "$file"; |
||
80 | while (<IN>) { $text = $_ ; |
||
81 | $text =~ s/[,:]/\n/g; |
||
82 | $text =~ s/\n +/\n/g; |
||
83 | $text =~ s/\\//g; |
||
84 | $text =~ s/^(\s+)//g; |
||
85 | $text=~ s/\n+/\n/g; |
||
86 | } |
||
87 | close IN; |
||
88 | sortuniq(split("\n",$text)) |
||
89 | } |
||
90 | |||
91 | ## reversing the domain tree |
||
92 | |||
93 | sub hashdomain { my ($file)=@_; |
||
8618 | bpr | 94 | $/ = undef; |
6519 | bpr | 95 | open IN, "$file"; |
96 | my %ref = ( ) ; my $ref=\%ref; |
||
97 | while (<IN>) { my $text=$_ ; $text=~ s/\\\n\s*//g; |
||
98 | $text=~ s/\n\s+/\n/g; |
||
99 | my @text= split("\n", $text); |
||
6935 | bpr | 100 | for my $line (@text) { |
6519 | bpr | 101 | next if !($line) ; |
102 | $line =~ s/\s+//g; |
||
103 | my @cut=split(":", $line) ; |
||
104 | if ($cut[1]) { |
||
105 | my @son=split(',',$cut[1]); |
||
8618 | bpr | 106 | $ref->{'next'}{$cut[0]}=$cut[1]; |
6935 | bpr | 107 | for my $s (@son) { |
8618 | bpr | 108 | if ($ref->{'prev'}{$s}) { $ref->{'prev'}{$s} .= "," . $cut[0]} else {$ref->{'prev'}{$s} = $cut[0]} |
6519 | bpr | 109 | } |
110 | } |
||
111 | } |
||
112 | } |
||
113 | close IN; |
||
114 | %ref |
||
115 | } |
||
116 | |||
6879 | bpr | 117 | sub treate_language { |
18174 | czzmrn | 118 | my $site_language='en fr nl it cn es'; |
6879 | bpr | 119 | my $conf='../../../log/wims.conf'; |
120 | if (-e "$conf") { |
||
121 | open IN, "$conf"; |
||
122 | while (<IN>) { |
||
123 | if ($_ =~ s/site_languages=//) { $site_language= $_ ; } |
||
124 | } |
||
125 | close IN; |
||
126 | } |
||
9089 | bpr | 127 | $site_language=~ s/,/ /g; $site_language=~ s/ +/ /g; |
128 | split(' ', $site_language) ; |
||
6879 | bpr | 129 | } |
7693 | bpr | 130 | |
131 | sub reverse_dic { my @liste=@_; |
||
132 | my %ref = ( ) ; my $ref=\%ref; |
||
133 | for my $file (@liste) { |
||
134 | open IN, "$file"; |
||
135 | while (<IN>) { |
||
136 | next if ($_ =~ /^#/); |
||
137 | my $text= $_ ; $text=~ s/\n//; |
||
138 | my @text=split(":", $text); |
||
139 | my $t=$text[0]; $t=~ s/\n//; |
||
140 | my @L= split(",",$text[1]); |
||
141 | for my $a (@L) { |
||
142 | if ($ref{$a}) { $ref{$a}.= "," . $t ; } |
||
143 | else { $ref{$a} = $t ; } |
||
144 | } |
||
145 | } |
||
146 | close IN; |
||
147 | } |
||
148 | %ref |
||
149 | } |