Subversion Repositories wimsdev

Rev

Rev 8633 | Rev 8685 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed

  1. package search;
  2.  
  3. use strict;
  4. use warnings;
  5. use Encode;
  6.  
  7. use Exporter;
  8. our @ISA = 'Exporter';
  9. our @EXPORT = ('hashdomain', 'listdomain', 'out', 'sortuniq', 'treate_accent',
  10.   'treate_dict', 'treate_language', 'dictionnary', 'reverse_dic', 'canonify');
  11.  
  12. sub canonify { my ($w)=@_; treate_accent(lc($w)) }
  13.  
  14. sub treate_dict { my ($file) = @_;
  15.    my %ref = ( ) ; my $ref=\%ref;
  16.    my $text;
  17.    open IN, "$file";
  18.    while (<IN>) {
  19.    $text = $_ ;
  20.    $text =~ s/\n +/\n/g;
  21.    $text =~ s/\\//g;
  22.    $text =~ s/^(\s+)//g;
  23.    $text=~ s/\n+/\n/g;
  24.    my @L = sortuniq(split("\n",$text));
  25.    for my $l (@L) {
  26.      my @la=split(":", $l) ;
  27.      $ref{canonify($la[0])}=join(':',@la[1..@la-1]) if ($la[1]);
  28.     }
  29.    }
  30.   close IN;
  31.  %ref;
  32. }
  33.  
  34. sub dictionnary { my ($file, @words)=@_;
  35.   my %dic=treate_dict($file) ;
  36.   my $dic=\%dic;
  37.   my @W=();
  38.   if (@words) {
  39.    for my $w (@words) {
  40.     next if !($w);
  41.     for my $ww (split(',', $w)) {
  42.      if ( $dic{canonify($ww)} ) { push @W, $dic{canonify($ww)} ; };
  43.      }
  44.    }
  45.    sortuniq ( @W )
  46.    }
  47.    @W;
  48. }
  49.  
  50. sub treate_accent {my ($txt) = @_;
  51.   $txt=decode('iso-8859-1',$txt);
  52.   $txt =~ tr/éèêëàáâãäåùìíîïóôòç/eeeeaaaaaauiiiioooc/;
  53.   $txt =~ s/[ÀÁÂÃÄÅ]/A/g;
  54.   $txt =~ s/Ç/C/g;
  55.   $txt =~ s/[ÈÉÊË]/E/g;
  56.   $txt =~ s/[ÌÏÎÍ]/I/g;
  57.   $txt= encode("iso-8859-1",$txt);
  58.   $txt
  59. }
  60.  
  61. sub out { my ($bloc, $text) = @_;
  62.   open  (OUT, ">$bloc") ;
  63.   print OUT $text ; close OUT;
  64. }
  65.  
  66. sub sortuniq {
  67.   return if !(@_) ;
  68.   my $prev = "not $_[0]";
  69.   grep { $_ ne $prev && ($prev = $_, 1) } sort @_;
  70. }
  71.  
  72. sub listdomain { my ($file)=@_ ;
  73.   my $text='';
  74.   open IN, "$file";
  75.   while (<IN>) { $text = $_ ;
  76.    $text =~ s/[,:]/\n/g;
  77.    $text =~ s/\n +/\n/g;
  78.    $text =~ s/\\//g;
  79.    $text =~ s/^(\s+)//g;
  80.    $text=~ s/\n+/\n/g;
  81.  }
  82.  close IN;
  83.  sortuniq(split("\n",$text))
  84. }
  85.  
  86. ## reversing the domain tree
  87.  
  88. sub hashdomain { my ($file)=@_;
  89.  $/ = undef;
  90.  open IN, "$file";
  91.  my %ref = ( ) ; my $ref=\%ref;
  92.  while (<IN>) { my $text=$_ ; $text=~ s/\\\n\s*//g;
  93.    $text=~ s/\n\s+/\n/g;
  94.    my @text= split("\n", $text);
  95.    for my $line (@text) {
  96.      next if !($line) ;
  97.      $line =~ s/\s+//g;
  98.      my @cut=split(":", $line) ;
  99.      if ($cut[1]) {
  100.        my @son=split(',',$cut[1]);
  101.        $ref->{'next'}{$cut[0]}=$cut[1];
  102.       for my $s (@son) {
  103.         if ($ref->{'prev'}{$s}) { $ref->{'prev'}{$s} .= "," . $cut[0]} else {$ref->{'prev'}{$s} = $cut[0]}
  104.       }
  105.      }
  106.    }
  107. }
  108. close IN;
  109.   %ref
  110. }
  111.  
  112. sub treate_language {
  113.   my $site_language='en fr nl it cn';
  114.   my $conf='../../../log/wims.conf';
  115.   if (-e "$conf") {
  116.    open IN, "$conf";
  117.    while (<IN>) {
  118.      if ($_ =~ s/site_languages=//) { $site_language= $_ ; }
  119.    }
  120.   close IN;
  121.  }
  122. split(' ', $site_language) ;
  123. }
  124.  
  125. sub reverse_dic { my @liste=@_;
  126.   my %ref = ( ) ; my $ref=\%ref;
  127.   for my $file (@liste) {
  128.    open IN, "$file";
  129.    while (<IN>) {
  130.     next if ($_ =~ /^#/);
  131.     my $text= $_ ; $text=~ s/\n//;
  132.     my @text=split(":", $text);
  133.     my $t=$text[0]; $t=~ s/\n//;
  134.     my @L= split(",",$text[1]);
  135.     for my $a (@L) {
  136.      if ($ref{$a}) {  $ref{$a}.= "," . $t ; }
  137.      else { $ref{$a} = $t ; }
  138.     }
  139.    }
  140.    close IN;
  141.   }
  142.   %ref
  143. }
  144.  
  145.