# <@LICENSE>
# Copyright 2006 Apache Software Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>


=head1 NAME

  Mail::SpamAssassin::Util::Charset.pm - Utility for charset and language

=head1 SYNOPSIS

  my ($decoded, $detected) = Mail::SpamAssassin::Util::Charset::normalize_charset($str, $charset);
  my $language = Mail::SpamAssassin::Util::Charset::get_language($str, $charset);

=head1 DESCRIPTION

This module implements utility methods for charset and language.

=cut

package Mail::SpamAssassin::Util::Charset;

use strict;
use warnings;
use Encode;
use Encode::Guess;
use Encode::Alias;
use Mail::SpamAssassin::Logger;

use vars qw (
  @ISA @EXPORT
);

require Exporter;

@ISA = qw(Exporter);
@EXPORT = qw(normalize_charset get_language);

###########################################################################

use constant HAS_ENCODE_DETECT => eval { require Encode::Detect::Detector; };
use constant HAS_ENCODE_HANEXTRA => eval { require Encode::HanExtra; };
use constant HAS_ENCODE_EUCJPMS => eval { require Encode::EUCJPMS; };

###########################################################################

our $KANA_HAN_RE = qr{
  # Hiragana and Katakana
    \xE3[\x81-\x83][\x80-\xBF]
  # Han
  | \xE3[\x90-\xBF][\x80-\xBF]
  | [\xE4-\xE9][\x80-\xBF]{2}
  | \xEF[\xA4-\xAB][\x80-\xBF]
}x;

our %enc2lang;
our %lang2enc;
our %scr2lang;
our %cjkscr2lang;
our @scrorder;

BEGIN {

  # See the following URL about this map:
  #   http://czyborra.com/charsets/iso8859.html
  #   http://czyborra.com/charsets/codepages.html
  #   http://czyborra.com/charsets/cyrillic.html
  #   http://en.wikipedia.org/wiki/ISO_8859
  #   http://www.w3.org/International/O-charset-lang.html
  %enc2lang = (
    # buint-in Encodings and Encode::Byte
    #   N. America
    'ascii'         => 'en',
    'cp437'         => 'en',
    'cp863'         => 'weurope',

    #   W. Europe (Latin1, Latin9)
    #       fr es ca eu pt it sq rm nl de da sv no fi fo is ga gd en af
    'iso-8859-1'    => 'weurope',
    'iso-8859-15'   => 'weurope',
    'cp850'         => 'weurope',
    'cp860'         => 'weurope',
    'cp1252'        => 'weurope',
    'MacRoman'      => 'weurope',

    #   Cntrl. Europe / Latin2 / Latin10
    #       hr cs hu pl sr sk sl
    'iso-8859-2'    => 'ceurope',
    'cp852'         => 'ceurope',
    'cp1250'        => 'ceurope',
    'MacCentralEurRoman' => 'ceurope',
    'MacCroatian'   => 'ceurope',
    'iso-8859-16'   => 'ceurope',
    'MacRomanian'   => 'ceurope',

    #   Latin3 (Esperanto, Maltese, and Turkish. Turkish is now on 8859-9.)
    #       eo mt
    'iso-8859-3'    => 'seurope',

    #   Baltics (Latin4, Latin7)
    #       lv lt
    'iso-8859-4'    => 'neurope',
    'iso-8859-13'   => 'baltic',
    'cp1257'        => 'baltic',

    #   Nordics (Latin6)
    #       et kl iu se
    'iso-8859-10'   => 'nordic',

    #   Cyrillics
    #       bg be uk sr mk ru
    'iso-8859-5'    => 'ru',
    'cp855'         => 'ru',
    'cp1251'        => 'ru',
    'cp866'         => 'ru',
    'MacCyrillic'   => 'ru',
    'koi8-r'        => 'ru',
    'MacUkrainian'  => 'uk',
    'koi8-u'        => 'uk',

    #   Arabic
    'iso-8859-6'    => 'ar',
    'cp864'         => 'ar',
    'cp1256'        => 'ar',
    'MacArabic'     => 'ar',
    'cp1006'        => 'fa',
    'MacFarsi'      => 'fa',

    #   Greek
    'iso-8859-7'    => 'el',
    'cp1253'        => 'el',
    'MacGreek'      => 'el',

    #   Hebrew
    #       he yi
    'iso-8859-8'    => 'he',
    'cp862'         => 'he',
    'cp1255'        => 'he',
    'MacHebrew'     => 'he',

    #   Turkish
    'iso-8859-9'    => 'tr',
    'cp857'         => 'tr',
    'cp1254'        => 'tr',
    'MacTurkish'    => 'tr',

    #   Thai
    'iso-8859-11'   => 'th',
    'cp874'         => 'th',

    #   Celtics (Latin8)
    #       gd cy br
    'iso-8859-14'   => 'celtic',

    #   Vietnamese
    'viscii'        => 'vi',
    'cp1258'        => 'vi',

    # Encode::CN
    'euc-cn'        => 'zh',
    'cp936'         => 'zh',
    'hz'            => 'zh',

    # Encode::TW
    'big5-eten'     => 'zh',
    'big5-hkscs'    => 'zh',
    'cp950'         => 'zh',

    # Encode::JP
    'euc-jp'        => 'ja',
    'shiftjis'      => 'ja',
    '7bit-jis'      => 'ja',
    'iso-2022-jp'   => 'ja',
    'iso-2022-jp-1' => 'ja',
    'cp932'         => 'ja',

    # Encode::KR
    'euc-kr'        => 'ko',
    'cp949'         => 'ko',
    'johab'         => 'ko',
    'iso-2022-kr'   => 'ko',

    # Encode::HanExtra
    'euc-tw'        => 'zh',
    'gb18030'       => 'zh',

    # Encode::JIS2K
    # 'euc-jisx0213'  => 'ja',
    # 'shiftjisx0123' => 'ja',
    # 'iso-2022-jp-3' => 'ja',

    # Encode::EUCJPMS
    'eucJP-ms'      => 'ja',
    'cp51932'       => 'ja',
    'cp50220'       => 'ja',
    'cp50221'       => 'ja',

  );

  # inverse enc2lang to lang2enc
  push @{ $lang2enc{ $enc2lang{$_} } }, $_ for keys %enc2lang;

  %scr2lang = (
    'InLatin1Supplement' => ['weurope'],
    'InLatinExtendedA' => [
      'ceurope',
      'seurope',
      'tr',
      'vi'
    ],
    'InLatinExtendedB' => [
      'nordic',
      'baltic',
      'celtic'
    ],
    'Thai'   => ['th'],
    'Cyrillic' => ['ru', 'uk'],
    'Arabic' => ['ar'],
    'Greek'  => ['el'],
    'Hebrew' => ['he'],
  );

  # better detection for CJK
  @scrorder = ('Han','Hiragana','Katakana','Hangul',keys(%scr2lang));
  %cjkscr2lang = (
    'Hiragana' => ['ja'],
    'Katakana' => ['ja'],
    'Hangul' => ['ko'],
    'Han'    => ['zh', 'ja', 'ko'],
  );

  unless (HAS_ENCODE_HANEXTRA) {
    Encode::Alias::define_alias( qr/^gb18030$/i => ' "euc-cn"' );
  }
  Encode::Alias::define_alias( qr/^unicode-1-1-(.+)$/i => ' "$1"' );
  Encode::Alias::define_alias( qr/^TIS-620$/i => ' "iso-8859-11"' );
  Encode::Alias::define_alias( qr/^x-mac-(.+)$/i => ' "Mac$1"' );
  Encode::Alias::define_alias( qr/^Shift_JIS$/i => ' "cp932"' );
  if (HAS_ENCODE_EUCJPMS) {
    Encode::Alias::define_alias( qr/^iso-2022-jp$/i => ' "cp50221"' );
  }
}

sub get_language {
  my $str = shift; # $str must be UTF-8 encoding
  my $charset = shift;

  return 'en' unless $charset;
  if ($charset !~ /^utf/i) {
    if (exists($enc2lang{$charset})) {
      return $enc2lang{$charset};
    }
    my $decoder = Encode::find_encoding($charset);
    if ($decoder && exists($enc2lang{$decoder->name})) {
      return $enc2lang{$decoder->name};
    }
    return 'en';
  } elsif (defined($str)) {
    $str =~ s/[\x00-\x7F]//g; # remove ASCII characters
    return 'en' if ($str eq '');

    my %handled;
    $str = Encode::decode_utf8($str) unless (Encode::is_utf8($str));
    foreach my $scr (@scrorder) {
      next if ($str !~ /\p{$scr}/);
      my $scrlangs = exists($cjkscr2lang{$scr}) ? $cjkscr2lang{$scr} : $scr2lang{$scr};
      foreach my $lang (@$scrlangs) {
        next if (exists($handled{$lang}));
        foreach my $enc (@{$lang2enc{$lang}}) {
          if (Encode::find_encoding($enc)) {
            my $scratch = $str;
            Encode::encode($enc, $scratch, Encode::FB_QUIET);
            return $lang if ($scratch eq '');
          }
        }
        $handled{$lang} = 1;
      }
    }
  } 
  return 'en';
}

# TEST 1: try conversion to use the specified charset. 
# TEST 2: try conversion to use Encode::Detect.
# TEST 3: try conversion to use Encode::Guess.
sub normalize_charset {
  my $str = shift;
  my $charset = shift;

  return wantarray ? ($str, 'ascii')  : $str unless ($str);

  my $decoded;
  my $detected;

  if ($charset) {
    ($decoded, $detected) = _specified_encoding($str, $charset);
  }
  unless ($detected) {
    ($decoded, $detected) = _encode_detect($str);
  }
  unless ($detected) {
    ($decoded, $detected) = _encode_guess($str);
  }
  unless ($detected) {
    return ($str, undef);
  }
  $decoded =~ s/^\x{feff}//g;
  $decoded = Encode::encode_utf8($decoded);

  # unfold hiragana, katakana and han
  if ($detected =~ /^(?:UTF|EUC|BIG5|GB|SHIFTJIS|ISO-2022|CP969$|CP932$|CP949|CP50221$)/i) {
    $decoded =~ s/($KANA_HAN_RE)\012($KANA_HAN_RE)/$1$2/og;
  }
  return wantarray ? ($decoded, $detected) : $decoded;
}

sub _specified_encoding {
  my $str = shift;
  my $encoding = shift;

  my $detected;
  my $decoded;

  return (undef, undef) unless ($encoding);

  # note: ISO-2022-* is not deistinguish from US-ASCII
  return (undef, undef) if ($str =~ /\e/ and $encoding !~ /^ISO-2022/i);

  # UTF-16|32 encoding without BOM cannot be trusted.
  return (undef, undef) if ($encoding =~ /^UTF-32$/i and $str !~ /^(?:\xFF\xFE\x00\x00|\x00\x00\xFE\xFF)/);
  return (undef, undef) if ($encoding =~ /^UTF-16$/i and $str !~ /^(?:\xFF\xFE|\xFE\xFF)/);

  #$encoding = _get_alias($encoding);
  my $encoder = Encode::find_encoding($encoding);
  if (ref($encoder)) {
    $decoded = $encoder->decode($str,Encode::FB_QUIET);
    $detected = $encoder->name if ($str eq '');
  }
  return ($decoded, $detected);
}

sub _encode_detect {
  return undef unless HAS_ENCODE_DETECT;
  my $str = shift;

  # UTF-16|32 encoding without BOM cannot be trusted.
  return (undef, undef) if ($str =~ /\x00\x00/ and $str !~ /^(?:\xFF\xFE\x00\x00|\x00\x00\xFE\xFF)/);
  return (undef, undef) if ($str =~ /\x00/ and $str !~ /^(?:\xFF\xFE|\xFE\xFF)/);

  my $decoded;
  my $detected = Encode::Detect::Detector::detect($str);
  if ($detected) {
    $detected = _get_alias($detected);
    my $encoder = Encode::find_encoding($detected);
    if (ref($encoder)) {
      $decoded = $encoder->decode($str);
      $detected = $decoded ? $encoder->name : undef;
    }
    else {
      $detected = undef;
    }
  }
  return ($decoded, $detected);
}

sub _encode_guess {
  my $str = shift;

  my $detected;
  my $decoded;
  my $encoder;

  # Step 1: Examine ISO-2022-*.
  if ($str =~ /\e/) {
    $Encode::Guess::NoUTFAutoGuess = 1;
    $encoder = Encode::Guess::guess_encoding($str,
        qw/cp50221 7bit-jis iso-2022-kr/);
    $Encode::Guess::NoUTFAutoGuess = 0;
  }

  # Step 2: Examine US-ASCII/UTF-(8|16|32)
  unless (ref($encoder)) {
    $Encode::Guess::NoUTFAutoGuess = 0;
    $encoder = Encode::Guess::guess_encoding($str);
  }

  # Step 3: Examine other encodings
  unless (ref($encoder)) {
    $Encode::Guess::NoUTFAutoGuess = 1;
    eval {
      if ($str =~ /[\x80-\xFF]{4}/) {
        $encoder = Encode::Guess::guess_encoding($str,
          qw/euc-cn big5-eten euc-jp cp932 euc-kr cp949/);
      }
      else {
        $encoder = Encode::Guess::guess_encoding($str,
          qw/iso-8859-1 cp1252/);
      }
    };
    $Encode::Guess::NoUTFAutoGuess = 0;
  }
  if (ref($encoder)) {
    $detected = $encoder->name;
    if ($detected) {
      $decoded = $encoder->decode($str);
    }
  }
  return ($decoded, $detected);
}

sub _get_alias {
  my $encoding = shift;

  unless (HAS_ENCODE_HANEXTRA) {
    $encoding =~ s/^gb18030$/euc-cn/i;
  }
  $encoding =~ s/^unicode-1-1-(.+)$/$1/i;
  $encoding =~ s/^TIS-620$/iso-8859-11/i;
  $encoding =~ s/x-mac-(.+)$/Mac$1/i;
  $encoding =~ s/^Shift_JIS$/cp932/i;
  if (HAS_ENCODE_EUCJPMS) {
    $encoding =~ s/^iso-2022-jp$/cp50221/i;
    $encoding =~ s/^euc-jp$/cp51932/i;
  }

  return $encoding;
}


1;

