# <@LICENSE>
# Copyright 2004 Apache Software Foundation
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

=head1 NAME

Mail::SpamAssassin::Plugin::Tokenizer::SimpleZH - simple Chinese tokenizer

=head1 SYNOPSIS

loadplugin     Mail::SpamAssassin::Plugin::Tokenizer::SimpleZH

=head1 DESCRIPTION

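This plugin splits Chinese text into words. Each input string is converted
from Simplified to Traditional Chinese with L<Encode::HanConvert> and then
segmented with L<Lingua::ZH::MMSEG>. For every string that yields at least
one word, the words are returned joined by single spaces, with one leading
and one trailing space.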

=cut

package Mail::SpamAssassin::Plugin::Tokenizer::SimpleZH;

use strict;
use warnings;
use Mail::SpamAssassin::Plugin::Tokenizer;
use Lingua::ZH::MMSEG;
use Encode::HanConvert;

# Inherit from the generic tokenizer plugin base class.
our @ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer);

# Language code passed to the base-class constructor by new().
our $language = 'zh';

# Constructor.
#
# Arguments:
#   $class        - class name, or an existing object whose class is reused
#   $mailsaobject - forwarded unchanged to the parent constructor
#
# Delegates construction to the parent class, passing the package-level
# $language code along, then blesses the result into the requested class.
sub new {
  my ($class, $mailsaobject) = @_;

  # Accept both Class->new(...) and $object->new(...).
  my $package = ref($class) || $class;

  my $self = $package->SUPER::new($mailsaobject, $language);
  return bless($self, $package);
}

# Split each string of Chinese text into words.
#
# Arguments:
#   $text_array - reference to an array of strings. The strings are only
#                 read; the caller's array is left untouched.
#
# Returns a reference to a new array holding one entry for every input string
# that produced at least one token: the tokens joined by single spaces, with
# one leading and one trailing space. Inputs that are false in Perl
# (undef, '' and '0') are skipped.
sub tokenize {
  my $self = shift;
  my $text_array = shift;

  my @tokenized_array;
  foreach my $input (@$text_array) {
    next unless ($input);

    # foreach aliases $input to the caller's array element, so work on a
    # copy; otherwise decoding and the Simplified->Traditional conversion
    # below would silently rewrite the caller's data.
    my $text = $input;

    # Turn UTF-8 octets into characters. Unlike Encode::_utf8_on, which just
    # flips the internal flag, utf8::decode validates the octets and leaves
    # the string unchanged when they are not well-formed, so a malformed
    # message cannot produce a corrupt character string. Strings that are
    # already character strings are left alone.
    utf8::decode($text) unless utf8::is_utf8($text);

    # Normalise to Traditional Chinese before segmenting.
    # NOTE(review): presumably required by the MMSEG lexicon - confirm.
    $text = simp_to_trad($text);

    my @tokens = mmseg($text);
    next unless (@tokens);

    push(@tokenized_array, ' ' . join(' ', @tokens) . ' ');
  }
  return \@tokenized_array;
}

1;

