# <@LICENSE>
# Copyright 2004 Apache Software Foundation
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

=head1 NAME

Tokenizer::MeCab - Japanese tokenizer with MeCab

=head1 SYNOPSIS

loadplugin     Mail::SpamAssassin::Plugin::Tokenizer::MeCab

=head1 DESCRIPTION

This plugin tokenizes Japanese text with MeCab, a morphological
analysis engine.

Text::MeCab 0.12 or over is required.

=cut

package Mail::SpamAssassin::Plugin::Tokenizer::MeCab;

use strict;
use warnings;
use Mail::SpamAssassin::Plugin::Tokenizer;

use vars qw(@ISA);
@ISA = qw(Mail::SpamAssassin::Plugin::Tokenizer);

# Load MeCab with a run-time require inside BEGIN rather than "use", so that
# RPM's dependency scanner does not register it as a required perl module.
BEGIN { require MeCab; }

# Language code handed to the parent tokenizer constructor by new().
our $language = 'ja';

# Tagger instance shared by every call to _tokenize(); created once, when this
# file is loaded. Direct method-call syntax and a quoted option replace the
# indirect-object form and the bareword argument; the string passed to the
# constructor is still "-Ochasen".
our $mecab = MeCab::Tagger->new('-Ochasen');

# Constructor.
#
# Arguments:
#   $class        - class name, or an existing instance whose class is reused.
#   $mailsaobject - passed straight through to the parent constructor
#                   (presumably the main SpamAssassin object; confirm
#                   against the parent class).
#
# Returns the object built by the parent constructor, blessed into $class.
sub new {
  my ($class, $mailsaobject) = @_;

  # Accept both Class->new(...) and $instance->new(...).
  $class = ref($class) || $class;

  # The parent constructor also receives this tokenizer's language code.
  my $self = $class->SUPER::new($mailsaobject, $language);

  # bless() returns the reference it was given, so it can be returned directly.
  return bless($self, $class);
}

# Tokenize the non-ASCII runs found in a list of text chunks.
#
# Arguments:
#   $self       - the plugin instance (not used by this method).
#   $text_array - reference to an array of strings.
#
# Returns a reference to a new array holding the processed strings in their
# original order. Undefined and empty entries are skipped. The caller's
# array is never modified.
#
# Within each string, every run of three or more bytes in the range
# 0x80-0xFF is replaced by the result of _tokenize() on that run; all other
# bytes are passed through unchanged.
sub tokenize {
  my ($self, $text_array) = @_;

  my @tokenized_array;
  foreach my $original (@$text_array) {
    # Test definedness and length rather than truth, so that a chunk that is
    # exactly "0" is kept instead of being silently dropped.
    next unless defined $original && length $original;

    # Work on a copy: the loop variable is an alias for the caller's array
    # element, so substituting on it directly would rewrite the caller's data.
    my $text = $original;
    $text =~ s/([\x80-\xFF]{3,})/_tokenize($1)/eg;
    push(@tokenized_array, $text);
  }
  return \@tokenized_array;
}

# Split one string into tokens using the shared $mecab tagger.
#
# Argument:
#   the string to analyse.
#
# Returns the surface forms of the parsed nodes joined by single spaces, with
# one extra space appended. The node list is walked from the node returned by
# parseToNode() up to, but not including, the last node (the one without a
# "next" link; presumably MeCab's end-of-sentence marker, to be confirmed
# against the MeCab binding).
sub _tokenize {
  my ($chunk) = @_;

  my @surfaces;
  my $node = $mecab->parseToNode($chunk);
  while ($node->{next}) {
    push @surfaces, $node->{surface};
    $node = $node->{next};
  }

  return join(' ', @surfaces) . ' ';
}

1;

