package MailPlusServer::ContentScanner;

use strict;
use warnings;

use Encode;
use MIME::Base64;
use HTML::TokeParser;
use HTML::TokeParser::Simple;
use HTML::Parser;
use MailPlusServer::Log;
use MailPlusServer::Util;
use MailPlusServer::PhishingLinkChecker;

$MailPlusServer::ContentScanner::YES    = 'yes';
$MailPlusServer::ContentScanner::NO     = 'no';
$MailPlusServer::ContentScanner::DISARM = 'disarm';

sub new {
	my $class = shift;
	my $config = shift;
	my $self = {};

	$self->{enable_dangerous_content_scan}  = $config->{enable_dangerous_content_scan};
	$self->{reject_partial_message} = $config->{reject_partial_message};
	$self->{reject_external_message_bodies} = $config->{reject_external_message_bodies};
	$self->{convert_html_to_text} = $config->{convert_html_to_text};
	$self->{find_phishing_fraud} = $config->{find_phishing_fraud};
	$self->{allow_iframe_tags} = $config->{allow_iframe_tags};
	$self->{allow_form_tags} = $config->{allow_form_tags};
	$self->{allow_script_tags} = $config->{allow_script_tags};
	$self->{allow_webbugs} = $config->{allow_webbugs};
	$self->{allow_object_codebase_tags} = $config->{allow_object_codebase_tags};

	$self->{need_to_scan_html} = 0;
	if ($self->{find_phishing_fraud} ||
		$MailPlusServer::ContentScanner::YES ne $self->{allow_iframe_tags} ||
		$MailPlusServer::ContentScanner::YES ne $self->{allow_form_tags} ||
		$MailPlusServer::ContentScanner::YES ne $self->{allow_script_tags} ||
		$MailPlusServer::ContentScanner::YES ne $self->{allow_object_codebase_tags} ||
		$MailPlusServer::ContentScanner::YES ne $self->{allow_webbugs}) {
		$self->{need_to_scan_html} = 1;
	}

	bless ($self, $class);
}

sub is_partial_message {
	my ($self, $entity) = @_;
	my $type = $entity->head->mime_attr('content-type');

	return ($type && lc($type) eq 'message/partial') ? 1 : 0;
}

sub is_external_message {
	my ($self, $entity) = @_;
	my $type = $entity->head->mime_attr('content-type');

	return ($type && lc($type) eq 'message/external-body') ? 1 : 0;
}

sub is_html_message {
	my ($self, $entity) = @_;

	my $content_type = $entity->head->mime_attr('content-type');
	my $content_disposition = $entity->head->mime_attr('content-disposition');

	if ((defined($content_type) && $content_type =~ /text\/html/i) &&
		(!defined($content_disposition) || $content_disposition !~ /attachment/i)) {
		return 1;
	}

	return 0;
}

sub scan_mimetype {
	my ($self, $entity) = @_;

	return ($MailPlusServer::Util::ACCEPT, '') unless $self->{enable_dangerous_content_scan};

	if ($self->{reject_partial_message} and $self->is_partial_message($entity)) {
		return ($MailPlusServer::Util::REJECT, 'MIME type message/partial not accepted here');
	}

	if ($self->{reject_external_message_bodies} and $self->is_external_message($entity)) {
		return ($MailPlusServer::Util::REJECT, 'MIME type message/external-body not accepted here');
	}

	return ($MailPlusServer::Util::ACCEPT, '');
}

sub scan {
	my ($self, $entity) = @_;

	return $MailPlusServer::Util::ACCEPT unless $self->{enable_dangerous_content_scan};

	if ($self->{convert_html_to_text}) {
		if ($self->strip_html($entity) > 0) {
			return $MailPlusServer::Util::ACCEPT_REBUILD;
		}
		return $MailPlusServer::Util::ACCEPT;
	}

	# Scan html to disarm tags and do phishing detection
	if ($self->{need_to_scan_html}) {
		my ($action, $report) = $self->scan_html($entity);
		return ($action, $report);
	}

	return $MailPlusServer::Util::ACCEPT;
}

sub scan_html_entity {
	my ($self, $entity) = @_;

	my($htmlname_old, $htmlname_new, $fh);
	$htmlname_old = $entity->bodyhandle->path();
	$htmlname_new = $htmlname_old;
	$htmlname_new =~ s/\..?html?$//i;
	$htmlname_new .= '_disarm.html';

	$fh = new FileHandle;
	unless ($fh->open(">$htmlname_new")) {
		MailPlusServer::Log::ErrorLog('Could not create file %s', $htmlname_new);
		return;
	}

	my $parser = HTML::TokeParser::Simple->new($htmlname_old);
	my ($action, $report) = ($MailPlusServer::Util::ACCEPT, '');

	# for phishing link check
	my $link_checking = 0;
	my $link_text     = '';
	my $link_url      = '';
	my $link_base_url = '';
	# FIXME: need to change warning text ?
	my $link_warning_start = '<font color="red"><b>MailPlus Server has detected a possible fraud attempt from';
	my $link_warning_end   = 'claiming to be</b></font>';
	my $object_disarming = 0;

	while (my $token = $parser->get_token()) {
		if ($token->is_tag('iframe')) {
			if ($self->{allow_iframe_tags} eq $MailPlusServer::ContentScanner::NO) {
				($action, $report) = ($MailPlusServer::Util::REJECT, 'iframe html tag is not allowed');
				last;
			}

			if ($self->{allow_iframe_tags} eq $MailPlusServer::ContentScanner::DISARM) {
				print $fh '<iframe_disarmed>'  if $token->is_start_tag('iframe');
				print $fh '</iframe_disarmed>' if $token->is_end_tag('iframe');
				$action = $MailPlusServer::Util::ACCEPT_REBUILD;
				next;
			}
		}

		if ($token->is_tag('form')) {
			if ($self->{allow_form_tags} eq $MailPlusServer::ContentScanner::NO) {
				($action, $report) = ($MailPlusServer::Util::REJECT, 'form html tag is not allowed');
				last;
			}

			if ($self->{allow_form_tags} eq $MailPlusServer::ContentScanner::DISARM) {
				print $fh '<form_disarmed>'  if $token->is_start_tag('form');
				print $fh '</form_disarmed>' if $token->is_end_tag('form');
				$action = $MailPlusServer::Util::ACCEPT_REBUILD;
				next;
			}
		}

		if ($token->is_tag('script')) {
			if ($self->{allow_script_tags} eq $MailPlusServer::ContentScanner::NO) {
				($action, $report) = ($MailPlusServer::Util::REJECT, 'script html tag is not allowed');
				last;
			}

			if ($self->{allow_script_tags} eq $MailPlusServer::ContentScanner::DISARM) {
				print $fh '<script_disarmed>'  if $token->is_start_tag('script');
				print $fh '</script_disarmed>' if $token->is_end_tag('script');
				$action = $MailPlusServer::Util::ACCEPT_REBUILD;
				next;
			}
		}

		if ($token->is_start_tag('object')) {
			my $attr_href = $token->get_attr();
			my $is_object_codebase = exists $attr_href->{'codebase'} || exists $attr_href->{'data'};

			if (!$is_object_codebase) {
				print $fh $token->as_is();
				next;
			}

			if ($self->{allow_object_codebase_tags} eq $MailPlusServer::ContentScanner::NO) {
				($action, $report) = ($MailPlusServer::Util::REJECT, 'object html tag is not allowed');
				last;
			}

			if ($self->{allow_object_codebase_tags} eq $MailPlusServer::ContentScanner::DISARM) {
				print $fh '<object_disarmed>'  if $token->is_start_tag('object');
				$action = $MailPlusServer::Util::ACCEPT_REBUILD;
				$object_disarming = 1;
				next;
			}
		} elsif ($token->is_end_tag('object') && $object_disarming) {
			print $fh '</object_disarmed>';
			$object_disarming = 0;
			next;
		}

		if ($token->is_start_tag('img')) {
			my $attr_href = $token->get_attr();
			my $is_webbug = exists $attr_href->{'width'} && $attr_href->{'width'} <= 2 &&
				exists $attr_href->{'height'} && $attr_href->{'height'} <= 2 &&
				exists $attr_href->{'src'} && $attr_href->{'src'} !~ /^cid:/i;

			if (!$is_webbug) {
				print $fh $token->as_is();
				next;
			}

			if ($self->{allow_webbugs} eq $MailPlusServer::ContentScanner::NO) {
				($action, $report) = ($MailPlusServer::Util::REJECT, 'web bug is not allowed');
				last;
			}

			if ($self->{allow_webbugs} eq $MailPlusServer::ContentScanner::DISARM) {
				my $spacer_img_base64 = "R0lGODlhAQABAJEAAP///wAAAAAAAAAAACH5BAkAAAAALAAAAAABAAEAAAgEAAEEBAA7";
                print $fh "<img src='data:image/gif;base64, $spacer_img_base64' " .
						'width="' . $attr_href->{'width'} . '" '.
						'height="' . $attr_href->{'height'} . '" '.
						'alt="Web Bug from ' . $attr_href->{'src'} . '" />';
				$action = $MailPlusServer::Util::ACCEPT_REBUILD;
				next;
			}
		}

		if ($self->{'find_phishing_fraud'} && $token->is_start_tag('base')) {
			my $attr_href = $token->get_attr();

			if (exists($attr_href->{'href'})) {
				$link_base_url = $attr_href->{'href'};
			}

			print $fh $token->as_is();
			next;
		}

		if ($self->{'find_phishing_fraud'} && $token->is_start_tag('a')) {
			my $attr_href = $token->get_attr();

			print $fh $token->as_is();

			if (!exists($attr_href->{'href'})) {
				next;
			}

			$link_checking = 1;
			$link_text = '';
			$link_url  = $attr_href->{'href'};
			next;
		}

		if ($self->{'find_phishing_fraud'} && $token->is_text() && $link_checking) {
			$link_text .= $token->as_is();
			next;
		}

		if ($self->{'find_phishing_fraud'} && $token->is_end_tag('a') && $link_checking) {
			my $phishing_link = MailPlusServer::PhishingLinkChecker::FetchPhishingLink($link_text, $link_url, $link_base_url, $link_warning_start);

			if ($phishing_link ne '') {
				print $fh "$link_warning_start \"$phishing_link\" $link_warning_end $link_text";
				$action = $MailPlusServer::Util::ACCEPT_REBUILD;
			} else {
				print $fh "$link_text";
			}

			print $fh $token->as_is();

			$link_checking = 0;
			$link_text = '';
			$link_url  = '';
			next;
		}

		print $fh $token->as_is();
	}

	$fh->close();

	if ($action == $MailPlusServer::Util::ACCEPT_REBUILD) {
		$entity->bodyhandle->path($htmlname_new);
		return $MailPlusServer::Util::ACCEPT_REBUILD;
	}

	return ($action, $report);
}

# FIXME: what to do if too many recursive parts ?
sub scan_html {
	my ($self, $entity) = @_;
	my ($action, $report) = ($MailPlusServer::Util::ACCEPT, '');

	return $MailPlusServer::Util::ACCEPT unless $entity && defined($entity->head);

	if ($self->is_html_message($entity)) {
		($action, $report) = $self->scan_html_entity($entity);

		if ($action == $MailPlusServer::Util::REJECT) {
			return ($action, $report);
		}
	}

	my(@parts, $part);
	@parts = $entity->parts;
	foreach $part (@parts) {
		my ($part_action, $part_report) = $self->scan_html($part);

		if ($part_action == $MailPlusServer::Util::REJECT) {
			return ($part_action, $part_report);
		}

		if ($part_action > $action) {
			$action = $part_action;
		}
	}

	return $action;
}

# Rewritten from MailScanner HTMLEntityToText()
#
# Convert 1 MIME entity from html to text using HTML::Parser.
sub convert_html_entity_to_text {
	my($this, $entity) = @_;

	my($htmlname, $textname, $textfh, $htmlparser);

	# Replace the MIME Content-Type
	$entity->head->mime_attr('Content-type' => 'text/plain');

	# Replace the filename with a new one
	$htmlname = $entity->bodyhandle->path();
	$textname = $htmlname;
	$textname =~ s/\..?html?$//i; # Remove .htm .html .shtml
	$textname .= '.txt'; # This should always pass the filename checks
	$entity->bodyhandle->path($textname);

	# Create the new file with the plain text in it
	$textfh = new FileHandle;
	unless ($textfh->open(">$textname")) {
		MailPlusServer::Log::ErrorLog('Could not create plain text file %s', $textname);
		return;
	}
	$htmlparser = HTML::TokeParser::MailScanner->new($htmlname);
	# Turn links into text containing the URL
	$htmlparser->{textify}{a} = 'href';
	$htmlparser->{textify}{img} = 'src';

	my $inscript = 0;
	my $instyle  = 0;
	while (my $token = $htmlparser->get_token()) {
		next if $token->[0] eq 'C';
		# Don't output the contents of style or script sections
		if ($token->[1] =~ /style/i) {
			$instyle = 1 if $token->[0] eq 'S';
			$instyle = 0 if $token->[0] eq 'E';
			next if $instyle;
		}
		if ($token->[1] =~ /script/i) {
			$inscript = 1 if $token->[0] eq 'S';
			$inscript = 0 if $token->[0] eq 'E';
			next if $inscript;
		}
		my $text = $htmlparser->get_trimmed_text();
		print $textfh $text . "\n" if $text;
	}
	$textfh->close();
}

# Rewritten from mimedefang remove_redundant_html_parts()
sub remove_redundant_html_parts {
	my ($self, $entity) = @_;
	my @parts = $entity->parts;
	my $type = lc($entity->mime_type);

	# Don't recurse into multipart/signed or multipart/encrypted
	return 0 if ($type eq "multipart/signed" or $type eq "multipart/encrypted");

	my (@keep, $part);
	my $didsomething = 0;
	my $have_text_plain = 0;
	if ($type eq "multipart/alternative" && $#parts >= 0) {
		# First look for a text/plain part
		$have_text_plain = 0;
		foreach $part (@parts) {
			$type = lc($part->mime_type);
			if ($type eq "text/plain") {
				$have_text_plain = 1;
				last;
			}
		}

		# If we have a text/plain part, delete any text/html part
		if ($have_text_plain) {
			foreach $part (@parts) {
				$type = lc($part->mime_type);
				if ($type ne "text/html") {
					push(@keep, $part);
				} else {
					$didsomething = 1;
				}
			}
			if ($didsomething) {
				$entity->parts(\@keep);
				@parts = @keep;
			}
		}
	}
	if ($#parts >= 0) {
		foreach $part (@parts) {
			$didsomething = 1 if ($self->remove_redundant_html_parts($part));
		}
	}
	return $didsomething;
}

# Rewritten from MailScanner HTMLToText()
sub strip_html_helper {
	my ($self, $entity) = @_;
	my $counter = 0;

	return 0 unless $entity && defined($entity->head);

	if ($self->is_html_message($entity)) {
		$self->convert_html_entity_to_text($entity);
		$counter++;
	}

	my(@parts, $part);
	@parts = $entity->parts;
	foreach $part (@parts) {
		$counter += $self->strip_html_helper($part);
	}

	return $counter;
}

sub strip_html {
	my ($self, $entity) = @_;
	my $counter = 0;

	$counter += $self->remove_redundant_html_parts($entity);
	$counter += $self->strip_html_helper($entity);

	return $counter;
}

# Copied from MailScanner
#
# This is an improvement to the default HTML-Parser routine for getting
# the text out of an HTML message. The only difference to their one is
# that I join the array of items together with spaces rather than "".
#
package HTML::TokeParser::MailScanner;

use HTML::Entities qw(decode_entities);

use vars qw(@ISA);
@ISA = qw(HTML::TokeParser);

sub get_text
{
	my $self = shift;
	my $endat = shift;
	my @text;
	while (my $token = $self->get_token) {
		my $type = $token->[0];
		if ($type eq "T") {
			my $text = $token->[1];
			decode_entities($text) unless $token->[2];
			push(@text, $text);
		} elsif ($type =~ /^[SE]$/) {
			my $tag = $token->[1];
			if ($type eq "S") {
				if (exists $self->{textify}{$tag}) {
					my $alt = $self->{textify}{$tag};
					my $text;
					if (ref($alt)) {
						$text = &$alt(@$token);
					} else {
						$text = $token->[2]{$alt || "alt"};
						$text = "[\U$tag]" unless defined $text;
					}
					push(@text, $text);
					next;
				}
			} else {
				$tag = "/$tag";
			}
			if (!defined($endat) || $endat eq $tag) {
				$self->unget_token($token);
				last;
			}
		}
	}
	# JKF join("", @text);
	join(" ", @text);
}

1;
