package MailPlusServer::PhishingLinkChecker;

use strict;
use warnings;

use MailPlusServer::Log;
use MailPlusServer::Util;

my %CharToInternational = (
	160,'nbsp',
	161,'iexcl',
	162,'cent',
	163,'pound',
	164,'curren',
	165,'yen',
	166,'brvbar',
	167,'sect',
	168,'uml',
	169,'copy',
	170,'ordf',
	171,'laquo',
	172,'not',
	173,'shy',
	174,'reg',
	175,'macr',
	176,'deg',
	177,'plusmn',
	178,'sup2',
	179,'sup3',
	180,'acute',
	181,'micro',
	182,'para',
	183,'middot',
	184,'cedil',
	185,'sup1',
	186,'ordm',
	187,'raquo',
	188,'frac14',
	189,'frac12',
	190,'frac34',
	191,'iquest',
	192,'Agrave',
	193,'Aacute',
	194,'Acirc',
	195,'Atilde',
	196,'Auml',
	197,'Aring',
	198,'AElig',
	199,'Ccedil',
	200,'Egrave',
	201,'Eacute',
	202,'Ecirc',
	203,'Euml',
	204,'Igrave',
	205,'Iacute',
	206,'Icirc',
	207,'Iuml',
	208,'ETH',
	209,'Ntilde',
	210,'Ograve',
	211,'Oacute',
	212,'Ocirc',
	213,'Otilde',
	214,'Ouml',
	215,'times',
	216,'Oslash',
	217,'Ugrave',
	218,'Uacute',
	219,'Ucirc',
	220,'Uuml',
	221,'Yacute',
	222,'THORN',
	223,'szlig',
	224,'agrave',
	225,'aacute',
	226,'acirc',
	227,'atilde',
	228,'auml',
	229,'aring',
	230,'aelig',
	231,'ccedil',
	232,'egrave',
	233,'eacute',
	234,'ecirc',
	235,'euml',
	236,'igrave',
	237,'iacute',
	238,'icirc',
	239,'iuml',
	240,'eth',
	241,'ntilde',
	242,'ograve',
	243,'oacute',
	244,'ocirc',
	245,'otilde',
	246,'ouml',
	247,'divide',
	248,'oslash',
	249,'ugrave',
	250,'uacute',
	251,'ucirc',
	252,'uuml',
	253,'yacute',
	254,'thorn',
	255,'yuml'
);

# Turn any character into an international version of it if it is in the range
# 160 to 255.
sub CharToIntnl {
	my $p = shift @_;
	# Passed in an 8-bit character.
	#print STDERR "Char in is $p\n";
	($a) = unpack 'C', $p;

	#print STDERR "Char is $a, $p\n";

	# Bash char 160 (space) to nothing
	return '' if $a == 160;
	my $char = $CharToInternational{$a};
	return '&' . $char . ';' if $char ne "";
	return $p;
}

# Like CharToIntnl but does entire string
sub StringToIntnl {
	my $original = shift;

	# Much faster char conversion for whole strings
	my(@newlinkurl, $newlinkurl, $char);
	@newlinkurl = unpack("C*", $original); # Get an array of characters
	foreach (@newlinkurl) {
		next if $_ == 160;
		$char = $CharToInternational{$_};
		if (defined $char) {
			$newlinkurl .= '&' . $char . ';';
		} else {
			$newlinkurl .= chr($_);
		}
	}
	return $newlinkurl;
	#$linkurl = $newlinkurl unless $newlinkurl eq "";
	#$linkurl =~ s/./CharToIntnl("$&")/ge; -- Old slow version
}


# Clean up a link URL so it is suitable for phishing detection
# Return (clean url, alarm trigger value). An alarm trigger value non-zero
# means this is definitely likely to be a phishing trap, no matter what
# anything else says.
sub CleanLinkURL {
	my($DisarmLinkURL, $DisarmBaseURL) = @_;

	use bytes;

	my($linkurl,$alarm);
	$alarm = 0;

	$linkurl = $DisarmLinkURL;
	$linkurl = lc($linkurl);
	$linkurl =~ s#%([0-9a-f][0-9a-f])#chr(hex('0x' . $1))#gei; # Unescape
	$linkurl = StringToIntnl($linkurl);
	return ("",0) unless $linkurl =~ /[.\/]/; # Ignore if it is not a website at all

	$linkurl =~ s/\s+//g; # Remove any whitespace
	$linkurl =~ s/\\/\//g; # Change \ to / as many browsers do this
	return ("",0) if $linkurl =~ /\@/ && $linkurl !~ /\//; # Ignore emails

	$linkurl =~ s/[,.]+$//; # Remove trailing dots, but also commas while at it
	$linkurl =~ s/^\[\d*\]//; # Remove leading [numbers]
	$linkurl =~ s/^blocked[:\/]+//i; # Remove "blocked::" labels
	$linkurl =~ s/^blocked[:\/]+//i; # And again, in case there are 2
	$linkurl =~ s/^blocked[:\/]+//i; # And again, in case there are 3
	$linkurl =~ s/^blocked[:\/]+//i; # And again, in case there are 4
	$linkurl =~ s/^outbind:\/\/\d+\//http:\/\//i; # Remove "outbind://22/" type labels

	$linkurl = $DisarmBaseURL . '/' . $linkurl
		if $linkurl ne "" && $DisarmBaseURL ne "" &&
			$linkurl !~ /^(https?|ftp|mailto|webcal):/i;

	$linkurl =~ s/^(https?:\/\/[^:]+):80($|\D)/$1/i; # Remove http://....:80
	$linkurl =~ s/^(https?|ftp|webcal)[:;]\/\///i;
	return ("",0) if $linkurl =~ /^ma[il]+to[:;]/i;

	$linkurl =~ s/[?\/].*$//; # Only compare up to the first '/' or '?'
	$linkurl =~ s/(\<\/?(br|p|ul)\>)*$//ig; # Remove trailing br, p, ul tags
	return ("",0) if $linkurl =~ /^file:/i; # Ignore file: URLs completely

	return ("",0) if $linkurl =~ /^#/; # Ignore internal links completely

	$linkurl =~ s/\/$//; # LinkURL is trimmed -- note
	$linkurl =~ s/:80$//; # Port 80 is the default anyway
	$alarm = 1 if $linkurl =~ s/[\x00-\x1f[:^ascii:]]/_BAD_/g; # /\&\#/;
	$linkurl = 'JavaScript' if $linkurl =~ /^javascript:/i;
	($linkurl, $alarm);
}

sub FetchPhishingLink {
	my($DisarmLinkText, $DisarmLinkURL, $DisarmBaseURL, $PhishingWarningText) = @_;
	my($squashedtext,$linkurl,$alarm,$numbertrap);

	$squashedtext = lc($DisarmLinkText);

	# Try to filter out mentions of Microsoft's .NET system
	$squashedtext = "" if $squashedtext eq ".net";
	$squashedtext = "" if $squashedtext =~ /(^|\b)(ado|asp)\.net($|\b)/;

	$squashedtext =~ s/\%a0//g;
	$squashedtext =~ s#%([0-9a-f][0-9a-f])#chr(hex('0x' . $1))#gei; # Unescape
	$squashedtext =~ s/\\/\//g; # Change \ to / as many browsers do this
	$squashedtext =~ s/^\[\d*\]//; # Removing leading [numbers]
	$squashedtext =~ tr/\n/ /; # Join multiple lines onto 1 line
	$squashedtext =~ s/(\<\/?[a-z][a-z0-9:._-]*((\s+[a-z][a-z0-9:._-]*(\s*=\s*(?:\".*?\"|\'.*?\'|[^\'\">\s]+))?)+\s*|\s*)\/?\>)*//ig; # Remove tags, better re from snifer_@hotmail.com
	$squashedtext =~ s/(?:(?<=^)|(?<=\s)|(?<=&nbsp;))(?:(?!&nbsp;))[^.\s]+(?:(?<!&nbsp;))(?:(?=$|\s|&nbsp;))//g; # Remove text sequence that's not domain (contains no dot)
	$squashedtext =~ s/(?:(?<=^)|(?<=\s)|(?<=&nbsp;))(?:(?!&nbsp;))\.[^\s]+(?:(?<!&nbsp;))(?:(?=$|\s|&nbsp;))//g; # Remove text sequence that's not domain (contains leading dot)
	$squashedtext =~ s/(?:(?<=^)|(?<=\s)|(?<=&nbsp;))(?:(?!&nbsp;))[^\s]+\.(?:(?<!&nbsp;))(?:(?=$|\s|&nbsp;))//g; # Remove text sequence that's not domain (contains trailing dot)
	$squashedtext =~ s/\s+//g; # Remove any whitespace
	$squashedtext =~ s/^[^\/:]+\@//; # Remove username of email addresses
	$squashedtext =~ s/^.*(\&lt\;|\<)((https?|ftp|mailto|webcal):.+?)(\&gt\;|\>).*$/$2/i; # Turn blah-blah <http://link.here> blah-blah into "http://link.here"
	$squashedtext =~ s/^\&lt\;//g; # Remove leading &lt;
	$squashedtext =~ s/\&gt\;$//g; # Remove trailing &gt;
	$squashedtext =~ s/\&lt\;/\</g; # Remove things like &lt; and &gt;
	$squashedtext =~ s/\&gt\;/\>/g; # rEmove things like &lt; and &gt;
	$squashedtext =~ s/\&nbsp\;//g; # Remove fixed spaces
	$squashedtext =~ s/^(http:\/\/[^:]+):80(\D|$)/$1$2/i; # Remove http:...:80
	$squashedtext =~ s/^(https:\/\/[^:]+):443(\D|$)/$1$2/i; # Remove https:...:443
	$squashedtext =  StringToIntnl($squashedtext); # s/./CharToIntnl("$&")/ge;
	# If it looks like a link, remove any leading https:// or ftp://
	($linkurl,$alarm) = CleanLinkURL($DisarmLinkURL, $DisarmBaseURL);

	# Has it fallen foul of the numeric-ip phishing net? Must treat x
	# like a digit so it catches 0x41 (= 'A')
	$numbertrap = ($linkurl !~ /[<>g-wyz]+/) ? 1 : 0;

	if ($alarm ||
		$squashedtext =~ /^(w+|ft+p|fpt+|ma[il]+to)([.,]|\%2e)/i || 
		$squashedtext =~ /[.,](com|org|net|info|biz|ws)/i ||
		$squashedtext =~ /[.,]com?[.,][a-z][a-z]/i ||
		$squashedtext =~ /^(ht+ps?|ft+p|fpt+|mailto|webcal)[:;](\/\/)?(.*(\.|\%2e))/i ||
		$numbertrap) {

		$squashedtext =~  s/^(ht+ps?|ft+p|fpt+|mailto|webcal)[:;](\/\/)?(.*(\.|\%2e))/$3/i;
		$squashedtext =~ s/^.*?-http:\/\///; # 20080206 Delete common pre-pended text
		$squashedtext =~ s/\/.*$//; # Only compare the hostnames
		$squashedtext =~ s/[,.]+$//; # Allow trailing dots and commas
		$squashedtext = 'www.' . $squashedtext
		unless $squashedtext =~ /^ww+|ft+p|fpt+|mailto|webcal/ || $numbertrap;

		# If we have already tagged this link as a phishing attack, spot the
		# warning text we inserted last time and don't tag it again.
		my $squashedpossible = lc($PhishingWarningText);
		my $squashedsearch   = lc($DisarmLinkText);
		$squashedpossible =~ s/\s//g;
		$squashedpossible =~ s/(\<\/?[^>]*\>)*//ig; # Remove tags
		$squashedsearch   =~ s/\s//g;
		$squashedsearch   =~ s/(\<\/?[^>]*\>)*//ig; # Remove tags
		$squashedpossible = quotemeta($squashedpossible);
		if ($squashedtext =~ /$squashedpossible/) {
			return '';
		}

		#
		# Strict Phishing Net Goes Here
		#
		if ($alarm ||
			($linkurl ne "" && $squashedtext !~ /^(w+\.)?\Q$linkurl\E\/?$/)
			|| ($linkurl ne "" && $numbertrap)) {
			if ($linkurl eq "" || !$numbertrap || $linkurl ne $squashedtext) {
				return $linkurl;
			}
		}
	}

	return '';
}

1;
