#
# Bayes Spam Filter
#	ベイズ理論を用いたコメントスパムフィルター。
#
# 2011.04.24
#	・設定が保存出来なくなっていた不具合を修正。 
#	・デバッグログに表示する情報を詳細にした。
#	・分かち書きの処理を多少改善した。
#	・１文字のキーワードも保存するようにした。
#	・スパム判定されてしまったハムコメントを気づかずにいることがあるため判定結果をメールで送信するようにした。
#	・とりあえずトラックバックには機能しないようにした。
#
#参考：http://www.sixapart.jp/movabletype/manual/object_reference/#archive
#
package MT::Plugin::BayesSpamFilter;

use strict;
use base 'MT::Plugin';
use MT;
use MT::I18N;
use MT::Template::Context;
use MT::JunkFilter qw(ABSTAIN);
use MT::Entry;
use MT::Mail;
use Encode;
use Algorithm::NaiveBayes;
use bsf_wakati;

use constant VERSION => '0.20';

my $plugin = MT::Plugin::BayesSpamFilter->new({
	name => 'Bayse Spam Filter',
	description => "ベイズ理論を利用したスパムフィルタです。",
	version => VERSION,
	doc_link => 'http://blog.kumacchi.com/2011/01/movale_type.html',
	blog_config_template => \&blog_config_template,
	settings => new MT::PluginSettings([
		['spam_score',             { Default => '-1' }],
		['mail_send_spam',         { Default => 1    }],
		['mail_send_ham',                             ],
		['mail_send_unclassified', { Default => 1    }],
		['mail_address'                               ],

	]),
	author_name => 'kumacchi',
	author_link => 'http://blog.kumacchi.com/'
});
MT->add_plugin($plugin);
MT->register_junk_filter({ name => $plugin->name, plugin => $plugin, code => sub { $plugin->score(@_) },});

sub score{
	my ($plugin, $obj) = @_;

	&debug_print("----------\n");

	my $entry = '';
	my $ip = '';
	my $permalink = '';
	my $id = '';
	if(eval{$obj->isa('MT::TBPing')}){
		&debug_print("MT::TBPing\n");
		return (ABSTAIN, '未判別(unclassified)');

		eval{&debug_print("id:".$obj->id."\n");};
		eval{&debug_print("blog_id:".$obj->blog_id."\n");};
		eval{$ip = $obj->ip; &debug_print("ip:$ip\n");};
		eval{&debug_print("tb_id:".$obj->tb_id."\n");};
		eval{&debug_print("title:".$obj->title."\n");};
		eval{&debug_print("excerpt:".$obj->excerpt."\n");};
		eval{&debug_print("source_url:".$obj->source_url."\n");};
		eval{&debug_print("blog_name:".$obj->blog_name."\n");};
		eval{&debug_print("created_on:".$obj->created_on."\n");};
		eval{&debug_print("modified_on:".$obj->modified_on."\n");};
		$entry = MT::Entry->load( $obj->tb_id );
	}

	if(eval{$obj->isa('MT::Comment')}){
		&debug_print("MT::Comment\n");
		eval{&debug_print("id:".$obj->id."\n");};
		eval{&debug_print("blog_id:".$obj->blog_id."\n");};
		eval{$ip = $obj->ip; &debug_print("ip:$ip\n");};
		eval{&debug_print("entry_id:".$obj->entry_id."\n");};
		eval{&debug_print("author:".$obj->author."\n");};
		eval{&debug_print("commenter_id:".$obj->commenter_id."\n");};
		eval{&debug_print("email:".$obj->email."\n");};
		eval{&debug_print("url:".$obj->url."\n");};
#		eval{&debug_print("text:".$obj->text."\n");};
		eval{&debug_print("visible:".$obj->visible."\n");};
		eval{&debug_print("created_on:".$obj->created_on."\n");};
		eval{&debug_print("modified_on:".$obj->modified_on."\n");};
		$entry = MT::Entry->load( $obj->entry_id );
	}

	eval{&debug_print("title:".$entry->title."\n");};
	
	eval{$permalink = $entry->permalink; &debug_print("permalink:$permalink\n");};
#	eval{&debug_print("archive_file:".$entry->archive_file."\n");};
#	eval{&debug_print("archive_url:".$entry->archive_url."\n");};


#	&debug_print("all_text:".$obj->all_text."\n");

	my $all_text = $obj->all_text;

	my $charset = MT::ConfigMgr->instance->PublishCharset;
	$all_text = MT::I18N::encode_text($all_text, $charset, 'utf8');

	&debug_print("\nORIGINAL: $all_text\n");

	my $spam_score             = $plugin->get_config_value('spam_score',             'blog:' . $obj->blog_id) || -1;
	my $mail_send_spam         = $plugin->get_config_value('mail_send_spam',         'blog:' . $obj->blog_id) || -1;
	my $mail_send_ham          = $plugin->get_config_value('mail_send_ham',          'blog:' . $obj->blog_id) || -1;
	my $mail_send_unclassified = $plugin->get_config_value('mail_send_unclassified', 'blog:' . $obj->blog_id) || -1;
	my $mail_address           = $plugin->get_config_value('mail_address',           'blog:' . $obj->blog_id) || -1;

	my $nb = Algorithm::NaiveBayes->new;
	
	my %ham = &getHash($plugin->envelope."/ham/_ham_log.cgi");
	if(%ham){
		$nb->add_instance(attributes => {%ham},	label => 'ham');
	}else{
	}

	my %spam = &getHash($plugin->envelope."/spam/_spam_log.cgi");
	if(%spam){
		$nb->add_instance(attributes => {%spam},label => 'spam');
	}else{
	}

	$nb->train;

	my $wakati = &bsf_wakati::wakati($all_text);

	&debug_print("\nWAKATI: $wakati\n");

	my %comment = &makeHash($wakati);
	my $result = $nb->predict(attributes => {%comment});


	my $cnt=0;
	foreach my $key(sort { ${$result}{$b} <=> ${$result}{$a} } keys %{$result}){
		$cnt++;
		&debug_print(sprintf("%02d %4s = %s\n",$cnt,$key,${$result}{$key}));
	}

my $body=<<"EOF";
$all_text

コメントの付いた記事のURL:
$permalink

コメント投稿者のIP:
$ip
EOF



	if($result->{spam} > $result->{ham}){
		&debug_print("スパム判定\n");
		if($mail_send_spam eq '1'){&mail_send($mail_address,'spam',$body."\n判定: スパム(spam)");}
		return ($spam_score,'スパム(spam)');
	}
	elsif($result->{spam} < $result->{ham}){
		&debug_print("ハム判定\n");
		if($mail_send_ham eq '1'){&mail_send($mail_address,'ham',$body."\n判定: ハム(ham)");}
		return (1,'ハム(ham)');
	}
	else{
		&debug_print("未判定(unclassified)\n");
		if($mail_send_unclassified eq '1'){&mail_send($mail_address,'unclassified',$body."\n判定: 未判別(unclassified)");}
		return (ABSTAIN, '未判別(unclassified)');
	}
}

sub mail_send{
	my ($mail_address,$result,$all_text) = @_;

#	my %head = ( To => 'foo@bar.com', Subject => 'My Subject' );
	my %head = ( To => $mail_address, Subject => 'Bayse Spam Filter 通知メール' );
#	my $body = 'This is the body of the message.';
	my $body = $all_text;
#	eval{
#		MT::Mail->send(\%head, $body) || die MT::Mail->errstr;
#	};

	unless(MT::Mail->send(\%head, $body)){
		&debug_print("MT::Mail:errstr = ".MT::Mail->errstr);
		die MT::Mail->errstr;
	}
}

sub blog_config_template {
	my $plugin = shift;
	my ($param,  $scope) = @_;

	&debug_print("設定画面表示 $scope\n");

my $str =<<EOF;
<p>
ベイズ理論によってコメントスパム・トラックバックスパムを判定します。
</p>
<p>
スパム又はハムをプラグインに学習させることが出来ます。学習させるほど判定の正確さが増します。スパムの場合はスパムのエリアにスパムの文章を貼り付けて[変更を保存]をクリック。正常な文章がスパムと誤って認識された場合はその文章をハムに貼りつけて[変更を保存]をクリックして下さい。
</p>
・ハム(ham)<br />
<textarea name="ham" style="width:400px; height:100px;"></textarea><br />
・スパム(spam)<br /><br />
<textarea name="spam" style="width:400px; height:100px;"></textarea><br />
</p>
<p>スパム判定時のスコア。</p>
<p>
・スコア <input type="text" size="3" name="spam_score" value="<TMPL_VAR NAME=SPAM_SCORE ESCAPE=HTML>" />
</p>
<p>●通知メールの設定</p>
<p>判定結果を指定したメールアドレスに送信するかどうかの設定です。ご認識されてしまった通常のコメントを救済するために通常は設定しておいたほうがいいと思います。</p>
<p>
<input type="checkbox" name="mail_send_spam" value="1" <TMPL_IF NAME=MAIL_SEND_SPAM>checked="checked" </TMPL_IF>/>スパム判定時に通知メールを送信する。<br />
<input type="checkbox" name="mail_send_ham" value="1" <TMPL_IF NAME=MAIL_SEND_HAM>checked="checked" </TMPL_IF>/>ハム判定時に通知メール送信する。<br />
<input type="checkbox" name="mail_send_unclassified" value="1" <TMPL_IF NAME=MAIL_SEND_UNCLASSIFIED>checked="checked" </TMPL_IF>/>未判定時に通知メールを送信する。 <br />
</p>
<p>通知先メールアドレス</p>
<p>
 <input type="text" size="50" name="mail_address" value="<TMPL_VAR NAME=MAIL_ADDRESS ESCAPE=HTML>" />
</p>
EOF

#	$str = Encode::encode_utf8($str);
	$str;
}



sub save_config {
	my $plugin = shift;
	my ($param, $scope) = @_;

	$param->{ham} =~ s/　/ /g;
	$param->{ham} =~ s/^\s+//;
	$param->{ham} =~ s/\s+$//;

	$param->{spam} =~ s/　/ /g;
	$param->{spam} =~ s/^\s+//;
	$param->{spam} =~ s/\s+$//;

	if($param->{ham}){
		my $hfile = $plugin->envelope."/ham/".&makeTimestamp(time)."_log.cgi";
		open(FILE,">$hfile");
		print FILE $param->{ham}."\n";
		close(FILE);
	}

	if($param->{spam}){
		my $sfile = $plugin->envelope."/spam/".&makeTimestamp(time)."_log.cgi";
		open(FILE,">$sfile");
		print FILE $param->{spam}."\n";
		close(FILE);
	}

	my %ham = makeHashDir($plugin->envelope.'/ham');
	open(FILE,">:utf8",$plugin->envelope."/ham/_ham_log.cgi");
	foreach my $key(sort {$ham{$b} <=> $ham{$a}} keys %ham){
		print FILE join("\t",$key,$ham{$key},"\n");
	}
	close(FILE);

	my %spam = makeHashDir($plugin->envelope.'/spam');
	open(FILE,">:utf8",$plugin->envelope."/spam/_spam_log.cgi");
	foreach my $key(sort {$spam{$b} <=> $spam{$a}} keys %spam){
		print FILE join("\t",$key,$spam{$key},"\n");
	}
	close(FILE);

	my $ret = $plugin->SUPER::save_config($param, $scope);
	$ret;
}

sub makeHashDir{
	my $dir = shift;

	my %hash = ();
	opendir(DIR,$dir);
	while(my $file = readdir(DIR)){
		next if($file eq '.');
		next if($file eq '..');
		next if($file !~ /log\.cgi$/);
		next if($file =~ /^_/);

		$/ = undef;
		open(FILE,"<:utf8",$dir."/$file");
		my $text = <FILE>;
		close(FILE);
		$/ = "\n";

		my $str = &bsf_wakati::wakati($text);

		my @words = split(/ /,$str);
		foreach my $key (@words){
			next if(length($key) < 1);
			if(length($key) > 45){
				$key = substr($key,0,45);
			}
			$hash{$key}++;
			if($key =~ /(\.[a-z]{2,4})$/){
				$hash{"$1"}++;
			}
			if($key =~ /([a-z0-9\-]+\.[a-z]{2,4})$/){
				$hash{"$1"}++;
			}
		}
	}
	closedir(DIR);
	%hash;
}

sub getHash{
	my $file = shift;

	&debug_print("getHash:$file\n");

	my %hash = ();

	open(FILE,"<:utf8",$file);
	while(<FILE>){
		chomp;
		my ($key,$num) = split(/\t/);
		$hash{"$key"} = $num;
	}
	close(FILE);

	%hash;
}

sub makeHash{
	my $str = shift;

	my @ary = split(/ /,$str);
	my %hash=();
	foreach my $key (@ary){
		next if(length($key) < 1);
		if(length($key) > 45){
			$key = substr($key,0,45);
		}
		$hash{"$key"}++;
		if($key =~ /(\.[a-z]{2,4})$/){
			$hash{"$1"}++;
		}
		if($key =~ /([a-z0-9\-]+\.[a-z]{2,4})$/){
			$hash{"$1"}++;
		}
	}
	%hash;
}

# timestamp
sub makeTimestamp{
	my $time = shift;
	my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($time);
	sprintf("%04d-%02d-%02d-%02d-%02d-%02d",$year+1900,$mon+1,$mday,$hour,$min,$sec);
}

sub debug_print{ 
	open(FILE,"+<:utf8",$plugin->envelope."/debug_log.cgi");
	flock(FILE,2);

	my $str = &makeTimestamp(time) . ' ' . Encode::decode_utf8($_[0]);

	my @dbglog = <FILE>; 
	push(@dbglog,$str);

	while(scalar(@dbglog) > 1000){ 
		shift(@dbglog); 
	}

	seek(FILE,0,0); 
	foreach(@dbglog){ 
		print FILE $_; 
	}

	truncate(FILE,tell(FILE)); 
	close(FILE);

#	 print encode('cp932',$str); 
}

1;
