#! /usr/local/bin/ruby -Ke
# encoding: euc-jp

##
## feml.rb: Bayesian Spam Filter
## Copyright (C) 2009 KOYAMA Hiro <tac@amris.co.jp>
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software
## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
##

require 'optparse';
require 'kconv';
require 'iconv';
require 'fileutils';
require 'pstore';

require 'bigdecimal';
require 'bigdecimal/util';
##require 'bigdecimal/math';
##include BigMath;

require 'MeCab';

AmConf = {};

# =====================================================================
# Account and directory layout.
# NOTE(review): '%user%' looks like a placeholder that is substituted
# elsewhere; the substitution is not visible in this part of the file.
#
AmConf['User'] = 'tac';
AmConf['Base'] = '/usr/home/%user%';
AmConf['MailHome'] = "#{AmConf['Base']}/Feml";
AmConf['ConfHome'] = "#{AmConf['MailHome']}/conf";

# =====================================================================
# Token occurrence database (PStore) and white-list pattern files.
#
AmConf['OccurenceStore'] = "#{AmConf['ConfHome']}/feml_store.pstore";
AmConf['UfromWhiteList'] = "#{AmConf['ConfHome']}/ufrom_white_list.txt";
AmConf['FromWhiteList'] = "#{AmConf['ConfHome']}/from_white_list.txt";
AmConf['ReceivedWhiteList'] = "#{AmConf['ConfHome']}/received_white_list.txt";

# =====================================================================
# Log file and per-classification mail directories.
#
AmConf['LogFile'] = "#{AmConf['MailHome']}/feml_log.txt";
AmConf['BackupMailDir'] = "#{AmConf['MailHome']}/backup";
AmConf['CleanMailDir'] = "#{AmConf['MailHome']}/clean";
AmConf['WhiteMailDir'] = "#{AmConf['MailHome']}/white";
AmConf['DirtyMailDir'] = "#{AmConf['MailHome']}/dirty";

# =====================================================================
#
##AmConf['AddHeaderIfClean'] = "X-Feml: clean";
##AmConf['AddHeaderIfWhite'] = "X-Feml: white";
##AmConf['AddHeaderIfDirty'] = "X-Feml: dirty";
		# NOTE(review): presumably, when defined, the message is
		# output with this header added according to the
		# classification result -- confirm against the consumer.

# =====================================================================
#
##AmConf['DontUpdateTokenDB'] = 'yes';
		# When this is a non-empty string,
		# Tokenizer#update_occurences returns without updating
		# the token database.

# =====================================================================
#
AmConf['MailBoxFile'] = '/var/mail/%user%';
		# NOTE(review): presumably mail that is not judged to be spam
		# is appended to this file, and written to $stdout when this
		# is empty -- the consumer is not visible here; confirm.

AmConf['LockFile'] = '/tmp/FEML_LOCK_FILE.txt';

# =====================================================================
# Indicator value at or above which a message is treated as spam
# (see SpamProb#probability).
#
AmConf['SpamThreshold'] = 0.95;

# ---------------------------------------------------------------------
# ---------------------------------------------------------------------
#
FemlVersion = 'feml-0.2.0';

# =====================================================================
# Spam probability evaluation.
#
class SpamProb

	# =============================================================
	#
	def initialize(source = '', log_fp = nil)
		# source: text to classify; nil/false is treated as ''.  It is
		#         converted with String#toeuc before being stored.
		# log_fp: IO that receives diagnostics; $stderr when omitted.
		text = source ? source : '';
		@source = text.toeuc;
		@log_fp = log_fp ? log_fp : $stderr;
	end

	# =============================================================
	# Remove the parts that must not be tokenized (such as HTML tags).
	#
	def remove_tags(src)
		# Returns a copy of +src+ in which every recognised HTML tag
		# (opening, closing, self-closing), every <!DOCTYPE ...>
		# declaration and every literal "<?>" marker is replaced by a
		# single space.  Tags whose name is not listed are left alone.
		tag_names = [
			"html|head|body",
			"frameset|frame|noframes",
			"title|script|meta|link|style",
			"br|p|div|span|hr",
			"pre|blockquote|q|address",
			"h1|h2|h3|h4|h5|h6",
			"strong|em|dfn|code|samp|kbd|var|cite|abbr|acronym",
			"sup|sub",
			"a|img",
			"table|caption|thead|tfoot|tbody|tr|th|td",
			"ul|ol|li|dl|dt|dd",
			"applet|object",
			"form|input|textarea|select|option|label|button",
			"tt|b|u|i|s|strike|big|small|blink|marquee",
			"font|basefont|center|map|area|iframe",
		].join('|');

		# The patterns are applied one after another, in this order.
		# (A pattern removing whole <style>...</style> sections is
		# intentionally not part of the list.)
		[
			%r|<!DOCTYPE\s.+?>|im,
			%r|<(#{tag_names})\s.*?>|im,
			%r|<(#{tag_names})>|im,
			%r|</(#{tag_names})>|im,
			%r|<(#{tag_names})/>|im,
			%r|<\?>|im,
		].inject(src) { |text, pattern| text.gsub(pattern, ' ') };
	end

	# =============================================================
	# Compute the spam / non-spam probabilities and return the combined indicator.
	#
	def probability()
		# Tokenizes @source (with HTML tags removed), combines the
		# per-token spam probabilities with a chi-square test and
		# returns the indicator s = (1 + pp - qq) / 2.
		# Returns 0 when no token is found.
		# Side effects: writes diagnostics to @log_fp and updates the
		# token database through Tokenizer#update_occurences.
		tokenizer = Tokenizer.new(remove_tags(@source));
		prob_array = tokenizer.get_spam_probability_array;
			# [ [ <token w>, <probability p(w)>, <DB occurrence count n(w)> ], ... ]
		return 0 if prob_array.size == 0;

		# -----------------------------------------------------
		# Average of the simple spam probabilities over the distinct
		# entries.  It is assigned to robx below, but that assignment
		# is overwritten before use.
		#
		num = 0;
		sum = 0.0;
		prob_array.uniq.each do |token, prob, db_occur|
			num += 1;
			sum += prob;
		end
		avg_prob = sum / num;

		# -----------------------------------------------------
		# Build a lookup table so that, for each token,
		#	token_stat_hash[token].raw_prob : simple spam probability
		#	token_stat_hash[token].db_occur : occurrence count in the DB
		#	token_stat_hash[token].mod_prob : corrected spam probability
		# can be looked up.
		#
		token_stat = Struct.new(:raw_prob, :db_occur, :mod_prob);
		token_stat_hash = Hash.new();
		prob_array.uniq.each do |token, raw_prob, db_occur|
			token_stat_hash[token] =
				token_stat.new(raw_prob, db_occur, 0);
		end

		# -----------------------------------------------------
		# Store the corrected probability in
		# token_stat_hash[token].mod_prob:
		# f(w) = ((robs * robx) + (n * p(w))) / (robs + n)
		#	where:
		#		robx = assumed probability for unseen tokens
		#		robs = strength given to robx
		#		n = occurrence count in the DB
		#
		robs = 0.001;
		robx = avg_prob;

		# Alternative setting: assumed probability 0.5, strength 1.0.
		# These assignments replace the two above, so they are the
		# values actually used.
		#
		robs = 1.0;
		robx = 0.5;

		token_stat_hash.each_pair do |token, st|
			pw = st.raw_prob;
			n = st.db_occur;
			fw = ((robs * robx) + (pw * n)) / (robs + n);
			token_stat_hash[token].mod_prob = fw;
		end

		# -----------------------------------------------------
		# Log the parameters and one line per distinct token.
		# NOTE(review): the non-ASCII parts of these log strings look
		# mis-encoded in this view; they are left untouched.
		#
		@log_fp.print "robx = #{robx} : ͽ¬Ψ\n";
		@log_fp.print "robs = #{robs} : ͽ¬Ψζ\n";
		@log_fp.print "** ñSPAMΨ:DBи:SPAMΨ:\n";
		token_stat_hash.each_pair do |token, st|
			if st.db_occur <= 0
				@log_fp.printf("---------- :       0 : %9.5f%% : %s\n",
					st.mod_prob * 100, token);
			else
				@log_fp.printf("%9.5f%% : %7d : %9.5f%% : %s\n",
					st.raw_prob * 100, st.db_occur,
					st.mod_prob * 100, token);
			end
		end

		# -----------------------------------------------------
		# p = ((1 - f(w1)) * ... * (1 - f(wm)))
		# q = (f(w1) * ... * f(wn))
		# lp = log(p)
		# lq = log(q)
		#
		# The products are accumulated as sums of logarithms instead
		# of being multiplied directly (the original note refers to
		# values becoming smaller than Float::EPSILON):
		# lq = log(q) = SUM log(f(w))
		# lp = log(p) = SUM log(1 - f(w))
		# The loop runs over prob_array itself, so a token that occurs
		# several times contributes several times.
		#
		lp = 0;
		lq = 0;
		prob_array.each do |token, raw_prob, db_occur|
			fw = token_stat_hash[token].mod_prob;
			lp += Math::log(1 - fw);
			lq += Math::log(fw);
		end
		n = prob_array.size;
		@log_fp.print "[A] lp = log(1 - fw) = #{lp}, lq = log(fw) = #{lq}\n";
		@log_fp.print "[A] p = (1 - fw) = #{Math::exp(lp)}, q = fw = #{Math::exp(lq)}\n";
		@log_fp.print "[A] n = #{n} : θĿ\n";

		# -----------------------------------------------------
		# pp = 1 - chi-square(-2 * log(p), 2 * n)
		# qq = 1 - chi-square(-2 * log(q), 2 * n)
		# s = (1 + pp - qq) / 2                   ; indicator
		#
		pp = 1 - chi2q(-2.0 * lp, 2 * n);
		qq = 1 - chi2q(-2.0 * lq, 2 * n);
		@log_fp.print "[A] pp = 1 - 2(-2 * lp, 2 * n) = #{pp}\n";
		@log_fp.print "[A] qq = 1 - 2(-2 * lq, 2 * n) = #{qq}\n";
		s = (1.0 + pp - qq) / 2.0;
		@log_fp.print "[A] s = (1.0 + pp - qq) / 2.0 = #{s}\n";

		# =====================================================
		# Update the token database according to the verdict:
		# below AmConf['SpamThreshold'] counts as clean, otherwise spam.
		#
		if s < AmConf['SpamThreshold'].to_f
			tokenizer.update_occurences(:add_clean);
		else
			tokenizer.update_occurences(:add_spam);
		end

		return s;
	end

	# --------------------------------------------------------------
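	# Chi-square upper-tail probability.
	#	With m = x2 / 2:
	#	s = exp(-m) * SUM(m^i / i!)
	#		where the sum runs over i = 0 .. n / 2 - 1
	#	Computed with BigDecimal because the values can exceed the range of Float.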
	#
	def chi2q(x2, n)
		# Upper-tail probability of the chi-square distribution for an
		# even number of degrees of freedom:
		#	Q = exp(-m) * SUM(m^i / i!),  m = x2 / 2,  i = 0 .. n/2 - 1
		# x2: chi-square statistic;  n: degrees of freedom.
		# Returns a number clamped to 0 .. 1.0.
		m = x2 / 2.0;

		# m.truncate below cannot handle NaN / Infinity, so answer the
		# limiting values directly.
		unless m.finite?
			return 1.0 if m.nan? || m < 0;
			return 0;
		end

		sig = 63;				# significant digits kept
		# Kernel#BigDecimal is used because BigDecimal::new no longer
		# exists in current versions of the bigdecimal library.
		t = BigDecimal("1.0");
		s = BigDecimal("1.0");
		(1 .. (n / 2) - 1).each do |i|
			# t = m^i / i!,  s = running sum
			t = t.mult((m / i).to_d, sig);
			s = s.add(t, sig);
		end

		# Multiply by exp(-m) without calling Math::exp(-m), which
		# underflows to 0 for large m: divide by e once per unit of
		# the integer part, then by exp(fractional part).
		m_int_part = m.truncate;
		m_frac_part = m - m_int_part;
		e = Math::E.to_d;
		m_int_part.times do |j|
			s = s.div(e, sig);
		end
		s = s.div(Math::exp(m_frac_part).to_d, sig);

		if s.nan?
			return 1.0
		else
			# A value too small for Float may come back as 0.0 (or
			# a tiny positive number) from to_f.
			s = s.to_f;
			return [0, [s, 1.0].min].max;
		end
	end

end

# =====================================================================
# Parse and decode the headers and body of an e-mail message.
#
class ExDecoder

	# =============================================================
	#
	def initialize()
		# Start with no raw lines, no parsed headers and an empty
		# decoded body.
		@mbuf_body = '';
		@mbuf_header_array = [];
		@mbuf_array = [];
	end

	# =============================================================
	# Read the e-mail text from a file.
	#
	def load(file_name)
		# Reads and parses the message stored in +file_name+.
		# The block form closes the file even when load_fp raises, and
		# File.open (unlike Kernel#open) never treats a name starting
		# with "|" as a command to run.
		File.open(file_name, "r") do |fp|
			load_fp(fp);
		end
		return nil;
	end

	# =============================================================
	# Read the e-mail text from an open IO object.
	#
	def load_fp(fp)
		# Takes every line of +fp+ as the raw message, then analyses
		# the header section followed by the body.
		@mbuf_array = fp.readlines;
		parse_header;
		parse_body;
	end

	# =============================================================
	# Read the e-mail text from an array of lines.
	#
	def load_array(buf_array)
		# Uses +buf_array+ itself (not a copy) as the raw message
		# lines, then analyses the header section followed by the body.
		@mbuf_array = buf_array;
		parse_header;
		parse_body;
	end

	# =============================================================
	# Write the e-mail text, unmodified, to a file.
	#
	def store(file_name)
		# Writes the raw message to +file_name+ (truncating it).
		# The block form closes (and flushes) the file even when
		# store_fp raises, and File.open never treats a name starting
		# with "|" as a command to run.
		File.open(file_name, "w") do |fp|
			store_fp(fp);
		end
		return nil;
	end

	# =============================================================
	# Write the e-mail text, unmodified, to an open IO object.
	#
	def store_fp(fp)
		# Prints every raw line, in order, to +fp+.
		@mbuf_array.each { |line| fp.print(line) };
	end

	# =============================================================
	# Add a header line to the raw message.
	# Headers already analysed by parse_header() are not affected.
	#
	def add_header(header)
		# Inserts +header+ (normalised to end in exactly one "\n")
		# just before the first blank line ("\n") or nil entry of the
		# raw message, i.e. at the end of the header section.
		# Nothing is inserted when no such line exists.
		delim_at = @mbuf_array.index { |line| line.nil? || line == "\n" };
		return nil if delim_at.nil?;
		@mbuf_array.insert(delim_at, header.chomp + "\n");
	end

	# =============================================================
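	# Replace the characters of ISO-8859-1 (Latin-1) and of the
	# windows-1252 C1 area with suitable ASCII characters.
	#
	# Text labelled "charset=iso-8859-1" is often actually encoded as
	# windows-1252, so it is interpreted as windows-1252.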
	#
	def windows_1252_to_ascii(src)
		# Returns a new string in which every byte 0x80-0xFF of +src+
		# is replaced by the ASCII approximation listed below; bytes
		# below 0x80 are copied unchanged.
		# +src+ itself is not modified: the bytes are read from a
		# binary-tagged copy, so frozen strings are accepted and the
		# caller's string keeps its encoding.
		m_hash = {
			0x80 => 'Euro',		# euro sign
			0x81 => ' ',		# [UNDEFINED]
			0x82 => ',',		# single low-9 quote
			0x83 => 'f',		# f with hook
			0x84 => '"',		# double low-9 quote
			0x85 => '-',		# midline horizontal ellipsis
			0x86 => '$',		# dagger
			0x87 => '$',		# double dagger
			0x88 => '^',		# circumflex
			0x89 => '%',		# permille
			0x8A => 'S',		# S caron
			0x8B => '<',		# single left angle quote
			0x8C => 'OE',		# OE ligature
			0x8D => ' ',		# [UNDEFINED]
			0x8E => 'Z',		# Z caron
			0x8F => ' ',		# [UNDEFINED]
			0x90 => ' ',		# [UNDEFINED]
			0x91 => '\'',		# left single quotation
			0x92 => '\'',		# right single quotation
			0x93 => '"',		# left double quotation
			0x94 => '"',		# right double quotation
			0x95 => '-',		# bullet
			0x96 => '-',		# en-dash
			0x97 => '-',		# em-dash
			0x98 => '~',		# small tilde
			0x99 => '(TM)',		# tm: trademark
			0x9A => 's',		# s caron
			0x9B => '>',		# single right angle quote
			0x9C => 'oe',		# oe ligature
			0x9D => ' ',		# [UNDEFINED]
			0x9E => 'z',		# z caron
			0x9F => 'Y',		# Y diaeresis

			0xA0 => ' ',		# &nbsp;
			0xA1 => '!',		# &iexcl;
			0xA2 => '$',		# &cent;
			0xA3 => '$',		# &pound;
			0xA4 => '$',		# &curren;
			0xA5 => '\\',		# &yen;
			0xA6 => '|',		# &brvbar;
			0xA7 => '$',		# &sect;
			0xA8 => '~',		# &uml;
			0xA9 => '(C)',		# &copy;
			0xAA => 'a',		# &ordf;
			0xAB => '<',		# &laquo;
			0xAC => '~',		# &not;
			0xAD => '-',		# &shy; (soft hyphen)
			0xAE => '(R)',		# &reg;
			0xAF => '~',		# &macr;
			0xB0 => '$',		# &deg;
			0xB1 => '$',		# &plusmn;
			0xB2 => '2',		# &sup2;
			0xB3 => '3',		# &sup3;
			0xB4 => '\'',		# &acute;
			0xB5 => '$',		# &micro;
			0xB6 => '$',		# &para;
			0xB7 => '-',		# &middot;
			0xB8 => ',',		# &cedil;
			0xB9 => '1',		# &sup1;
			0xBA => '0',		# &ordm;
			0xBB => '>',		# &raquo;
			0xBC => '1/4',		# &frac14;
			0xBD => '1/2',		# &frac12;
			0xBE => '3/4',		# &frac34;
			0xBF => '?',		# &iquest;
			0xC0 => 'A',		# &Agrave;
			0xC1 => 'A',		# &Aacute;
			0xC2 => 'A',		# &Acirc;
			0xC3 => 'A',		# &Atilde;
			0xC4 => 'A',		# &Auml;
			0xC5 => 'A',		# &Aring;
			0xC6 => 'A',		# &AElig;
			0xC7 => 'C',		# &Ccedil;
			0xC8 => 'E',		# &Egrave;
			0xC9 => 'E',		# &Eacute;
			0xCA => 'E',		# &Ecirc;
			0xCB => 'E',		# &Euml;
			0xCC => 'I',		# &Igrave;
			0xCD => 'I',		# &Iacute;
			0xCE => 'I',		# &Icirc;
			0xCF => 'I',		# &Iuml;
			0xD0 => 'D',		# &ETH;
			0xD1 => 'N',		# &Ntilde;
			0xD2 => 'O',		# &Ograve;
			0xD3 => 'O',		# &Oacute;
			0xD4 => 'O',		# &Ocirc;
			0xD5 => 'O',		# &Otilde;
			0xD6 => 'O',		# &Ouml;
			0xD7 => 'x',		# &times;
			0xD8 => '0',		# &Oslash;
			0xD9 => 'U',		# &Ugrave;
			0xDA => 'U',		# &Uacute;
			0xDB => 'U',		# &Ucirc;
			0xDC => 'U',		# &Uuml;
			0xDD => 'Y',		# &Yacute;
			0xDE => '$',		# &THORN;
			0xDF => 'ss',		# &szlig;
			0xE0 => 'a',		# &agrave;
			0xE1 => 'a',		# &aacute;
			0xE2 => 'a',		# &acirc;
			0xE3 => 'a',		# &atilde;
			0xE4 => 'a',		# &auml;
			0xE5 => 'a',		# &aring;
			0xE6 => 'a',		# &aelig;
			0xE7 => 'c',		# &ccedil;
			0xE8 => 'e',		# &egrave;
			0xE9 => 'e',		# &eacute;
			0xEA => 'e',		# &ecirc;
			0xEB => 'e',		# &euml;
			0xEC => 'i',		# &igrave;
			0xED => 'i',		# &iacute;
			0xEE => 'i',		# &icirc;
			0xEF => 'i',		# &iuml;
			0xF0 => 'o',		# &eth;
			0xF1 => 'n',		# &ntilde;
			0xF2 => 'o',		# &ograve;
			0xF3 => 'o',		# &oacute;
			0xF4 => 'o',		# &ocirc;
			0xF5 => 'o',		# &otilde;
			0xF6 => 'o',		# &ouml;
			0xF7 => '/',		# &divide;
			0xF8 => 'o',		# &oslash;
			0xF9 => 'u',		# &ugrave;
			0xFA => 'u',		# &uacute;
			0xFB => 'u',		# &ucirc;
			0xFC => 'u',		# &uuml;
			0xFD => 'y',		# &yacute;
			0xFE => '$',		# &thorn;
			0xFF => 'y',		# &yuml;
		};

		# In a binary-tagged string every "character" is one byte.
		bytes = src.dup.force_encoding(Encoding::BINARY);
		result = bytes.each_char.collect { |ch|
			m_hash[ch.ord] || ch;
		}.join('');
		return result;
	end

	# =============================================================
	# Convert the character encoding with Iconv.
	#
	def decode_by_iconv(src, to_enc, from_enc, repl = '<?>')
		# Converts +src+ from +from_enc+ to +to_enc+ with Iconv.conv.
		# Whenever Iconv reports a failure (Iconv::Failure), the text
		# converted so far is kept, +repl+ is appended, the first byte
		# of the unconverted remainder is dropped and the conversion
		# is retried on what is left.
		converted = '';
		pending = src.clone;
		until pending.empty?
			begin
				converted += Iconv.conv(to_enc, from_enc, pending);
				pending = '';
			rescue Iconv::Failure => failure
				converted += failure.success;
				converted += repl;
				pending = failure.failed[0][1 .. -1] || '';
			end
		end
		return converted;
	end

	# =============================================================
	# Convert text in the given character set to an EUC-JP string.
	#
	def buf_decode(mbuf, charset = nil)
		# Decodes +mbuf+, labelled with +charset+ (case-insensitive,
		# may be nil), and returns the result of the steps below:
		#  1. charsets NKF does not handle are converted first
		#     (Iconv, String#encode or the windows-1252 ASCII table);
		#  2. NKF converts the text to EUC-JP, guessing the encoding;
		#  3. MIME encoded-words (=?charset?B?...?= and
		#     =?charset?Q?...?=) are decoded recursively.

		case (charset || '').downcase

		when 'utf-7', 'unicode-1-1-utf-7'
			# Converted via utf-8: the original note reports that
			# a direct utf-7 -> EUC-JP conversion with Iconv failed
			# (IllegalSequence) on some input.
			mbuf = decode_by_iconv(mbuf, 'utf-8', 'utf-7').toeuc;

		when 'iso-2022-jp-2'
			from_enc = 'ISO-2022-JP-2';
			mbuf = decode_by_iconv(mbuf, 'EUC-JP', from_enc);

		when 'gb2312'				# Chinese
			from_enc = Encoding::GBK;		# superset of gb2312
			mbuf = mbuf.encode(Encoding::EUC_JP, from_enc,
				:invalid => :replace,
				:undef => :replace,
				:replace => '<?>');

		when 'iso-8859-5', 'windows-1251',	# Cyrillic
		     'koi8',				# Cyrillic
		     'koi8-r',
		     'koi8-u'
			# NOTE(review): every charset of this group is decoded
			# as windows-1251, including the koi8 family -- confirm
			# that this is intended.
			from_enc = 'windows-1251';
			mbuf = decode_by_iconv(mbuf, 'EUC-JP', from_enc);

		when 'iso-8859-1', 'windows-1252',	# Latin-1
		     'iso-8859-2', 'windows-1250',	# Latin-2
		     'iso-8859-3',			# Latin-3
		     'iso-8859-4',			# Latin-4
		     'iso-8859-9', 'windows-1254',	# Latin-5
		     'iso-8859-10',			# Latin-6
		     'iso-8859-13',			# Latin-7
		     'iso-8859-14', 'windows-1257',	# Latin-8
		     'iso-8859-15',			# Latin-9
		     'iso-8859-16'			# Latin-10
			# The character assignments of these sets differ, but
			# all of them go through the windows-1252 table.
			mbuf = windows_1252_to_ascii(mbuf);

		# Any other charset (us-ascii, shift_jis, euc-jp,
		# iso-2022-jp, utf-8, ...) is left to NKF below.
		end

		# -----------------------------------------------------
		# Let NKF guess the encoding and convert to EUC-JP; this also
		# covers input without a charset.
		# -e: output EUC-JP
		# -I: replace non ISO-2022-JP characters
		# -m0: no MIME decoding (done below instead)
		# --numchar-input: convert numeric character references (&#nnnnn;)
		#
		mbuf = NKF::nkf('-e -I -m0 --numchar-input', mbuf);

		# -----------------------------------------------------
		# Decode MIME encoded-words: B (base64), Q (quoted-printable).
		#
		mbuf = mbuf.gsub(/=\?(.+?)\?B\?([!->@-~]+?)\?=/i) do
			m_charset = $1 || '';
			text = $2;
			buf_decode(text.unpack("m")[0], m_charset);
		end
		mbuf = mbuf.gsub(/=\?(.+?)\?Q\?(.+?)\?=/im) do
			m_charset = $1 || '';
			# In the Q encoding of encoded-words "_" stands for a
			# space (RFC 2047, section 4.2); plain quoted-printable
			# decoding does not know this.
			text = $2.tr('_', ' ');
			buf_decode(text.unpack("M")[0], m_charset);
		end

		return mbuf;
	end

	# -------------------------------------------------------------
	# Analyse the header section.
	#
	def parse_header()
		# Splits @mbuf_array into header lines and body lines and
		# extracts the header values needed to decode the body.
		# Sets: @mbuf_header_array (decoded, continuation lines joined),
		#       @mbuf_body_array, @encoding (lower-cased
		#       Content-Transfer-Encoding, nil when absent),
		#       @content_type_major / @content_type_minor, @charset,
		#       @boundary and @boundary_lines (indexes of the boundary
		#       lines within @mbuf_body_array).

		# Start from a clean state so that loading a second message
		# into the same object does not keep stale values.
		@mbuf_header_array = [];
		@encoding = nil;

		# -----------------------------------------------------
		# Collect the lines up to the first blank line (the
		# header/body delimiter), appending continuation lines
		# (lines starting with white space) to the line before.
		#
		header_body_delim_i = nil;
		@mbuf_array.each_index  do |i|
			mbuf = @mbuf_array[i];
			if mbuf == nil || mbuf == "\n"		## blank line
				header_body_delim_i = i;
				break;
			end
			# A "continuation" with no header before it has
			# nothing to be appended to: keep it as its own line.
			if @mbuf_header_array.empty? || /^\s/ !~ mbuf.toeuc
				@mbuf_header_array << mbuf;
			else				## continuation line
				@mbuf_header_array[-1] += mbuf;
			end
		end

		# Decode after joining, because an encoded word may span a
		# continuation line.  Headers are treated as iso-8859-1,
		# except From:/Subject:, which are passed without a charset
		# so that buf_decode guesses the encoding.
		@mbuf_header_array.each_index do |i|
			mbuf = @mbuf_header_array[i];
			m_charset = 'iso-8859-1';
			if /^(From|Subject):/ =~ mbuf.toeuc
				m_charset = nil;
			end
			@mbuf_header_array[i] = buf_decode(mbuf, m_charset);
		end

		if header_body_delim_i == nil		# no blank line found
			@mbuf_body_array = [];
		else
			@mbuf_body_array = @mbuf_array[header_body_delim_i + 1 .. -1];
		end

		# -----------------------------------------------------
		# Extract what is needed from the header lines:
		# - transfer encoding,
		# - Content-Type with its charset and multipart boundary.
		#
		@content_type_major = 'text';
		@content_type_minor = 'plain';
		@charset = '';
		@boundary = '';
		@mbuf_header_array.each do |mbuf|
			if /^Content-Transfer-Encoding:\s*(.+)$/i =~ mbuf
				# The value is case-insensitive (RFC 2045) and
				# may carry trailing white space or "\r".
				@encoding = $1.strip.downcase;
			elsif /^Content-Type:\s*(.+?)\/(.+?);(.*)?/im =~ mbuf
				@content_type_major = $1.downcase;
				@content_type_minor = $2.downcase;
				rest = $3;
				if /charset\s*=\s*([\'\"]*)([^\s\1\;]+)\1/i =~ rest
					@charset = $2;
				end
				if /boundary\s*=\s*"(.+?)"/i =~ rest
					@boundary = $1;
				elsif /boundary\s*=\s*([^\s;]+)/i =~ rest
					@boundary = $1;
				end
			elsif /^Content-Type:\s*(\w+?)\/(\w+)/im =~ mbuf
				@content_type_major = $1.downcase;
				@content_type_minor = $2.downcase;
			end
		end

		# Remember where the boundary lines ("--boundary" or
		# "--boundary--") are in the body.
		@boundary_lines = Array.new();
		if @boundary != nil && @boundary != ''
			b_regex = Regexp.new("--#{Regexp.escape(@boundary)}(--)?$");
			@mbuf_body_array.each_index do |i|
				mbuf = @mbuf_body_array[i].toeuc;
				if b_regex =~ mbuf
					@boundary_lines << i;
				end
			end
		end

##		print "[encoding] #{@encoding}\n";
##		print "[content_type_major] #{@content_type_major}\n";
##		print "[content_type_minor] #{@content_type_minor}\n";
##		print "[boundary] #{@boundary}\n";
##		print "[charset] #{@charset}\n";
##		print "[boundary_lines] #{@boundary_lines.join(', ')}\n";

	end
	private :parse_header;

	# -------------------------------------------------------------
	# Notes on header parsing:
	#
	# Syntax of the multipart boundary: rfc2046.txt
	#
	#	boundary := 0*69<bchars> bcharsnospace
	#
	#	bchars := bcharsnospace / " "
	#
	#	bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" /
	#			"+" / "_" / "," / "-" / "." /
	#			"/" / ":" / "=" / "?"
	#
	# rfc2045.txt
	#
	#	parameter := attribute "=" value
	#
	#	attribute := token
	#			; Matching of attributes
	#			; is ALWAYS case-insensitive.
	#
	#	value := token / quoted-string
	#
	#	token := 1*<any (US-ASCII) CHAR except SPACE, CTLs, or tspecials>
	#
	#	tspecials :=  "(" / ")" / "<" / ">" / "@" /
	#			"," / ";" / ":" / "\" / <">
	#			"/" / "[" / "]" / "?" / "="
	#			; Must be in quoted-string,
	#			; to use within parameter values
	#
	# Therefore the characters usable in a boundary written without quotes are:
	#		DIGIT / ALPHA / "'" / "+" / "_" / "-" / "."
	#

	# -------------------------------------------------------------
	#
	def each_multipart()
		# Yields, for each pair of consecutive boundary lines, the
		# array of body lines lying strictly between them.
		last_pair = @boundary_lines.size - 2;
		(0 .. last_pair).each do |k|
			opening, closing = @boundary_lines[k, 2];
			yield(@mbuf_body_array[(opening + 1) .. (closing - 1)]);
		end
	end
	protected :each_multipart;

	# -------------------------------------------------------------
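	# Analyse the body.
	# - For multipart messages, chunks whose Content-Type is not text are dropped.
	# - Parts encoded as base64 or quoted-printable are decoded.
	# - The text is converted to EUC-JP regardless of its character encoding.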
	#
	def parse_body()
		# Builds the decoded body text in @mbuf_body from
		# @mbuf_body_array and returns it.  Relies on the values set
		# by parse_header (@encoding, @content_type_major/minor,
		# @charset, @boundary_lines).
		@mbuf_body = '';
		tbuf = '';

		# Content-Transfer-Encoding values are case-insensitive.
		encoding = (@encoding || '').strip.downcase;

		top_line = (@mbuf_body_array[0] || '').toeuc;
		if /^MAIL FROM:/ =~ top_line
			# The body starts with "MAIL FROM:", i.e. it contains
			# a transcript of an SMTP dialogue.  Treat the body as
			# an embedded message and show its From:/Subject:.
			sub_ex_d = ExDecoder.new();
			sub_ex_d.load_array(@mbuf_body_array);
			# get_header returns nil for a missing header; append
			# an empty line in that case instead of failing.
			@mbuf_body += 'X-From: ';
			@mbuf_body += (sub_ex_d.get_header('From:') || "\n");
			@mbuf_body += 'X-Subject: ';
			@mbuf_body += (sub_ex_d.get_header('Subject:') || "\n");
			@mbuf_body += "\n";
			@mbuf_body += sub_ex_d.get_decoded_body;

		elsif @boundary_lines.size != 0
			# multipart: decode every part with its own decoder.
			self.each_multipart do |mbuf_array|
##				@mbuf_body += "--#{@boundary}\n";
				@mbuf_body += "----------\n";
				sub_ex_d = ExDecoder.new();
				sub_ex_d.load_array(mbuf_array);
				@mbuf_body += sub_ex_d.get_decoded_body;
			end
##			@mbuf_body += "--#{@boundary}--\n";
			@mbuf_body += "// ----------\n";

		elsif @content_type_major == "text" && encoding == "base64"
			# base64 is one continuous stream in which line breaks
			# carry no meaning, so the lines are joined before
			# decoding (a line need not hold a multiple of 4
			# characters).  The stream is cut after each run of
			# "=" padding so that separately padded chunks are
			# still decoded one by one.
			encoded = @mbuf_body_array.join.gsub(/\s+/, '');
			tbuf = encoded.scan(/[^=]+=*/).collect { |chunk|
				chunk.unpack('m*')[0];
			}.join;
		elsif @content_type_major == "text" && encoding == "quoted-printable"
			tbuf = @mbuf_body_array.join.unpack('M*')[0];
		elsif @content_type_major == "text"
			tbuf = @mbuf_body_array.join;

		elsif @content_type_major == "message" &&
		      @content_type_minor == "rfc822"
			sub_ex_d = ExDecoder.new();
			sub_ex_d.load_array(@mbuf_body_array);
			@mbuf_body += "====================\n";
			@mbuf_body += sub_ex_d.get_decoded_body;
			@mbuf_body += "// ====================\n";

		elsif @content_type_major == "message"
			tbuf = @mbuf_body_array.join;

		else
			### other content types are not analysed
		end

		@mbuf_body += buf_decode(tbuf, @charset) if tbuf != '';

		return @mbuf_body;
	end
	protected :parse_body;

	# =============================================================
	# Get the value of a header.
	# Example:
	#	s = dec.get_header('Subject:')	# => (contents of the "Subject:" line)
	# Returns nil when the header does not exist.
	#
	def get_header(key)
		# Returns the value of the first parsed header that matches
		# +key+ (case-insensitive; +key+ is used as a regular
		# expression fragment, e.g. 'Subject:').  Folded lines are
		# joined, surrounding white space is removed and a single
		# "\n" is appended.  Returns nil when nothing matches.
		@mbuf_header_array.each do |line|
			hit = /^#{key}\s*(.+)$/im.match(line);
			return hit[1].gsub(/\n\s/, '').strip + "\n" if hit;
		end
		nil;
	end

	# =============================================================
	# Return the (decoded) body text of the e-mail.
	#
	def get_decoded_body()
		# The body text as built by parse_body.
		@mbuf_body;
	end

	# =============================================================
	# Return the whole (decoded) text of the e-mail.
	#
	def decoded_message()
		# All decoded header lines, a blank line, then the decoded
		# body, as one new string.
		headers = @mbuf_header_array.inject('') { |text, line| text + line };
		headers + "\n" + @mbuf_body;
	end

	# =============================================================
	#
	def decode_out(ofp)
		# Prints the decoded message (see decoded_message) to +ofp+.
		ofp.print(decoded_message);
	end

end

# =====================================================================
# Split text into tokens and manage token occurrence counts.
#
class Tokenizer

	# =============================================================
	#
	def initialize(source = '')
		# source: text to tokenize; nil/false is treated as ''.  It is
		#         converted with String#toeuc before being stored.
		text = source ? source : '';
		@source = text.toeuc;
	end

	# =============================================================
	#
	def not_a_token(s)
		# true when +s+ consists only of characters that never form a
		# token: ASCII punctuation and digits
		#	!"#$%&'()*+,-./0123456789:;<=>?@  [\]^_`  {|}~
		# NOTE(review): the end of the character class looks
		# mis-encoded in this view (it presumably listed double-byte
		# symbol ranges); the literal is reproduced as shown.
		rejected = [
			/^[!-@\[-`\{-~--]+$/,
		];
		rejected.any? { |pattern| pattern =~ s };
	end

	# =============================================================
	#
	def get_tokens()
		# Runs MeCab over @source and returns the surface strings of
		# the nodes accepted as tokens, in order of appearance
		# (duplicates are kept).
		# NOTE(review): the non-ASCII literals below (the word class
		# compared with word_class and the character ranges in the
		# two regexps) look mis-encoded in this view; they are left
		# untouched and their exact meaning should be confirmed
		# against the original EUC-JP source.
		m = MeCab::Tagger.new("-Ochasen")
		token_array = Array.new();
		node = m.parseToNode(@source);
		while node
##			print "[surface] #{node.surface}\n";
##			print "[feature] #{node.feature}\n";
			token = node.surface;			# surface form
			token.force_encoding(Encoding::EUC_JP);
			# first comma-separated field of the feature string
			word_class = node.feature.split(/,/)[0];
			word_class.force_encoding(Encoding::EUC_JP);

			is_eligible_token = false
			if word_class == '̾'
				# this word class: accept tokens of two or more
				# characters, or tokens matching the regexp
				if 2 <= token.length ||
				   token =~ /^[--]+$/
					is_eligible_token = true
				end
			else
				# other classes: accept when at least two
				# characters remain after deleting everything
				# the regexp matches
				if 2 <= token.gsub(/[^--]/, '').length
					is_eligible_token = true
				end
			end
			# tokens made only of symbols/digits are always rejected
			is_eligible_token = false if not_a_token(token);

			if is_eligible_token
				token_array << token;
			end
			node = node.next
		end
		return token_array;
	end

	# =============================================================
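	# Return the spam probability of each token obtained by splitting the text.
	# Return value: [ [ <token>, <probability>, <occurrence count> ], ... ]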
	#
	def get_spam_probability_array()
		# For every token of get_tokens (duplicates included) looks up
		# the [clean, spam] counts in the PStore named by
		# AmConf['OccurenceStore'] and returns
		#	[ [ token, spam / (clean + spam), clean + spam ], ... ]
		# A token that is unknown, or whose counts add up to 0, gets
		# probability 0.0.
		tokens = self.get_tokens;
		rows = [];

		store = PStore.new(AmConf['OccurenceStore']);
		store.transaction do
			tokens.each do |tok|
				clean_n, spam_n = store[tok] || [0, 0];
				total = clean_n + spam_n;
				ratio = (total == 0) ? 0.0 : spam_n.to_f / total;
				rows << [ tok, ratio, total ];
			end
		end
		rows;
	end

	# =============================================================
	# Update the occurrence counts of the tokens.
	# mode := :add_clean | :add_spam | :sub_clean | :sub_spam |
	#         :add_spam_sub_clean | :add_clean_sub_spam
	#
	def update_occurences(mode = :add_clean)
		# Adjusts the stored [clean, spam] counts of every token of
		# get_tokens (once per occurrence) according to +mode+
		# (symbol or string).  "add" increments a count, "sub"
		# decrements it but never below 0.  An unknown mode leaves
		# the counts as they are (new tokens are stored as [0, 0]).
		# Does nothing when AmConf['DontUpdateTokenDB'] is a
		# non-empty string.
		flag = AmConf['DontUpdateTokenDB'];
		return unless flag == nil || flag.empty?;

		bump = lambda { |count| count + 1 };
		drop = lambda { |count| [count - 1, 0].max };
		keep = lambda { |count| count };
		# mode => [ operation on clean count, operation on spam count ]
		clean_op, spam_op = {
			'add_clean' => [bump, keep],
			'add_spam' => [keep, bump],
			'sub_clean' => [drop, keep],
			'sub_spam' => [keep, drop],
			'add_spam_sub_clean' => [drop, bump],
			'add_clean_sub_spam' => [bump, drop],
		}.fetch(mode.to_s, [keep, keep]);

		store = PStore.new(AmConf['OccurenceStore']);
		store.transaction do
			self.get_tokens.each do |tok|
				counts = store[tok] || [0, 0];
					# [ <clean count>, <spam count> ]
				counts[0] = clean_op.call(counts[0]);
				counts[1] = spam_op.call(counts[1]);
				store[tok] = counts;
			end
		end
	end

	# =============================================================
	# Delete the token occurrence database.
	#
	def self.clear_probability()
		# Removes the file named by AmConf['OccurenceStore'], if any.
		store_path = AmConf['OccurenceStore'];
		File.unlink(store_path) if File.exist?(store_path);
	end

	# =============================================================
	# Import the token database.
	#
	def self.import(ifp = $stdin)
		# Reads lines of the form written by Tokenizer.export
		#	<clean> : <spam> : <probability>% : <token>
		# from +ifp+ and stores [clean, spam] under each token in the
		# PStore named by AmConf['OccurenceStore'] (its directory is
		# created when missing).  Lines starting with "#", empty
		# lines and lines of any other shape are ignored; the
		# probability column is not used.
		store_path = AmConf['OccurenceStore'];
		store_dir = File.dirname(store_path);
		FileUtils.mkdir_p(store_dir) unless File.directory?(store_dir);

		store = PStore.new(store_path);
		store.transaction do
			ifp.each_line do |raw|
				line = raw.toeuc;
				next if /^#/ =~ line;
				next if /^$/ =~ line;
				fields = /(\d+)[ :]+(\d+)[ :]+([\d\.]+)[ :%]+(.+)/.match(line);
				next unless fields;
				store[fields[4]] = [fields[1].to_i, fields[2].to_i];
			end
		end
	end

	# =============================================================
	# Export the token database.
	#
	def self.export(ofp = $stdout)
		# Prints one line per stored token, sorted by token, to +ofp+:
		#	<clean> : <spam> : <spam percentage>% : <token>
		# The percentage is spam / (clean + spam), or 50% when both
		# counts are 0.
		store = PStore.new(AmConf['OccurenceStore']);
		store.transaction do
			store.roots.sort.each do |tok|
				clean_n, spam_n = store[tok][0], store[tok][1];
				total = clean_n + spam_n;
				ratio = (total == 0) ? 0.5 : spam_n.to_f / total;
				ofp.printf("%7d : %7d : %9.5f%% : %s\n",
					clean_n, spam_n, ratio * 100, tok);
			end
		end
	end

	# =============================================================
	#
	def self.spam_probability(token)
		# Returns spam / (clean + spam) for +token+ from the PStore
		# named by AmConf['OccurenceStore']; 0 when the token is
		# unknown or both counts are 0.
		ratio = 0;
		store = PStore.new(AmConf['OccurenceStore']);
		store.transaction do
			clean_n, spam_n = store[token] || [0, 0];
			total = clean_n + spam_n;
			ratio = spam_n.to_f / total unless total == 0;
		end
		ratio;
	end

end

# =====================================================================
# Check that a string is well-formed as an e-mail address.
#
def mail_addr_verify(mail_addr)
	# Returns true when +mail_addr+ as a whole has the shape
	# <local part>@<domain>, false otherwise (also for nil).
	# The pattern is anchored with \A and \z: with ^ and $ a
	# multi-line string such as "x\nuser@example.com" was accepted
	# because one of its lines looked like an address.
	local_part_chars = '[0-9A-Za-z_!#\$%&\'*+\-\/=\?^{|}\~\.]+';
	domain_part_chars = '[0-9A-Za-z_\-\.]+';
		# "0-9A-Za-z_" is spelled out instead of "\w" because
		# "\w" would also match full-width characters.
	valid_pattern = Regexp::new("\\A(#{local_part_chars})@(#{domain_part_chars})\\z".force_encoding('ASCII-8BIT'));
	return (valid_pattern =~ mail_addr) ? true : false;
end

# ================================================================
# Extract the e-mail address from a "From:" header line.
#
def extract_mail_addr(buf)
	# Picks the address out of +buf+:
	#	"Name <addr>"   -> the text between "<" and ">"
	#	"addr (Name)"   -> the word before the parenthesis
	#	anything else   -> +buf+ without surrounding white space
	# Returns '' when the result is rejected by mail_addr_verify.
	candidate =
		case buf
		when /<(.+)>/ then $1
		when /([^ ]+)\s*\(.*\)/ then $1
		else buf.strip
		end;
	return (mail_addr_verify(candidate) == false) ? '' : candidate;
end

# ================================================================
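# Test whether the text matches the white list (written in a file).
# target_text: the string to test (typically the text of a header)
# white_list_class: key into AmConf; the file named by AmConf[white_list_class]
#		holds the white-list patterns that target_text is matched against.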
#
def is_in_white_list_pattern(target_text, white_list_class)
	# Returns true when +target_text+ matches at least one regular
	# expression of the file AmConf[white_list_class], else false
	# (also when the key is not configured, the file does not exist
	# or +target_text+ is nil).
	# In the file, lines starting with "#" or "*" and blank lines
	# are skipped; a line that is not a valid regular expression is
	# ignored.
	# Both the patterns and the target are handled as raw bytes:
	# a binary pattern with non-ASCII bytes cannot be matched against
	# a non-ASCII string tagged with another encoding (Ruby raises
	# instead), and the file may not be valid in the default
	# external encoding.
	white_list_file = AmConf[white_list_class];

	return false if white_list_file == nil;
	return false if ! File.file?(white_list_file);
	return false if target_text == nil;

	target_bytes = target_text.dup.force_encoding('ASCII-8BIT');

	File.open(white_list_file, "rb") do |fp|
		fp.each_line do |buf|
			buf = buf.chomp().force_encoding('ASCII-8BIT');
			next if /^#/ =~ buf;
			next if /^\*/ =~ buf;
			next if /^\s*$/ =~ buf;
			begin
				pattern = Regexp::new(buf);
				return true if pattern =~ target_bytes;
			rescue RegexpError
				## skip lines that are not valid regular expressions
			end
		end
	end
	return false;
end

# ======================================================================
# Judge the spam probability of one message and act on the verdict.
#
# Reads a single message from fp_in and then:
# - stores a backup copy under AmConf['BackupMailDir'];
# - appends the decoded message and the verdict to AmConf['LogFile'];
# - stores a copy under the white / clean / dirty mail directory;
# - unless the message was judged to be spam, appends it to
#   AmConf['MailBoxFile'], or writes it to $stdout when no mailbox file
#   is configured.
#
# A message counts as spam when its probability is greater than or equal
# to AmConf['SpamThreshold'] and no white list matched it.
#
# fp_in:: IO the raw message is read from.
# user::  not referenced inside this method; kept so callers stay valid.
#
def spam_filter(fp_in, user)

	# ================================================================
	# Lock file.
	# Taken as a precaution so that several processes do not write to
	# the mailbox file at the same time. Failure to lock is not fatal:
	# it is noted in rescue_point (logged later) and processing goes on.
	#
	lock_fp = nil;
	rescue_point = '';
	begin
		lock_fp = open(AmConf['LockFile'], "a");
	rescue
		# Could not open the lock file: carry on without a lock.
		lock_fp = nil;
		rescue_point = 'open';
	else
		begin
			lock_fp.flock(File::LOCK_EX);
		rescue
			lock_fp.close();
			lock_fp = nil;
			rescue_point = 'flock';
		end
	end

	threshold = AmConf['SpamThreshold'].to_f;
	lfp = nil;
	begin
		# ============================================================
		# Make sure every directory written to below exists.
		#
		store_dir = File::dirname(AmConf['OccurenceStore']);
		FileUtils::mkdir_p(store_dir) if ! File::directory?(store_dir);

		[ 'BackupMailDir', 'CleanMailDir', 'WhiteMailDir', 'DirtyMailDir' ].each do |d|
			dir = AmConf[d];
			FileUtils::mkdir_p(dir) if ! File::directory?(dir);
		end
		timestamp = Time::now.strftime("%Y%m%d_%H%M%S") + sprintf("_%05d", $$);
		mail_file_name = "mail_#{timestamp}.txt";

		# ============================================================
		# Read the message, then:
		# - write the backup file;
		# - pick up the "From " line, the "From:" address, the subject,
		#   the first "Received:" header and the decoded body;
		# - remember whether the body has any non-blank line.
		#
		exd = ExDecoder.new();
		exd.load_fp(fp_in);
		exd.store(AmConf['BackupMailDir'] + '/' + mail_file_name);

		lfp = open(AmConf['LogFile'], "a");
		lfp.print "** #{timestamp} ========================================\n";
		lfp.print exd.decoded_message;
		lfp.print "\n";
		lfp.print "** #{timestamp} ----------------------------------------\n";

		unix_from_line = exd.get_header('From ') || '';
		from_header = exd.get_header('From:') || '';
		subject = exd.get_header('Subject:') || "\n";
		received = exd.get_header('Received:') || '';
				# Original note (translated): when there are several
				# 'Received:' headers the first one is returned.
		mail_body = exd.get_decoded_body() || '';

		unix_from = '';
		if /^(\S+) / =~ unix_from_line
			unix_from = $1;
		end
		mail_address = extract_mail_addr(from_header);

		non_empty_line_found = mail_body.each_line.any? { |buf| /^\s*$/ !~ buf };

		# ============================================================
		# Spam judgment.
		# In the log, "(...)" around the number marks a message that is
		# let through and "[...]" one that is treated as spam.
		#
		sp = SpamProb.new(subject + "\n" + mail_body, lfp);
		spam_prob = sp.probability;

		result = '';
		is_in_white_list = false;
		if is_in_white_list_pattern(unix_from, 'UfromWhiteList')
			result = sprintf("(unix_from whitelist) (%f)", spam_prob);
			is_in_white_list = true;
		elsif is_in_white_list_pattern(mail_address, 'FromWhiteList')
			result = sprintf("(mail_from whitelist) (%f)", spam_prob);
			is_in_white_list = true;
		elsif is_in_white_list_pattern(received, 'ReceivedWhiteList')
			result = sprintf("(received: whitelist) (%f)", spam_prob);
			is_in_white_list = true;
		elsif ! non_empty_line_found
			# A body with nothing but blank lines is forced to spam.
			result = sprintf("(has only empty line) [%f]", spam_prob);
			spam_prob = 1.0;
		elsif spam_prob < threshold
			result = sprintf("S (%f)", spam_prob);
		else
			result = sprintf("S [%f]", spam_prob);
		end

		# One verdict, computed once, drives both the filing directory
		# and the delivery decision so the two can never disagree.
		below_threshold = (spam_prob < threshold);

		# ============================================================
		# A white-listed message that was nevertheless scored as spam
		# is fed back into the database as a clean message.
		#
		if is_in_white_list && threshold <= spam_prob
			tk = Tokenizer.new(subject + "\n" + mail_body);
			tk.update_occurences(:add_clean_sub_spam);
		end

		# ============================================================
		# Record the verdict in the log.
		#
		lfp.print "** [LOCK] 㳰: #{rescue_point} \n" if lock_fp == nil;
		lfp.print <<-"__LOG__";
** #{timestamp} / #{result} / #{unix_from} / #{mail_address}
** SubjectLine: #{subject}
		__LOG__

		lfp.close();

		# ============================================================
		# File a copy according to the verdict and pick the header that
		# is configured for that verdict.
		#
		if is_in_white_list
			dir_mode = 'WhiteMailDir';
			add_header_mode = 'AddHeaderIfWhite';
		elsif below_threshold
			dir_mode = 'CleanMailDir';
			add_header_mode = 'AddHeaderIfClean';
		else
			dir_mode = 'DirtyMailDir';
			add_header_mode = 'AddHeaderIfDirty';
		end

		exd.store(AmConf[dir_mode] + '/' + mail_file_name);

		prob_header = AmConf[add_header_mode];
		if prob_header != nil && ! prob_header.empty?
			exd.add_header(prob_header);
		end

		# ============================================================
		# Deliver: append to the mailbox file, or write to $stdout when
		# none is configured. Uses the same "< threshold" test as the
		# filing step above, so a message filed as dirty is never
		# delivered.
		#
		mailbox_file = AmConf['MailBoxFile'];
		if is_in_white_list || below_threshold
			if mailbox_file != nil && ! mailbox_file.empty?
				open(mailbox_file, "a") do |mfp|
					exd.store_fp(mfp);
					mfp.print "\n";
				end
			else
				exd.store_fp($stdout);
			end
		end
				# -----------------------------------------------
				# Original author's note (translated; the source
				# comment is partly garbled):
				# To deliver through /usr/libexec/mail.local instead:
				# - strip the "From " line and write the message to
				#   a temporary file;
				# - call "mail.local -f <ufrom> tac < mail.tmp".
				#
				# mail.local checks things such as write permission
				# more strictly, but what it does is essentially
				# the same as exd.store_fp().
				#
	ensure
		# Runs on success and on failure alike, so neither the log file
		# handle nor the lock outlives this call.
		lfp.close() if lfp != nil && ! lfp.closed?;

		# ============================================================
		# Release the lock.
		# NOTE(review): unlinking the lock file after unlocking lets a
		# process that already opened the old file and a process that
		# creates a new one both believe they hold the lock. Confirm
		# whether that is acceptable.
		#
		begin
			if lock_fp != nil
				lock_fp.flock(File::LOCK_UN);
				lock_fp.close();
				File::unlink(AmConf['LockFile']);
			end
		rescue
			# Errors while unlocking are deliberately ignored.
		end
	end
end

# ======================================================================
# A Unix mbox: several e-mail messages concatenated into one text, each
# introduced by a "From " separator line.
#
# Load the text with load / load_fp / load_array, then walk the messages
# with each_mbox.
#
class Mbox

	# =============================================================
	# Start with no text loaded.
	#
	def initialize()
		@mbuf_array = [];
		@unix_from_indexes = [];
	end

	# =============================================================
	# Load the mbox text from the file named file_name.
	# The file is closed again even when reading raises.
	#
	def load(file_name)
		File.open(file_name, "r") do |fp|
			load_fp(fp);
		end
	end

	# =============================================================
	# Load the mbox text from an open IO, normalizing every line
	# ending to "\n".
	#
	def load_fp(fp)
		@mbuf_array = fp.readlines();
		split_by_ufrom();
	end

	# =============================================================
	# Load the mbox text from an array of line strings.
	# The array passed in is left untouched.
	#
	def load_array(buf_array)
		@mbuf_array = buf_array;
		split_by_ufrom();
	end

	# -------------------------------------------------------------
	# Find the separator lines of the mbox text: a line starting
	# with "From " that is the first line or directly follows an
	# empty line. (Original note, translated: chosen to match
	# mail.local.)
	#
	def split_by_ufrom()

		# Normalize the line endings into a NEW array so that an array
		# handed to load_array is not modified behind the caller.
		@mbuf_array = @mbuf_array.map { |line| line.chomp + "\n" };

		# Index 0 is always present, so text without any "From " line
		# still forms one message.
		@unix_from_indexes = [ 0 ];
		prev_is_empty_line = true;
		@mbuf_array.each_with_index do |line, n|
			buf = line.toeuc;
			if prev_is_empty_line && /^From / =~ buf
				@unix_from_indexes << n;
			end
			prev_is_empty_line = (buf == "\n");
		end
		@unix_from_indexes << @mbuf_array.size;	# end marker
		@unix_from_indexes.uniq!;
	end
	private :split_by_ufrom;

	# =============================================================
	# Line indexes at which a message starts, followed by the total
	# number of lines.
	#
	attr_reader	:unix_from_indexes;

	# =============================================================
	# Yield each message, as an array of its lines, in file order.
	# Without a block an Enumerator over the same arrays is returned.
	#
	def each_mbox()
		return to_enum(:each_mbox) if ! block_given?;

		@unix_from_indexes.each_cons(2) do |from_index, to_index|
			next if to_index <= from_index;

			yield @mbuf_array[from_index, to_index - from_index];
		end
	end
end

# ======================================================================
# Update the token database from every message of an mbox file.
#
# file_name:: path of an e-mail file in Unix mbox format.
# mode::      :add_spam_sub_clean | :add_clean_sub_spam, handed on
#             unchanged to Tokenizer#update_occurences.
#
# For each message the text given to the tokenizer is the "Subject:"
# header (a bare newline when absent), a newline, and the decoded body
# (empty when absent).
#
def update_spam_db(file_name, mode)

	mbox = Mbox.new();
	mbox.load(file_name);

	mbox.each_mbox do |message_lines|
		decoder = ExDecoder.new();
		decoder.load_array(message_lines);

		subject_line = decoder.get_header('Subject:') || "\n";
		body_text = decoder.get_decoded_body() || '';
		learn_text = subject_line + "\n" + body_text;

		Tokenizer.new(learn_text).update_occurences(mode);
	end
end

# ======================================================================
# Command-line entry point; runs only when this file is executed
# directly.
#
#   -f, --spam-filter         filter one message read from stdin (default)
#   -s, --add-spam-sub-clean  update the database from the mbox files
#   -c, --add-clean-sub-spam  given as arguments, in the named mode
#   -e, --export-token-db     call Tokenizer::export
#   -i, --import-token-db     clear the database, then Tokenizer::import
#   -v, --version             print FemlVersion to stderr
#   -u, --user [USER]         user name substituted for "%user%"
#   -k, --key [Key=Value]     set AmConf[Key] to Value
#
if __FILE__ == $0
	mode = :spam_filter;
	user = AmConf['User'] || 'user';

	# --------------------------------------------------------------
	# Option definitions. When several mode options are given, the
	# last one on the command line wins.
	#
	opts = OptionParser.new();

	opts.on("-f", "--spam-filter") do |opt|
		mode = :spam_filter;
	end
	opts.on("-s", "--add-spam-sub-clean") do |opt|
		mode = :add_spam_sub_clean;
	end
	opts.on("-c", "--add-clean-sub-spam") do |opt|
		mode = :add_clean_sub_spam;
	end
	opts.on("-e", "--export-token-db") do |opt|
		mode = :export_token_db;
	end
	opts.on("-i", "--import-token-db") do |opt|
		mode = :import_token_db;
	end
	opts.on("-v", "--version") do |opt|
		mode = :version;
	end

	opts.on("-u [USER]", "--user [USER]") do |opt|
		# The argument is optional, so a bare "-u" arrives as nil.
		# Keep the default user then; a nil user would make the
		# "%user%" substitution below raise TypeError.
		user = opt if opt != nil;
	end
	opts.on("-k [Key=Value]", "--key [Key=Value]") do |opt|
		# A bare "-k" (nil) or text without "=" simply does not match.
		if /(\w+?)\s*=\s*(\S*)/ =~ opt
			key = $1;
			value = $2 || '';
			AmConf[key] = value;
		end
	end

	opts.parse!(ARGV);

	# --------------------------------------------------------------
	# Expand "%user%" in every String setting. Only existing keys are
	# reassigned, which is permitted while iterating over the hash.
	# The block form of gsub inserts the user name literally, so a
	# backslash in it is not read as a back-reference.
	#
	AmConf.each_pair do |key, value|
		if value.instance_of?(String)
			value = value.gsub(/%user%/) { user };
		end
		AmConf[key] = value;
	end

	# --------------------------------------------------------------
	# Dispatch on the selected mode.
	#
	case mode
	when :spam_filter
		spam_filter($stdin, user);
	when :add_spam_sub_clean, :add_clean_sub_spam
		ARGV.each do |file_name|
			update_spam_db(file_name, mode);
		end
	when :export_token_db
		Tokenizer::export();
	when :import_token_db
		Tokenizer::clear_probability();
		Tokenizer::import();
	when :version
		$stderr.print FemlVersion, "\n";
	end

end

