<?php

require_once dirname(dirname(__FILE__)) . '/Enum.class.php';

class Xoonips_Search {

	/**
	 * N-gram window size, passed to ngram() when multibyte tokens are
	 * turned into fulltext search data.
	 * NOTE(review): documented as string but used as a character count;
	 * confirm the real type of Xoonips_Enum::XOONIPS_WINDOW_SIZE.
	 * @var string
	 * @access private
	 */
	private $WINDOW_SIZE = Xoonips_Enum::XOONIPS_WINDOW_SIZE;

	/**
	 * Regex patterns keyed by 'noprint', 'sbword', 'mbword' and 'casearc';
	 * filled in by initializePatterns().
	 * @var array
	 * @access private
	 */
	private $patterns;

	/**
	 * Constructor.
	 *
	 * Builds the regex pattern table that every search method relies on.
	 * Declared as __construct() because a method named after the class is
	 * no longer treated as a constructor since PHP 8.0; without this the
	 * pattern table would never be initialized.
	 *
	 * @access public
	 */
	public function __construct() {
		$this->initializePatterns();
	}

	/**
	 * PHP 4 style constructor name, kept as a plain method so that existing
	 * code calling it explicitly (e.g. parent::Xoonips_Search()) still works.
	 * It is declared after __construct() on purpose: in PHP 5 the reverse
	 * order raises an E_STRICT "Redefining already defined constructor".
	 *
	 * @access public
	 * @deprecated use __construct()
	 */
	public function Xoonips_Search() {
		$this->__construct();
	}

	/**
	 * Build the fulltext (MATCH ... AGAINST) condition for a search query.
	 *
	 * The query is converted to UTF-8 unless $encoding already says UTF-8,
	 * normalized, and then translated into SQL. The returned SQL is not
	 * converted back to $encoding.
	 *
	 * @access public
	 * @param string $field column name of table
	 * @param string $query search query
	 * @param string $encoding text encoding of search query
	 * @return string fulltext search sql
	 */
	public function getFulltextSearchSql($field, $query, $encoding) {
		$utf8Query = $query;
		if ($encoding != 'UTF-8') {
			$utf8Query = mb_convert_encoding($query, 'UTF-8', $encoding);
		}

		// all mb_ereg* calls made further down expect UTF-8 subjects
		mb_regex_encoding('UTF-8');

		$normalized = $this->normalizeString($utf8Query);

		return $this->makeFulltextSearchSql($field, $normalized);
	}

	/**
	 * Build a plain SQL condition (LIKE / equality) for a search query.
	 *
	 * The encoding of the query is auto-detected; the query is processed as
	 * UTF-8 and the resulting SQL is converted back to the detected encoding.
	 *
	 * @access public
	 * @param string $field column name of table
	 * @param string $query search query
	 * @param object $dataType DataType class
	 * @param bool $isExact true:exact search
	 * @return string search sql
	 */
	public function getSearchSql($field, $query, $dataType, $isExact) {
		// detect query encoding
		$encoding = mb_detect_encoding($query);
		if ($encoding === false) {
			// detection failed: passing false on to mb_convert_encoding()
			// is an error, so fall back to treating the query as UTF-8
			$encoding = 'UTF-8';
		}

		// convert query encoding to 'UTF-8'
		if ($encoding != 'UTF-8') {
			$query = mb_convert_encoding($query, 'UTF-8', $encoding);
		}

		// set multi byte regex encoding
		mb_regex_encoding('UTF-8');

		// normalize string for search
		$query = $this->normalizeString($query);

		// create the SQL condition and hand it back in the caller's encoding
		$sql = $this->makeSearchSql($field, $query, $dataType, $isExact);
		return mb_convert_encoding($sql, $encoding, 'UTF-8');
	}

	/**
	 * Convert a text into the fulltext data that is stored in the database.
	 *
	 * The text is normalized, split into word tokens and then rendered as a
	 * space separated token list (trailing n-grams included, no quoting).
	 *
	 * @access public
	 * @param string $text UTF-8 encoded text
	 * @return string UTF-8 encoded fulltext data
	 */
	public function getFulltextData($text) {
		// the mb_ereg* helpers used below expect UTF-8 subjects
		mb_regex_encoding('UTF-8');

		$normalized = $this->normalizeString($text);
		$words = $this->splitIntoTokens($normalized);

		return $this->makeFulltextSearchData($words, false);
	}

	/**
	 * Normalize a string before it is tokenized for searching.
	 *
	 * Steps, in order: resolve HTML entities to characters, collapse runs of
	 * non printable characters into one space, convert full-width spaces to
	 * half-width, re-encode U+0080..U+00FF as numeric entities, trim.
	 *
	 * @access private
	 * @param string $text 'UTF-8' encoded input text
	 * @return string normalized string
	 */
	private function normalizeString($text) {
		// named entities -> numeric entities (per the helper's name; it is
		// defined outside this file), then numeric entities -> characters
		$numeric = Xoonips_Utils::htmlNumericEntities($text);
		$decoded = mb_decode_numericentity($numeric, array(0x0, 0xffff, 0, 0xffff), 'UTF-8');

		// one or more non printable characters become a single space
		$printable = mb_ereg_replace($this->patterns['noprint'] . '+', ' ', $decoded);

		// 's' option: full-width space -> half-width space
		$halfWidthSpaces = mb_convert_kana($printable, 's', 'UTF-8');

		// latin1 supplement characters are kept as html numeric entities
		$encoded = mb_encode_numericentity($halfWidthSpaces, array(0x0080, 0x00ff, 0, 0xffff), 'UTF-8');

		return trim($encoded);
	}

	/**
	 * Translate a normalized query into a MATCH ... AGAINST expression.
	 *
	 * Quoted phrases and innermost parenthesized groups (the 'casearc'
	 * pattern) are resolved one at a time, leftmost first. Each resolved
	 * group is appended to $tokens and replaced in the query by the
	 * placeholder {N}, N being its 1-based position in $tokens. What is left
	 * of the query is resolved last and becomes the boolean mode expression.
	 *
	 * @access private
	 * @param string $field column name of table
	 * @param string $query search query
	 * @return string fulltext search sql
	 */
	private function makeFulltextSearchSql($field, $query) {
		$casearc = $this->patterns['casearc'];
		$tokens = array();
		while (mb_ereg($casearc, $query)) {
			// locate the leftmost group before any helper touches the shared
			// mb_ereg_search state
			mb_ereg_search_init($query, $casearc);
			$pos = mb_ereg_search_pos();
			if ($pos === false) {
				break;
			}
			// $pos holds BYTE offset and BYTE length, so the query has to be
			// cut with substr(); mb_substr() counts characters and corrupts
			// the query (possibly looping forever) as soon as multibyte text
			// precedes the group
			$matched = substr($query, $pos[0], $pos[1]);
			if (mb_ereg('\\x22', $matched)) {
				// quoted phrase: one phrase token without the quotes
				$phrase = mb_ereg_replace('\\x22', '', $matched);
				$tokens[] = $this->makeFulltextSearchData($this->splitIntoTokens($phrase), true);
			} else {
				// parenthesized group: resolve its words and operators
				$this->makeFulltextSearchSqlImpl(mb_ereg_replace('\\x28|\\x29', '', $matched), $tokens);
			}
			$placeholder = '{' . count($tokens) . '}';
			$query = substr($query, 0, $pos[0]) . $placeholder . substr($query, $pos[0] + $pos[1]);
		}
		$this->makeFulltextSearchSqlImpl(mb_ereg_replace('\\x28|\\x29', '', $query), $tokens);
		$query = $tokens[count($tokens) - 1];
		$query = mb_ereg_replace('\\x7b', '\\x28', $query);
		$query = mb_ereg_replace('\\x7d', '\\x29', $query);
		// single byte words may contain an apostrophe (\x27 is part of the
		// 'sbword' pattern); double it so it cannot end the SQL string literal
		$query = str_replace("'", "''", $query);
		return sprintf('MATCH ( %s ) AGAINST ( \'%s\' IN BOOLEAN MODE)', $field, $query);
	}

	/**
	 * Resolve one group of words into a boolean mode sub expression and
	 * append it to $tokens.
	 *
	 * When the group reads "w1 OP w2 OP w3 ..." with the same operator (AND
	 * or OR) at every odd position, the operator words are dropped and every
	 * operand gets a '+' prefix for AND and no prefix for OR. In every other
	 * case all words are treated as required ('+') search words.
	 * A word of the exact form {N} is replaced by the N-th (1-based) entry
	 * of $tokens.
	 *
	 * @access private
	 * @param string $text search query in case arc
	 * @param array $tokens array of fulltext search criteria in case arc
	 */
	private function makeFulltextSearchSqlImpl($text, &$tokens) {
		// 0: no uniform operator, 1: AND, 2: OR
		$search_op = 0;
		$words = $this->splitTextBySpace($text);
		$wordCount = count($words);
		if ($wordCount > 2 && $wordCount % 2 == 1) {
			$op_word = mb_strtoupper($words[1], 'UTF-8');
			if ($op_word == 'AND') {
				$search_op = 1;
			} else if ($op_word == 'OR') {
				$search_op = 2;
			}
			// every further odd position must repeat the first operator
			for ($i = 3; $i < $wordCount; $i = $i + 2) {
				if ($op_word != mb_strtoupper($words[$i], 'UTF-8')) {
					$search_op = 0;
				}
			}
		}

		$token = '(';
		for ($i = 0; $i < $wordCount; $i++) {
			$word = $words[$i];
			if ($i % 2 == 1) {
				// skip the operator words of a uniform AND / OR chain
				$op_word = mb_strtoupper($word, 'UTF-8');
				if (($search_op == 1 && $op_word == 'AND') || ($search_op == 2 && $op_word == 'OR')) {
					continue;
				}
			}
			if ($i > 0) {
				$token .= ' ';
			}
			if ($search_op == 1 || $search_op == 0) {
				$token .= '+';
			}
			// Only an exact {N} that refers to an existing entry is a
			// placeholder. Anything else containing braces (e.g. "foo{1}" or
			// braces typed by the user) used to end up in non-numeric
			// arithmetic or an undefined offset; it is now searched as text.
			$regs = array();
			if (mb_ereg('^\\x7b([0-9]{1,9})\\x7d$', $word, $regs) && isset($tokens[$regs[1] - 1])) {
				$token .= $tokens[$regs[1] - 1];
			} else {
				$token .= $this->makeFulltextSearchData($this->splitIntoTokens($word), true);
			}
		}
		$token .= ')';
		$tokens[] = $token;
	}
	/**
	 * Translate a normalized query into a plain SQL condition.
	 *
	 * Quoted phrases and innermost parenthesized groups (the 'casearc'
	 * pattern) are resolved one at a time, leftmost first. Each resolved
	 * group is appended to $tokens and replaced in the query by the
	 * placeholder {N}, N being its 1-based position in $tokens. What is left
	 * of the query is resolved last and returned.
	 *
	 * @access private
	 * @param string $field column name of table
	 * @param string $query search query
	 * @param object $dataType DataType class
	 * @param bool $isExact true:exact search
	 * @return string search sql
	 */
	private function makeSearchSql($field, $query, $dataType, $isExact) {
		$casearc = $this->patterns['casearc'];
		$tokens = array();
		while (mb_ereg($casearc, $query)) {
			// locate the leftmost group before any helper touches the shared
			// mb_ereg_search state
			mb_ereg_search_init($query, $casearc);
			$pos = mb_ereg_search_pos();
			if ($pos === false) {
				break;
			}
			// $pos holds BYTE offset and BYTE length, so the query has to be
			// cut with substr(); mb_substr() counts characters and corrupts
			// the query (possibly looping forever) as soon as multibyte text
			// precedes the group
			$matched = substr($query, $pos[0], $pos[1]);
			if (mb_ereg('\\x22', $matched) && !mb_ereg('\\x22t1\\x22', $matched)) {
				// NOTE(review): the phrase is stored without any SQL
				// conversion and is later spliced into the condition as-is
				// when its placeholder is resolved - confirm this is intended,
				// it looks like unescaped user text reaching the SQL.
				$tokens[] = mb_ereg_replace('\\x22', '', $matched);
			} else {
				$this->makeSearchSqlImpl($field, mb_ereg_replace('\\x28|\\x29', '', $matched), $tokens, $dataType, $isExact);
			}
			$placeholder = '{' . count($tokens) . '}';
			$query = substr($query, 0, $pos[0]) . $placeholder . substr($query, $pos[0] + $pos[1]);
		}
		$this->makeSearchSqlImpl($field, mb_ereg_replace('\\x28|\\x29', '', $query), $tokens, $dataType, $isExact);
		$query = $tokens[count($tokens) - 1];
		$query = mb_ereg_replace('\\x7b', '\\x28', $query);
		$query = mb_ereg_replace('\\x7d', '\\x29', $query);
		return $query;
	}

	/**
	 * Resolve one group of words into a parenthesized SQL condition and
	 * append it to $tokens.
	 *
	 * In exact mode the whole text is one single word. Otherwise the text is
	 * split at spaces; when it reads "w1 OP w2 OP w3 ..." with the same
	 * operator (AND or OR) at every odd position, the operator words are
	 * dropped and the operands are joined with that operator. In every other
	 * case all words are treated as search words joined with AND.
	 * A word of the exact form {N} is replaced by the N-th (1-based) entry
	 * of $tokens; any other word becomes a LIKE or equality comparison on
	 * "t1".$field, depending on $dataType.
	 *
	 * @access private
	 * @param string $field column name of table
	 * @param string $text search query in case arc
	 * @param array $tokens array of search criteria in case arc
	 * @param object $dataType DataType class
	 * @param bool $isExact true:exact search
	 */
	private function makeSearchSqlImpl($field, $text, &$tokens, $dataType, $isExact) {
		// 0: no uniform operator, 1: AND, 2: OR
		$search_op = 0;
		if ($isExact) {
			$words = array($text);
		} else {
			$words = $this->splitTextBySpace($text);
		}
		$wordCount = count($words);
		if ($wordCount > 2 && $wordCount % 2 == 1) {
			$op_word = mb_strtoupper($words[1], 'UTF-8');
			if ($op_word == 'AND') {
				$search_op = 1;
			} else if ($op_word == 'OR') {
				$search_op = 2;
			}
			// every further odd position must repeat the first operator
			for ($i = 3; $i < $wordCount; $i = $i + 2) {
				if ($op_word != mb_strtoupper($words[$i], 'UTF-8')) {
					$search_op = 0;
				}
			}
		}

		$token = '(';
		for ($i = 0; $i < $wordCount; $i++) {
			$word = $words[$i];
			if ($i % 2 == 1) {
				// skip the operator words of a uniform AND / OR chain
				$op_word = mb_strtoupper($word, 'UTF-8');
				if (($search_op == 1 && $op_word == 'AND') || ($search_op == 2 && $op_word == 'OR')) {
					continue;
				}
			}
			if ($i > 0) {
				if ($search_op == 1 || $search_op == 0) {
					$token .= ' AND ';
				} else {
					$token .= ' OR ';
				}
			}
			// Only an exact {N} that refers to an existing entry is a
			// placeholder. Anything else containing braces (e.g. "foo{1}" or
			// braces typed by the user) used to end up in non-numeric
			// arithmetic or an undefined offset; it is now searched as text.
			$regs = array();
			if (mb_ereg('^\\x7b([0-9]{1,9})\\x7d$', $word, $regs) && isset($tokens[$regs[1] - 1])) {
				$token .= $tokens[$regs[1] - 1];
			} else if ($dataType->isLikeSearch() === true) {
				$value = $dataType->convertSQLStrLike($word);
				$token .= "\"t1\".$field like '%$value%'";
			} else {
				// equality search: the value is rendered as string or number
				if ($dataType->isNumericSearch() === false) {
					$value = $dataType->convertSQLStr($word);
				} else {
					$value = $dataType->convertSQLNum($word);
				}
				$token .= "\"t1\".$field=$value";
			}
		}
		$token .= ')';
		$tokens[] = $token;
	}

	/**
	 * Split a text into the words that are separated by spaces (\x20).
	 *
	 * @access private
	 * @param string $text UTF-8 encoded text
	 * @return array array of words, none of them containing a space
	 */
	private function splitTextBySpace($text) {
		mb_ereg_search_init($text, '[^\\x20]+');

		$words = array();
		// search positions are byte offsets, hence strlen() and not mb_strlen()
		$byteLength = strlen($text);
		$offset = 0;
		while ($offset < $byteLength) {
			mb_ereg_search_setpos($offset);
			$match = mb_ereg_search_regs();
			if ($match === false) {
				// nothing but spaces left
				break;
			}
			$words[] = $match[0];
			$offset = mb_ereg_search_getpos();
		}

		return $words;
	}
	
	/**
	 * Split a text into word tokens.
	 *
	 * A token is the longest run matching either the single byte word
	 * pattern ('sbword') or the multi byte word pattern ('mbword');
	 * everything in between is skipped.
	 *
	 * @access private
	 * @param string $text UTF-8 encoded text
	 * @return array array of token
	 */
	private function splitIntoTokens($text) {
		$wordPattern = $this->patterns['sbword'] . '|' . $this->patterns['mbword'];
		mb_ereg_search_init($text, $wordPattern);

		$found = array();
		// search positions are byte offsets, hence strlen() and not mb_strlen()
		$byteLength = strlen($text);
		$offset = 0;
		while ($offset < $byteLength) {
			mb_ereg_search_setpos($offset);
			$match = mb_ereg_search_regs();
			if ($match === false) {
				// no further word in the rest of the text
				break;
			}
			$found[] = $match[0];
			$offset = mb_ereg_search_getpos();
		}

		return $found;
	}

	/**
	 * Render word tokens as fulltext search data.
	 *
	 * Single byte words are used as they are. Multi byte words are cut into
	 * n-grams of WINDOW_SIZE characters and each n-gram is written as the
	 * hex dump of its bytes. Trailing n-grams are only produced when the
	 * data is not meant for SQL.
	 *
	 * @access private
	 * @param array $tokens UTF-8 encoded fulltext search tokens
	 * @param bool $isSql if this flag is true enclose search data in double quotation
	 * @return string UTF-8 encoded fulltext search data
	 */
	private function makeFulltextSearchData($tokens, $isSql) {
		$parts = array();
		foreach ($tokens as $token) {
			if (!$this->isMultibyteWord($token)) {
				$parts[] = $token;
				continue;
			}
			foreach ($this->ngram($token, $this->WINDOW_SIZE, !$isSql) as $gram) {
				$parts[] = bin2hex($gram);
			}
		}

		$joined = implode(' ', $parts);

		return $isSql ? '"' . $joined . '"' : $joined;
	}

	/**
	 * Cut a string into its n-grams.
	 *
	 * Returns every substring of $n characters, in order of its start
	 * position. With $trailing set, shorter fragments are appended: they all
	 * start at character max(length - n + 1, 0) and their length shrinks
	 * from "rest of the string" down to one character.
	 *
	 * @access private
	 * @param string $text input string
	 * @param int $n window size
	 * @param bool $trailing flag for output trailing
	 * @return array array of N-gram applied string
	 */
	private function ngram($text, $n, $trailing) {
		$grams = array();
		$length = mb_strlen($text, 'UTF-8');

		// full sized windows
		$lastStart = $length - $n;
		for ($start = 0; $start <= $lastStart; $start++) {
			$grams[] = mb_substr($text, $start, $n, 'UTF-8');
		}

		if (!$trailing) {
			return $grams;
		}

		// fragments shorter than the window, longest first
		$tailStart = max($length - $n + 1, 0);
		for ($size = $length - $tailStart; $size >= 1; $size--) {
			$grams[] = mb_substr($text, $tailStart, $size, 'UTF-8');
		}

		return $grams;
	}

	/**
	 * Tell whether a token contains a match for the multi byte word pattern.
	 *
	 * @access private
	 * @param string $token 'UTF-8' encoded word
	 * @return bool true if multibyte word
	 */
	private function isMultibyteWord($token) {
		// mb_ereg() yields false only when the pattern does not match
		return mb_ereg($this->patterns['mbword'], $token) !== false;
	}

	/**
	 * Build the regex pattern table ($this->patterns).
	 *
	 * Keys: 'noprint' (one non printable character), 'sbword' (single byte
	 * word), 'mbword' (multi byte word) and 'casearc' (a double quoted
	 * phrase or a parenthesized group without nested parentheses).
	 * The character class fragments come from Xoonips_Utils helpers that
	 * are defined outside this file.
	 *
	 * @access private
	 */
	private function initializePatterns() {
		// UTF-8 byte sequences of the multi byte characters that delimit
		// words, i.e. that are excluded from 'mbword'
		$delimiterBytes = array(
			array(0xe3, 0x80, 0x81),	// U+3001 ideographic comma
			array(0xe3, 0x80, 0x82),	// U+3002 ideographic full stop
			array(0xe2, 0x80, 0x99),	// U+2019 right single quotation mark
			array(0xe2, 0x80, 0x9d),	// U+201D right double quotation mark
			array(0xe3, 0x83, 0xbb),	// U+30FB katakana middle dot
			array(0xe3, 0x80, 0x8a),	// U+300A left double angle bracket
			array(0xe3, 0x80, 0x8b),	// U+300B right double angle bracket
			array(0xe3, 0x80, 0x8c),	// U+300C left corner bracket
			array(0xe3, 0x80, 0x8d),	// U+300D right corner bracket
			array(0xe3, 0x80, 0x8e),	// U+300E left white corner bracket
			array(0xe3, 0x80, 0x8f),	// U+300F right white corner bracket
			array(0xe3, 0x80, 0x90),	// U+3010 left black lenticular bracket
			array(0xe3, 0x80, 0x91),	// U+3011 right black lenticular bracket
			array(0xe3, 0x80, 0x94),	// U+3014 left tortoise shell bracket
			array(0xe3, 0x80, 0x95)		// U+3015 right tortoise shell bracket
		);

		$this->patterns = array(
			// control characters, DEL and the 0x80-0x9f range
			'noprint' => sprintf('[\\x00-\\x1f\\x7f%s]',
				Xoonips_Utils::getCodeToLatin1(0x80, 0x9f)),
			// digits, ascii letters, apostrophe and the 0xc0-0xff ranges
			// without 0xd7 and 0xf7
			'sbword' => sprintf('[0-9a-zA-Z\\x27%s%s%s]+',
				Xoonips_Utils::getCodeToLatin1(0xc0, 0xd6),
				Xoonips_Utils::getCodeToLatin1(0xd8, 0xf6),
				Xoonips_Utils::getCodeToLatin1(0xf8, 0xff)),
			// anything that is neither ascii, 0x80-0xff nor a delimiter
			'mbword' => sprintf('[^\\x00-\\x7f%s%s]+',
				Xoonips_Utils::getCodeToLatin1(0x80, 0xff),
				Xoonips_Utils::getCodeToUtf8($delimiterBytes)),
			// "..." or (...)
			'casearc' => '\\x22[^\\x22]+\\x22|\\x28[^\\x28\\x29]+\\x29'
		);
	}
}

