#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "libxml/HTMLParser.h"
#include "libxml/HTMLTree.h"
#include "libxml/uri.h"

#include "mecab.h"
#include "scoped_ptr.h"

#include "sqlite3.h"
#include "fts3_tokenizer.h"

// Scoped owner of a libxml2 document. The wrapped pointer is released with
// xmlFreeDoc() when the wrapper is destroyed; a null pointer is left alone.
// The wrapper converts implicitly to xmlDocPtr so it can be handed straight
// to libxml2 functions.
struct xmlDocRAII{
	xmlDocPtr ptr;

	xmlDocRAII(xmlDocPtr ptr_) : ptr(ptr_) {}

	~xmlDocRAII(){
		if (ptr != 0){
			xmlFreeDoc(ptr);
		}
	}

	operator xmlDocPtr(){
		return ptr;
	}

private:
	// Default construction, copying and assignment are declared private so
	// that the document cannot end up with two owners.
	xmlDocRAII();
	xmlDocRAII(const xmlDocRAII &);
	xmlDocRAII &operator =(const xmlDocRAII &);
};

// Scoped owner of a buffer allocated by libxml2 (constructed from an
// xmlChar*). The buffer is released with xmlFree() when the wrapper is
// destroyed; a null pointer is left alone.
struct xmlRAII{
	void *ptr;

	xmlRAII(xmlChar * ptr_) : ptr(ptr_) {}

	~xmlRAII(){
		if (ptr != 0){
			xmlFree(ptr);
		}
	}

	operator void *(){
		return ptr;
	}

private:
	// Default construction, copying and assignment are declared private so
	// that the buffer cannot end up with two owners.
	xmlRAII();
	xmlRAII(const xmlRAII &);
	xmlRAII&operator =(const xmlRAII &);
};

// Normalises a '/'-separated path:
//   - every component is lowercased,
//   - "./" components are dropped,
//   - "../" removes the preceding component, except that it never climbs
//     above url_base (compared case-insensitively) and never cancels an
//     earlier "../" that could not itself be resolved,
//   - everything from the first '#' onwards is removed.
//
// url_toparse: path to normalise.
// url_base:    directory (with trailing '/') that "../" must not escape;
//              the empty default disables that limit.
// Returns the normalised path.
std::string parse_url(const std::string &url_toparse, const std::string &url_base = ""){
	const std::size_t np = std::string::npos;

	// The result is built from lowercased components, so the base has to be
	// lowercased as well before the two can be compared.
	std::string base_lower = url_base;
	for (std::size_t i = 0; i < base_lower.size(); ++i){
		base_lower[i] = static_cast<char>(std::tolower(static_cast<unsigned char>(base_lower[i])));
	}

	std::string url_result;
	std::size_t last_part = np;	// start of the last component in url_result, np if none
	std::size_t curpos = 0;

	for(;;){
		// A component runs up to and including the next '/', or to the end.
		const std::size_t nextpos = url_toparse.find('/', curpos);
		std::string url_part = url_toparse.substr(curpos, nextpos == np ? np : nextpos - curpos + 1);
		if (url_part.empty()) break;

		// tolower() requires a value representable as unsigned char.
		for (std::size_t i = 0; i < url_part.size(); ++i){
			url_part[i] = static_cast<char>(std::tolower(static_cast<unsigned char>(url_part[i])));
		}

		if (url_part == "./"){
			// Refers to the current directory: contributes nothing.
		}else if (url_part == "../" && last_part != np && url_result.compare(last_part, np, "../") != 0){
			if (url_result != base_lower){
				// Drop the last component, then locate the start of the
				// component that is now last. Every remaining component
				// ends in '/', so search for the slash before the final one.
				url_result.erase(last_part);
				if (url_result.empty()){
					last_part = np;
				}else if (url_result.size() == 1){
					last_part = 0;
				}else{
					const std::size_t slash = url_result.rfind('/', url_result.size() - 2);
					last_part = (slash == np) ? 0 : slash + 1;
				}
			}
		}else{
			last_part = url_result.size();
			url_result += url_part;
		}

		if (nextpos == np) break;
		curpos = nextpos + 1;
	}

	const std::size_t sharp = url_result.find('#');
	if (sharp != np) url_result.erase(sharp);
	return url_result;
}

// ==== Functions and structures for using the MeCab tokenizer from SQLite3
namespace MecabTokenizer{
#if 0
// Tokenizer object for the disabled (#if 0) MeCab C-API implementation.
// `base` is the first member; the functions below cast between
// sqlite3_tokenizer* and Tokenizer* with reinterpret_cast.
struct Tokenizer{
	sqlite3_tokenizer base;
	mecab_t *mecab;	// handle created by mecab_new() in create()
};
// Cursor object for the disabled (#if 0) MeCab C-API implementation.
// `base` is the first member; the functions below cast between
// sqlite3_tokenizer_cursor* and Cursor* with reinterpret_cast.
struct Cursor{
	sqlite3_tokenizer_cursor base;
	const mecab_node_t *node;	// node that next() examines on its next call
	char *buf;			// malloc'ed buffer holding the token handed out by next()
	int buflen, offset, pos;	// buffer size bookkeeping, running byte offset, token counter
};

// Allocates a zero-filled Tokenizer, creates its MeCab handle from
// argc/argv and returns the tokenizer through *otokbase.
// Returns SQLITE_OK on success, SQLITE_NOMEM if the allocation fails and
// SQLITE_ERROR if mecab_new() returns null; on either failure the Tokenizer
// is freed and *otokbase is not written.
int create(int argc, const char * const *argv, sqlite3_tokenizer **otokbase){
	int rc = SQLITE_OK;

	Tokenizer *tok = static_cast<Tokenizer *>(std::calloc(1, sizeof(Tokenizer)));
	if (!tok){ rc = SQLITE_NOMEM; goto ONERROR;};
	tok->mecab = mecab_new(argc, const_cast<char **>(argv));
	if (!tok->mecab) { rc = SQLITE_ERROR; goto ONERROR;};

	*otokbase = reinterpret_cast<sqlite3_tokenizer *>(tok);
	return rc;

ONERROR:
	// free(NULL) is a no-op, so this also covers the failed-calloc path.
	std::free(tok);
	return rc;
}

// Destroys the MeCab handle of the tokenizer and frees the Tokenizer itself.
// Always returns SQLITE_OK.
// NOTE(review): tokbase is dereferenced without a null check.
int destroy( sqlite3_tokenizer *tokbase ){
	Tokenizer *tok = reinterpret_cast<Tokenizer *>(tokbase);
	mecab_destroy(tok->mecab);
	std::free(tok);
	return SQLITE_OK;
}

// Creates a cursor over `input`: allocates the Cursor and a 256-byte token
// buffer, parses the input with MeCab and stores the first node.
// Returns SQLITE_OK with the cursor in *ocur, or SQLITE_NOMEM /
// SQLITE_ERROR with *ocur set to 0 after releasing what was allocated.
// NOTE(review): `len` is never used; the length handed to MeCab is
// strlen(input)+1, so the input must be NUL-terminated and the terminator
// is included in the parsed range - confirm that this is intended.
int open( sqlite3_tokenizer *tokbase, const char *input, int len, sqlite3_tokenizer_cursor **ocur ) {
	static const int DEFAULT_CURSOR_BUF = 256;
	int rc = SQLITE_OK;

	Tokenizer *tok = reinterpret_cast<Tokenizer *>(tokbase);
	Cursor *cur = static_cast<Cursor *>(std::calloc(1, sizeof(Cursor)));
	if (!cur){ rc = SQLITE_NOMEM; goto ONERROR;};
	cur->buf = static_cast<char *>(std::malloc(DEFAULT_CURSOR_BUF));
	if (!cur->buf){ rc = SQLITE_NOMEM; goto ONERROR;};
	cur->buflen = DEFAULT_CURSOR_BUF;
	cur->offset = cur->pos = 0;
	cur->node = mecab_sparse_tonode2(tok->mecab, input, std::strlen(input)+1);
	if (!cur->node) { rc = SQLITE_ERROR; goto ONERROR;};

	*ocur = reinterpret_cast<sqlite3_tokenizer_cursor *>(cur);
	return rc;

ONERROR:
	// cur->buf is null (from calloc) when the buffer allocation itself failed.
	if (cur){
		std::free(cur->buf);
		std::free(cur);
	}
	*ocur = 0;
	return rc;
}

// Releases the cursor's token buffer and the cursor itself.
// Always returns SQLITE_OK.
int close( sqlite3_tokenizer_cursor *curbase ){
	Cursor *cur = reinterpret_cast<Cursor *>(curbase);
	std::free(cur->buf);
	std::free(cur);
	return SQLITE_OK;
}

// Copies the next non-empty node into the cursor's buffer and reports it
// through the output parameters (token text, length, start/end byte offset,
// token index). Returns SQLITE_OK after advancing to the following node, or
// SQLITE_DONE when the selected node has no successor (the outputs have
// been filled in from that node by then).
int next( sqlite3_tokenizer_cursor *curbase, const char **otok, int *olen, int *ostartoffset, int *oendoffset, int *opos){
	Cursor *cur = reinterpret_cast<Cursor *>(curbase);

	const mecab_node_t *node = cur->node;

	// Skip nodes whose length is 0.
	while(node->next && node->length == 0) node = node->next;

	// Make the buffer at least as large as the node that was handed in.
	// NOTE(review): when node->length == cur->buflen the buffer is not grown,
	// yet buf[node->length] is written below; with the initial 256-byte
	// buffer that is one byte past the end. The realloc result is also
	// stored without a null check.
	if (node->length > cur->buflen){
		cur->buf = static_cast<char *>(std::realloc(cur->buf, node->length + 1));
		cur->buflen = node->length;
	}

	// Copy the node's text into the cursor's buffer.
	std::strncpy(cur->buf, node->surface, node->length);
	cur->buf[node->length] = '\0';

	// Report the node currently held by the cursor.
	// NOTE(review): the start offset is the running sum of rlength values;
	// presumably rlength includes skipped leading whitespace, in which case
	// the reported start is too small - confirm against the MeCab docs.
	*otok = cur->buf;
	*olen = node->length;
	*ostartoffset = cur->offset;
	*oendoffset = cur->offset + node->length;
	*opos = cur->pos++;

	if (!node->next) return SQLITE_DONE;

	// Advance the cursor to the next node.
	cur->node = node->next;
	cur->offset += node->rlength;

	return SQLITE_OK;
}
#else
struct Tokenizer{
	sqlite3_tokenizer base;
	MeCab::Tagger *tagger;
	const char *top;
	std::vector<char> token;
};
struct Cursor{
	sqlite3_tokenizer_cursor base;
	const MeCab::Node *node;
};

int create(int argc, const char * const *argv, sqlite3_tokenizer **otokbase){
	int rc = SQLITE_OK;

	Tokenizer *tok = new Tokenizer();
	tok->tagger = MeCab::createTagger(argc, const_cast<char **>(argv));
	if (!tok->tagger) { rc = SQLITE_ERROR; goto ONERROR;};

	*otokbase = reinterpret_cast<sqlite3_tokenizer *>(tok);
	return rc;

ONERROR:
	delete tok;
	return rc;
}

int destroy( sqlite3_tokenizer *tokbase ){
	Tokenizer *tok = reinterpret_cast<Tokenizer *>(tokbase);
	if (tok) MeCab::deleteTagger(tok->tagger);
	delete tok;
	return SQLITE_OK;
}

int open( sqlite3_tokenizer *tokbase, const char *input, int len, sqlite3_tokenizer_cursor **ocur ) {
	int rc = SQLITE_OK;

	Tokenizer *tok = reinterpret_cast<Tokenizer *>(tokbase);
	Cursor *cur = new Cursor();

	cur->node = tok->tagger->parseToNode(input);
	if (!cur->node) { rc = SQLITE_ERROR; goto ONERROR;};
	tok->top = cur->node->surface;

	*ocur = reinterpret_cast<sqlite3_tokenizer_cursor *>(cur);
	return rc;

ONERROR:
	delete cur;
	*ocur = 0;
	return rc;
}

int close( sqlite3_tokenizer_cursor *curbase ){
	Cursor *cur = reinterpret_cast<Cursor *>(curbase);
	delete cur;
	return SQLITE_OK;
}

int next( sqlite3_tokenizer_cursor *curbase, const char **otok, int *olen, int *ostartoffset, int *oendoffset, int *opos){
	Cursor *cur = reinterpret_cast<Cursor *>(curbase);
	Tokenizer *tok = reinterpret_cast<Tokenizer *>(cur->base.pTokenizer);

	const MeCab::Node *node = cur->node;

	// length0̃m[h΂
	while(node->next && node->length == 0) node = node->next;

	tok->token.resize(node->length+1);

	// m[h̒gJ[\ɃRs[
	std::strncpy(&tok->token[0], node->surface, node->length);
	tok->token[node->length] = '\0';

	// ݂̃J[\ێĂm[h̏߂
	*otok = &tok->token[0];
	*olen = node->length;
	*ostartoffset = node->surface - tok->top;
	*oendoffset = *ostartoffset + node->length;
	*opos = 0;

	if (!node->next) return SQLITE_DONE;

	// J[\ɐi߂
	cur->node = node->next;

	return SQLITE_OK;
}
#endif
// Callback table describing this tokenizer to FTS3: interface version 0
// followed by the create, destroy, open, close and next entry points of
// this namespace.
const sqlite3_tokenizer_module mod = {
	0,
	create,
	destroy,
	open,
	close,
	next
};
}
// ====

// MecabTokenizer  SQLITE3 ɓo^
int register_mecab(sqlite3 *db){
	int rc;

	sqlite3_stmt *stmt_reg_tok = 0;
	if ((rc = sqlite3_prepare_v2(db, "SELECT fts3_tokenizer(?, ?)", -1, &stmt_reg_tok, 0)) != SQLITE_OK) return rc;
	if ((rc = sqlite3_bind_text(stmt_reg_tok, 1, "mecab", -1, SQLITE_STATIC)) != SQLITE_OK) return rc;
	const sqlite3_tokenizer_module *p = &MecabTokenizer::mod;
	if ((rc = sqlite3_bind_blob(stmt_reg_tok, 2, &p, sizeof(p), SQLITE_STATIC)) != SQLITE_OK) return rc;
	if ((rc = sqlite3_step(stmt_reg_tok)) != SQLITE_ROW) return rc;
	if ((rc = sqlite3_finalize(stmt_reg_tok)) != SQLITE_OK) return rc;

	if ((rc = sqlite3_prepare_v2(db, "SELECT fts3_tokenizer(?)", -1, &stmt_reg_tok, 0)) != SQLITE_OK) return rc;
	if ((rc = sqlite3_bind_text(stmt_reg_tok, 1, "mecab", -1, SQLITE_STATIC)) != SQLITE_OK) return rc;
	if ((rc = sqlite3_step(stmt_reg_tok)) != SQLITE_ROW) return rc;
	if ((rc = sqlite3_finalize(stmt_reg_tok)) != SQLITE_OK) return rc;

	return rc;
}

void parse_and_create_db(char *url_start, char *db_name){
	std::vector<char> filebuf;

	int rc;

	// db쐬
	char *rstr = 0;
	sqlite3 *db = 0;
	sqlite3_open_v2(db_name, &db, SQLITE_OPEN_CREATE | SQLITE_OPEN_READWRITE, 0);

	rc = register_mecab(db);

	// e[u쐬
	rc = sqlite3_exec(db,
		"CREATE VIRTUAL TABLE tbl USING FTS3 ("
		"  url TEXT, title TEXT, str TEXT, tokenize mecab '-d' 'sys.dic' '-r' 'dicrc' '-Owakati' )",
		0, 0, &rstr);

	// parseJn
	std::set<std::string> searched_url;
	sqlite3_exec(db, "BEGIN;", 0, 0, &rstr);

	sqlite3_stmt *stmt_insert = 0;
	rc = sqlite3_prepare_v2(db, "INSERT INTO tbl VALUES (?, ?, ?)", -1, &stmt_insert, 0);
	if (rc != SQLITE_OK) return;


#if _WIN32
	std::string url_bs2s = url_start;
	for (size_t i = 0; i < url_bs2s.size(); ++i){
		if (url_bs2s[i] == '\\') url_bs2s[i] = '/';
	}
#else
	std::string url_bs2s = url_start;
#endif
	std::vector<std::pair<std::string, std::string> > urles_toparse;

	// p[XΏۂ̃fBNg
	std::size_t sp = url_bs2s.find_last_of("/");
	std::string url_base;
	if (sp != std::string::npos){
		url_base = url_bs2s.substr(0, sp+1);
		urles_toparse.push_back(std::make_pair(url_base, url_bs2s.substr(sp+1)));
	}else{
		urles_toparse.push_back(std::make_pair("", url_bs2s));
	}

	char *mecab_argv[] = {
		"-d", "sys.dic", "-r", "dicrc", "-Owakati"
	};
	MeCab::scoped_ptr<MeCab::Tagger> tagger(MeCab::createTagger(5, mecab_argv));

//	int count = 0;
	while(urles_toparse.size()){
//		++count;
//		if (count == 50) break;
		std::string url_toparse = urles_toparse.back().second;
		std::string curdir = urles_toparse.back().first;

		urles_toparse.resize(urles_toparse.size()-1);

		// p[XΏۂ̃fBNgύX
		size_t pos = url_toparse.find_last_of("/");
		if (pos != std::string::npos){
			curdir = parse_url(curdir + url_toparse.substr(0, pos+1), url_base);
			url_toparse = curdir + url_toparse.substr(pos + 1);
		}else{
			url_toparse = curdir + url_toparse;
		}
		if (searched_url.find(url_toparse) != searched_url.end()){
//			printf("==== %s parsed\n", url_toparse.c_str());
			continue;
		}

		xmlDocRAII htmlDoc = htmlParseFile(url_toparse.c_str(), 0);
		if (!htmlDoc.ptr) continue;

//		std::printf("==== %s parsing\n", url_toparse.c_str());
		std::fprintf(stderr, "==== %s parsing\n", url_toparse.c_str());
		searched_url.insert(url_toparse);

		htmlNodePtr node = xmlDocGetRootElement(htmlDoc);
		std::vector<htmlNodePtr> nodeStack;
		nodeStack.push_back(node);

		std::string title, content;
		// ehtmlp[X
		while(nodeStack.size()){
			htmlNodePtr node = nodeStack.back();
			nodeStack.resize(nodeStack.size()-1);
			if (node->type == XML_TEXT_NODE){
				xmlChar *s = xmlNodeListGetString(htmlDoc, node, 1);
				xmlRAII xr(s);
				std::string piece = reinterpret_cast<const char *>(s);
				for (; *s; ++s){
					if (s[0] == 0xc2 && s[1] == 0x80) content += ' ', ++s;
					else if (s[0] >= 0x80 || std::isprint(s[0])) content += static_cast<char>(*s);
				}
				if (xmlStrcasecmp(node->parent->name, reinterpret_cast<const xmlChar *>("TITLE")) == 0){
					title = piece;
				}
			}else{
				if (node->type == XML_ELEMENT_NODE && xmlStrcasecmp(node->name, reinterpret_cast<const xmlChar *>("A")) == 0){
					for (xmlAttrPtr attr = node->properties; attr; attr = attr->next){
						if (xmlStrcasecmp(attr->name, reinterpret_cast<const xmlChar *>("HREF"))) continue;
						if (std::strchr(reinterpret_cast<const char *>(attr->children->content), ':')) continue;
						urles_toparse.push_back(
							std::make_pair(curdir, reinterpret_cast<const char *>(attr->children->content)));
						std::size_t sharp = urles_toparse.back().second.find_first_of("#");
						if (sharp != std::string::npos) urles_toparse.back().second = urles_toparse.back().second.substr(0, sharp);
					}
				}
			}
			if (node->next) nodeStack.push_back(node->next);
			if (node->children) nodeStack.push_back(node->children);
		}
//		printf("%s => \n", content.c_str());
		std::fprintf(stderr, "  title:%s\n", title.c_str());
		rc = sqlite3_reset(stmt_insert);
		std::string url_to_db = (url_base == url_toparse.substr(0, url_base.size())) ? url_toparse.substr(url_base.size()) : url_toparse;
		rc = sqlite3_bind_text(stmt_insert, 1, url_to_db.c_str(), -1, SQLITE_TRANSIENT);
		rc = sqlite3_bind_text(stmt_insert, 2, title.c_str(), -1, SQLITE_TRANSIENT);
		rc = sqlite3_bind_text(stmt_insert, 3, content.c_str(), -1, SQLITE_TRANSIENT);
		rc = sqlite3_step(stmt_insert);
					
//		printf(" - %s\n", tagger->parse(content.c_str()));
	}

	xmlCleanupParser();
	xmlCleanupCharEncodingHandlers();
	sqlite3_finalize(stmt_insert);
	sqlite3_exec(db, "commit;", 0, 0, &rstr);
	sqlite3_close(db);
}

// Appends `c` to `out`, replacing the characters that are significant in
// HTML text and double-quoted attribute values with entities.
static void append_html_byte(std::string &out, char c){
	switch (c){
	case '&': out += "&amp;"; break;
	case '<': out += "&lt;"; break;
	case '>': out += "&gt;"; break;
	case '"': out += "&quot;"; break;
	default: out += c; break;
	}
}

// Returns `text` with & < > " replaced by entities; all other bytes are
// copied unchanged.
static std::string escape_markup(const char *text){
	std::string escaped;
	for (const char *it = text; *it; ++it) append_html_byte(escaped, *it);
	return escaped;
}

// Returns `title` as ASCII-only HTML: single-byte characters are copied
// (with markup characters escaped) and every multi-byte UTF-8 character
// becomes a numeric reference such as &#x3042;. If the text is not valid
// UTF-8 the bytes are passed through with only the markup escaped.
static std::string encode_title(const char *title){
	const char *end = title + std::strlen(title);
	std::string encoded;
	for (const char *it = title; *it; ){
		int len = static_cast<int>(end - it);
		const int code = xmlGetUTF8Char(reinterpret_cast<const unsigned char *>(it), &len);
		if (code < 0 || len <= 0) return escape_markup(title);
		if (len == 1){
			append_html_byte(encoded, *it);
		}else{
			// "&#x" + up to 8 hex digits + ";" + NUL fits in 16 bytes;
			// code points above U+FFFF need more than four digits.
			char buf[16];
			std::sprintf(buf, "&#x%04X;", static_cast<unsigned int>(code));
			encoded += buf;
		}
		it += len;
	}
	return encoded;
}

// Runs a full-text search for query_str against the `str` column of the
// index in db_name and prints one
//   <li><a href="URL" target=frame>TITLE</a></li>
// line per hit to stdout, with URL and TITLE HTML-escaped.
// Returns SQLITE_OK on success, otherwise the result code of the SQLite
// call that failed. The statement and the database are released on every
// path.
int query(char *db_name, const char *query_str){
	sqlite3 *db = 0;
	int rc = sqlite3_open_v2(db_name, &db, SQLITE_OPEN_READONLY, 0);
	if (rc == SQLITE_OK) rc = register_mecab(db);
	if (rc != SQLITE_OK){
		sqlite3_close(db);
		return rc;
	}

	sqlite3_stmt *stmt_query = 0;
	rc = sqlite3_prepare_v2(db, "SELECT url, title, str from tbl where str match ?", -1, &stmt_query, 0);
	if (rc == SQLITE_OK) rc = sqlite3_bind_text(stmt_query, 1, query_str, -1, SQLITE_TRANSIENT);
	const bool ready = (rc == SQLITE_OK);

	if (ready){
		while(sqlite3_step(stmt_query) == SQLITE_ROW){
			// sqlite3_column_text() returns null for a NULL column value.
			const char *url = reinterpret_cast<const char *>(sqlite3_column_text(stmt_query, 0));
			const char *title = reinterpret_cast<const char *>(sqlite3_column_text(stmt_query, 1));
			const std::string url_enc = escape_markup(url ? url : "");
			const std::string title_enc = encode_title(title ? title : "");

			std::printf("<li><a href=\"%s\" target=frame>%s</a></li>\n", url_enc.c_str(), title_enc.c_str());
		}
	}

	// sqlite3_finalize() repeats the error of a failed step, if any.
	const int finalize_rc = sqlite3_finalize(stmt_query);
	const int close_rc = sqlite3_close(db);

	if (!ready) return rc;
	if (finalize_rc != SQLITE_OK) return finalize_rc;
	return close_rc;
}

int main(int argc, char *argv[]){
	if (argc < 4){
		return 0;
	}
	if (std::strcmp(argv[1], "create_db") == 0){
		if (argc < 4) return 0;
		parse_and_create_db(argv[2], argv[3]);
	}else if (std::strcmp(argv[1], "query") == 0){
		if (argc < 4) return 0;
		std::string query_str;
		for (int i = 3; i < argc; ++i){
			char *unescaped = xmlURIUnescapeString(argv[i], 0, 0);
			query_str += unescaped;
			xmlMemFree(unescaped);

			if (i < argc - 1) query_str += " ";
		}
		query(argv[2], query_str.c_str());
	}

	return 0;
}
