
// Copyright Mocchi 2019
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)

#include "tidy.h"
#include "tidybuffio.h"

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
#include <map>

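// Overview
// [(1)] Read the HTML file named by the first command-line argument.
// [(2)] When ADD_HTMLBODYTAG is defined, wrap the file contents in <html> and <body> tags.
// [(3)] Parse the result with HTML Tidy.
// [(4)] Walk the parsed nodes depth-first and, for every table tag, build a table storage array
//       (a two-dimensional array holding each td node together with the information extracted from it).
// [(5)] Extract the information from every collected td node with the ParseTD function.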

/// =============================
/// ===== String processing =====
/// =============================
// Removes every line-break character ('\n' and '\r') from the null-terminated
// string stored in buf, compacting the remaining bytes in place.
//
// buf : buffer whose bp points at a null-terminated string (or is NULL, in
//       which case nothing happens).
//
// After the call the string is terminated right behind the last kept byte and
// buf.size is reduced by the number of removed bytes, so a later append lands
// directly after the compacted text instead of behind stale bytes.
void remove_cr(TidyBuffer &buf){
	if (!buf.bp) return; // nothing has been allocated, so there is no string to scan

	byte *dst = buf.bp;
	unsigned int removed = 0;
	// The scan is driven by the SOURCE pointer so that it stops exactly at the
	// terminator and never reads past the end of the string.
	for (const byte *src = buf.bp; *src; ++src){
		if (*src == '\n' || *src == '\r'){
			++removed;
			continue;
		}
		*dst++ = *src;
	}
	*dst = '\0';

	// Keep the recorded length consistent with the shortened string.
	if (removed <= buf.size) buf.size -= removed;
	else buf.size = 0;
}

// Appends exactly len bytes starting at str to dest and then adds a null
// terminator, so that dest.bp can always be used as a C string.
//
// dest : buffer to extend.
// str  : first byte to copy (does not need to be null-terminated).
// len  : number of bytes to copy.
void append_nstring(TidyBuffer &dest, const char *str, size_t len){
	// A terminator written by a previous call is counted in dest.size; step back
	// over it so the new bytes continue the existing string. (A caller that
	// repositions the terminator itself before appending would not need this.)
	const bool ends_with_terminator = dest.size > 0 && dest.bp[dest.size - 1] == '\0';
	if (ends_with_terminator){
		--dest.size;
	}

	// tidyBufAppend takes a non-const pointer, hence the const_cast.
	char *bytes = const_cast<char *>(str);
	tidyBufAppend(&dest, bytes, len);
	tidyBufPutByte(&dest, '\0');
}

// Measures the null-terminated string str and appends everything up to (not
// including) its terminator to dest; append_nstring adds a fresh terminator.
void append_string(TidyBuffer &dest, const char *str){
	const size_t length = std::strlen(str);
	append_nstring(dest, str, length);
}

// Finds the first span enclosed in double quotes in src and appends its
// contents (without the quotes, followed by a null terminator) to dest.
//
// src  : null-terminated text to search; NULL is accepted and treated as
//        "nothing found", so the result of a previous call can be fed straight
//        back in.
// dest : buffer that receives the extracted text.
//
// Returns a pointer to the character following the closing quote, or NULL when
// no complete quoted span exists; in that case dest is left untouched.
const char *extract_quoted_string(const char *src, TidyBuffer &dest){
	if (!src) return 0;

	const char *first_quote = std::strchr(src, '"');
	if (!first_quote) return 0;

	const char *second_quote = std::strchr(first_quote + 1, '"');
	if (!second_quote) return 0;

	// Bytes strictly between the two quote characters.
	const size_t inner_len = static_cast<size_t>(second_quote - first_quote) - 1;
	append_nstring(dest, first_quote + 1, inner_len);
	return second_quote + 1;
}

/// ==================================
/// ===== Tree structure parsing =====
/// ==================================
// One table cell: the td node paired with the buffer that receives the
// information extracted from it (filled in by ParseTD).
typedef std::pair<TidyNode, TidyBuffer> td_type;
// One table row: its cells in the order they were collected.
typedef std::vector<td_type> tr_type;
// One table: its rows in the order they were collected.
typedef std::vector<tr_type> table_type;

// Returns the node that follows cur in a depth-first (pre-order) walk.
//
// cur        : node the walk is currently at.
// skip_child : when true, the children of cur are not visited.
// depth      : running nesting level; incremented when the walk descends to a
//              child and decremented once for every level it climbs back up.
//
// Returns NULL when the walk has climbed past the root.
TidyNode DepthFirstNext(TidyNode cur, bool skip_child, int &depth){
	// 1) Descend to the first child, unless the caller asked to skip the subtree.
	if (!skip_child){
		if (TidyNode first_child = tidyGetChild(cur)){
			++depth;
			return first_child;
		}
	}

	// 2) Otherwise move sideways to the next sibling.
	if (TidyNode sibling = tidyGetNext(cur)) return sibling;

	// 3) Otherwise climb: the next node is the nearest ancestor's next sibling.
	//    Every ancestor examined costs one level of depth, including the final
	//    step that discovers there is no parent left.
	for (TidyNode ancestor = tidyGetParent(cur); ; ancestor = tidyGetParent(ancestor)){
		--depth;
		if (!ancestor) return 0;
		if (TidyNode ancestor_sibling = tidyGetNext(ancestor)) return ancestor_sibling;
	}
}

// Extracts the information of interest from one TD cell into td.second.
// Examples of what is extracted:
// <td><input type="button" onclick="open_form("HogeHogeForm.php", "type1") value="Open"></input></td>
//   => HogeHogeForm.php?type=type1
// <td><img src="./TestImage.png"></td>
//   => ./TestImage.png
// <td><a href="./HogeHoge.html">Link</a></td>
//   => ./HogeHoge.html
// <td>hello</td>
//   => hello
//
// doc : document that owns the node (needed to render text nodes).
// td  : first = the td node to inspect, second = buffer receiving the result.
void ParseTD(TidyDoc doc, td_type &td){
	TidyNode td_node = td.first;
	TidyBuffer &out = td.second;
	int depth = 1; // the children of td_node are at level 1; the walk ends once it climbs back to 0
	bool skip_child = false;
	for(TidyNode cur = tidyGetChild(td_node); depth > 0 && cur; cur = DepthFirstNext(cur, skip_child, depth)){
		skip_child = false;
		ctmbstr name = tidyNodeGetName(cur);
		if (!name){
			// ****************************
			// *** adapt to the situation ***
			// Unnamed node (e.g. text). When something such as an href was already
			// extracted, the child text is not needed, so the text is taken only
			// while the buffer is still empty.
			if (out.size == 0){
				tidyNodeGetText(doc, cur, &out);
				// The buffer stays unallocated when nothing was written.
				if (out.bp) remove_cr(out);
			}
			// ****************************
			continue;
		}

		// A table nested inside this cell is collected as a table of its own, so
		// its subtree is skipped here to avoid reading the same content twice.
		if (std::strcmp(name, "table") == 0){
			skip_child = true;
			continue;
		}

		for (TidyAttr attr = tidyAttrFirst(cur); attr; attr = tidyAttrNext(attr)){
			const char *attrname = tidyAttrName(attr);
			if (!attrname) continue;
			ctmbstr value = tidyAttrValue(attr);
			if (!value) continue;
			// ****************************
			// *** adapt to the situation ***
			if (std::strcmp(attrname, "onclick") == 0){
				// First quoted string = target, second quoted string = type.
				// The second lookup only makes sense (and is only safe) when the
				// first one succeeded and told us where to continue.
				const char *rest = extract_quoted_string(value, out);
				if (rest){
					append_string(out, "?type=");
					extract_quoted_string(rest, out);
				}
			}else if (std::strcmp(attrname, "href") == 0 || std::strcmp(attrname, "src") == 0){
				append_string(out, value);
			}
			// ****************************
		}
	}
}

// When defined, main() wraps the file contents in <html><body> ... </body></html>
// before handing them to Tidy.
// NOTE(review): presumably meant for input where the body tag is omitted, so that
// the parse still yields a usable tree - confirm.
#define ADD_HTMLBODYTAG

// Entry point: prints, for every table found in the HTML file argv[1], a
// "[table:N]" header followed by one comma-separated line per collected row.
//
// Returns 0 on success (and when no file name was given), EXIT_FAILURE when the
// file cannot be read or Tidy cannot parse it.
int main(int argc, char *argv[]){
	// **************
	// [(1)] Read the HTML file named by the first command-line argument.
	// **************
	if (argc < 2) return 0;
	FILE *fp = std::fopen(argv[1], "r");
	if (!fp){
		std::fprintf(stderr, "cannot open \"%s\"\n", argv[1]);
		return EXIT_FAILURE;
	}
	// Determine an upper bound for the number of bytes to read.
	long file_len = -1;
	if (std::fseek(fp, 0, SEEK_END) == 0) file_len = std::ftell(fp);
	if (file_len < 0){
		std::fprintf(stderr, "cannot determine the size of \"%s\"\n", argv[1]);
		std::fclose(fp);
		return EXIT_FAILURE;
	}
	std::rewind(fp);
	const size_t sz = static_cast<size_t>(file_len);

	// **************
	// [(2)] When ADD_HTMLBODYTAG is defined, wrap the file contents in <html> and <body> tags.
	// **************
	size_t header = 0;
	TidyBuffer html;
	tidyBufInit(&html);
#ifdef ADD_HTMLBODYTAG
	append_string(html, "<html><body>");
	header = html.size-1; // html.size counts the terminator; the file data overwrites it
#endif
	tidyBufCheckAlloc(&html, header + sz, 0);
	const size_t sz_r = std::fread(html.bp + header, 1, sz, fp);
	// With the header present, size still counts one terminator slot, which now
	// lies right behind the file data; append_string below steps back over it.
	html.size += sz_r;
	std::fclose(fp);
#ifdef ADD_HTMLBODYTAG
	append_string(html, "</body></html>");
#else
	tidyBufPutByte(&html, '\0');
#endif

	// **************
	// [(3)] Parse with HTML Tidy.
	// **************
	TidyDoc doc = tidyCreate();
	if (!doc){
		std::fprintf(stderr, "cannot create the tidy document\n");
		return EXIT_FAILURE;
	}
	tidySetCharEncoding(doc, "shiftjis");
	tidyOptSetBool(doc, TidyShowInfo, no);
	tidyOptSetBool(doc, TidyShowWarnings, no);
#ifdef NDEBUG
	tidyOptSetBool(doc, TidyShowErrors, no);
#endif
	if (tidyParseString(doc, reinterpret_cast<ctmbstr>(html.bp)) < 0){
		std::fprintf(stderr, "cannot parse \"%s\"\n", argv[1]);
		tidyRelease(doc);
		return EXIT_FAILURE;
	}
//	tidyRunDiagnostics(doc);

	std::vector<table_type> tables;

	// Tables currently open during the walk.
	// first: index of the table in `tables`, second: depth of its table node.
	std::vector<std::pair<size_t, int> > table_stack;

	// **************
	// [(4)] Walk the parsed nodes depth-first and, for every table tag, build a table storage array
	//       (a two-dimensional array holding each td node and the information extracted from it).
	// **************
	int depth = 0;
	for(TidyNode cur = tidyGetRoot(doc); cur; cur = DepthFirstNext(cur, false, depth)){
		// Tables whose node is at the current depth or deeper have been left.
		while(!table_stack.empty() && table_stack.back().second >= depth){
			table_stack.pop_back();
		}
		ctmbstr name = tidyNodeGetName(cur);
		if (!name) continue;

		if (std::strcmp(name, "table") == 0){
			table_stack.push_back(std::make_pair(tables.size(), depth));
			tables.push_back(table_type());

		}else if (!table_stack.empty()){
			table_type &cur_table = tables[table_stack.back().first];
			// NOTE(review): "th" starts a new row here exactly like "tr" - confirm this is intended.
			if (std::strcmp(name, "tr") == 0 || std::strcmp(name, "th") == 0){
				cur_table.push_back(tr_type());
			}else if(std::strcmp(name, "td") == 0){
				// A cell seen before any row would otherwise call back() on an empty vector.
				if (cur_table.empty()) cur_table.push_back(tr_type());
				cur_table.back().push_back(std::make_pair(cur, TidyBuffer()));
				tidyBufInit(&cur_table.back().back().second);
			}
		}
	}

	// **************
	// [(5)] Extract the information from every collected td node with ParseTD.
	// **************
	for (size_t k = 0; k < tables.size(); ++k){
		table_type &table = tables[k];
		// size_t has no C++03 printf conversion; print it as unsigned long.
		std::printf("[table:%lu]\n", static_cast<unsigned long>(k));
		for (size_t j = 0; j < table.size(); ++j){
			tr_type &tr = table[j];
			for (size_t i = 0; i < tr.size(); ++i){
				td_type &td = tr[i];
				ParseTD(doc, td);
				// An empty cell leaves the buffer unallocated; print nothing for it.
				if (td.second.bp) std::printf("%s", reinterpret_cast<const char *>(td.second.bp));
				if (i + 1 < tr.size()) std::printf(",");
			}
			std::printf("\n");
		}
	}

	tidyRelease(doc);

	return 0;
}
