﻿// html 
import dom;
private import std.string;
	
/**
 * Describes one kind of tag the parser can recognise: the strings that open
 * and close it, plus flags that steer how the text in between is read.
 */
class DefTag
{
	/// Bit values accepted by the constructors' flag argument.
	enum{
		HASNAME =0x01,	// the tag has a name
		HASATTR =0x02,	// the tag has attributes
		IGNORE  =0x04,	// upper/lower case is ignored when matching the tag
		HASCHILD=0x08,	// other tags may be nested inside this tag
		CHILD   =0x10,	// this tag may be nested inside another tag
		SCRIPT  =0x20,	// text after the opening tag is treated as JavaScript;
						// "</script" is not searched for inside comment tags
	};

	char[] start;		// opening delimiter
	char[] end;			// closing delimiter
	bool hasName;		// HASNAME was given
	bool ignore;		// IGNORE was given
	bool hasAttr;		// HASATTR was given
	bool hasChild;		// HASCHILD was given
	bool child;			// CHILD was given
	bool script;		// SCRIPT was given
	Node[] node;
	DefTag endTag;		// definition of the tag that terminates this one, or null
	bool isEndTag;		// set once another DefTag names this one as its endTag

	/// Creates a definition that has no dedicated terminating tag.
	this(char[]start,char[]end,int flag){
		this(start,end,flag,null);
	}

	/**
	 * Creates a definition.
	 *
	 * start/end are the delimiter strings, flag is an OR of the enum values
	 * above, endTag (may be null) is the definition that terminates this tag.
	 * A non-null endTag is marked as an end tag as a side effect.
	 */
	this(char[]start,char[]end,int flag,DefTag endTag){
		this.start = start;
		this.end = end;
		this.endTag = endTag;

		// Everything starts out off; each flag bit switches one field on.
		hasName  = false;
		ignore   = false;
		hasAttr  = false;
		hasChild = false;
		child    = false;
		script   = false;
		if((flag&HASNAME )!=0)hasName  = true;
		if((flag&IGNORE  )!=0)ignore   = true;
		if((flag&HASATTR )!=0)hasAttr  = true;
		if((flag&HASCHILD)!=0)hasChild = true;
		if((flag&CHILD   )!=0)child    = true;
		if((flag&SCRIPT  )!=0)script   = true;

		isEndTag = false;
		if(endTag!==null){
			endTag.isEndTag = true;
		}
	}
}
 
/**
 * Common base class of the nodes returned by the parser.
 */
class Node
{
	/// Constants naming the two node kinds.
	enum
	{
		TEXTNODE = 0,
		TAGNODE  = 1
	};
}
 
/**
 * A node that holds a run of text.
 */
class TextNode:Node
{
	char[] text;	// the text carried by this node

	this(){}

	/// Returns the stored text unchanged.
	char[] toString()
	{
		return text;
	}
}
 
/**
 * One attribute of a tag, stored as the five consecutive pieces of source
 * text it consists of so that it can be written back out via toString().
 */
class Attr
{
	char[] space0;	// text in front of the name
	char[] name;	// attribute name
	char[] space1;	// text between name and value
	char[] value;	// attribute value
	char[] space2;	// text after the value

	/// Creates an attribute whose pieces are all empty.
	this(){}

	/// Creates an attribute that prints as: space, name, ="value".
	this(char[] name, char[] value){
		this.name  = name;
		this.value = value;
		space0 = " ";
		space1 = "=\"";
		space2 = "\"";
	}

	/// Overwrites every piece of this attribute with attr's and returns this.
	Attr copy(Attr attr){
		name   = attr.name;
		value  = attr.value;
		space0 = attr.space0;
		space1 = attr.space1;
		space2 = attr.space2;
		return this;
	}

	/// Joins the five pieces in source order.
	char[] toString()
	{
		char[] head = space0~name;
		char[] tail = space1~value~space2;
		return head~tail;
	}
}
 
/**
 * A tag, stored as the pieces of source text it was parsed from so that
 * toString() can write it back out.
 *
 * attr is the authoritative, ordered attribute list. attrHash is a lookup
 * index over it, keyed by attribute name, that is built on first use.
 */
class TagNode:Node
{
	char[] start;			// opening delimiter
	char[] name;			// tag name
	Attr[] attr;			// attributes in source order
	char[] endspace;		// whitespace in front of the closing delimiter
	char[] end;				// closing delimiter
	DefTag deftag;			// definition this tag was matched against
	Attr[char[]] attrHash;	// attribute name -> attribute

	this(){}

	/// Writes the tag back out: start, name, every attribute, endspace, end.
	char[] toString()
	{
		char[] str = start~name;
		for(int i=0;i<attr.length;i++){
			str ~= attr[i].toString();
		}
		str ~= endspace;
		str ~= end;
		return str;
	}

	/**
	 * Fills attrHash from attr when the index is still empty.
	 *
	 * Every accessor calls this first. Previously only getAttr() did, so
	 * calling setAttr() or removeAttr() first on a node whose attr array was
	 * already filled either appended a duplicate attribute or did nothing,
	 * and left the index permanently incomplete.
	 */
	private void buildHash(){
		if(attrHash.length!=0)return;
		Attr[char[]] fresh;
		attrHash=fresh;
		for(int i=0;i<attr.length;i++){
			attrHash[attr[i].name]=attr[i];
		}
	}

	/// Returns the indexed attribute called name, or null when there is none.
	private Attr findAttr(char[] name){
		buildHash();
		// Test with `in` first so a missing key is never indexed.
		if(!(name in attrHash))return null;
		return attrHash[name];
	}

	/// Returns the value of the attribute called name, or "" when absent.
	char[] getAttr(char[] name){
		Attr found = findAttr(name);
		if(found!==null)return found.value;
		else return "";
	}

	/**
	 * Adds a copy of a, or overwrites the existing attribute of the same name
	 * with a's pieces.
	 */
	void setAttr(Attr a){
		Attr found = findAttr(a.name);
		if(found===null){
			found = new Attr();
			found.copy(a);
			attrHash[a.name]=found;
			this.attr~=found;
		}else{
			found.copy(a);
		}
	}

	/// Sets the value of attribute name, appending name="value" when absent.
	void setAttr(char[] name,char[] value){
		Attr found = findAttr(name);
		if(found===null){
			found = new Attr(name,value);
			attrHash[name]=found;
			this.attr~=found;
		}else{
			found.value=value;
		}
	}

	/**
	 * Removes the attribute called name; does nothing when it is absent.
	 *
	 * Every entry of attr carrying that name is dropped, so a name that
	 * occurs twice in the source cannot survive in the output after it has
	 * left the index.
	 */
	void removeAttr(char[] name){
		Attr found = findAttr(name);
		if(found===null)return;
		delete attrHash[name];
		Attr[] kept;
		for(int i=0;i<this.attr.length;i++){
			Attr cur = this.attr[i];
			if(cur===found || cur.name==name)continue;
			kept ~= cur;
		}
		this.attr=kept;
	}

	/**
	 * Makes this node an independent copy of node and returns this.
	 *
	 * Each attribute is cloned and the index is reset so that it is rebuilt
	 * from the clones. Sharing node's Attr objects and its associative array,
	 * as before, let edits made through the copy leak into the original.
	 */
	TagNode copy(TagNode node){
		// Clone into a local first so that copying a node onto itself works.
		Attr[] cloned;
		for(int i=0;i<node.attr.length;i++){
			Attr one = new Attr();
			one.copy(node.attr[i]);
			cloned ~= one;
		}
		start=node.start;
		name=node.name;
		attr=cloned;
		endspace=node.endspace;
		end=node.end;
		deftag=node.deftag;
		Attr[char[]] fresh;
		attrHash=fresh;
		return this;
	}
	// +--start
	// |+--name
	// ||   +--attr[0].space0
	// ||   |+-attr[0].name
	// ||   ||+attr[0].space1
	// ||   ||| +attr[0].value
	// ||   ||| |   +attr[0].space2
	// ||   ||| |   |           +--end
	// ||   ||| |   |           |
	// <name a="data"  b="data" >
	//               | || |   ||
	//               | || |   |+--endspace
	//               | || |   +attr[1].space2
	//               | || +attr[1].value
	//               | |+attr[1].space1
	//               | +-attr[1].name
	//               +-attr[1].space0

	/**
	 * Converts this tag into a Dom of type BLOCK named "tag".
	 *
	 * start, name, end and endspace become attributes of the block. Each
	 * attribute becomes a child of type VAR created from the attribute name,
	 * carrying space0, space1, value and space2 as its own attributes.
	 */
	Dom toDom()
	{
		Dom tagnode= new Dom(Dom.BLOCK,"tag");
		tagnode.setAttr("start",start);
		tagnode.setAttr("name",name);
		tagnode.setAttr("end",end);
		tagnode.setAttr("endspace",endspace);
		for(int i=0;i<attr.length;i++){
			tagnode.add(newDom(Dom.VAR,attr[i].name)
				.setAttr("space0",attr[i].space0)
				.setAttr("space1",attr[i].space1)
				.setAttr("value",attr[i].value)
				.setAttr("space2",attr[i].space2));
		}
		return tagnode;
	}
}
 
/**
 * Rebuilds a TagNode from a Dom.
 *
 * The start, name, end and endspace attributes of dom become the tag's
 * pieces, and every child of type VAR becomes one attribute. Children of any
 * other type are skipped; previously each of them registered an attribute
 * whose pieces were all empty.
 *
 * Returns null when dom is null or is not of type BLOCK.
 */
TagNode newTagNode(Dom dom)
{
	if(dom===null || dom.type != Dom.BLOCK){
		return null;
	}
	TagNode ddom = new TagNode();
	ddom.start=dom.getAttr("start");
	ddom.name=dom.getAttr("name");
	ddom.end=dom.getAttr("end");
	ddom.endspace=dom.getAttr("endspace");
	for(int i=0;i<dom.array.length;i++){
		Dom d=dom.array[i];
		if(d.type!=Dom.VAR)continue;	// only VAR children describe attributes
		Attr attr = new Attr();
		attr.space0=d.attr["space0"];
		attr.name=d.value;
		attr.space1=d.attr["space1"];
		attr.value=d.attr["value"];
		attr.space2=d.attr["space2"];
		ddom.setAttr(attr);
	}
	return ddom;
}
 
class HtmlParser 
{
	
	/**
	 * Splits str, starting at the current pos, into text nodes and tag nodes.
	 *
	 * defTags lists the tag kinds to look for; hasChild is handed on to
	 * getShortNode(). Parsing stops at the end of str, or right after a tag
	 * whose definition is marked as an end tag. pos is advanced past
	 * everything that was consumed.
	 */
	Node[] parse(char[] str,bool hasChild,DefTag[] defTags){
		Node[] result;
		TextNode pending=new TextNode();
		while(pos<str.length){
			// Does a tag start at the current position?
			TagNode found = getShortNode(str,hasChild,defTags);
			if(found===null){
				// No: the character belongs to the running text.
				pending.text ~= str[pos];
				pos++;
				continue;
			}

			// Flush the text collected so far, then record the tag.
			if(pending.text.length!=0)
				result ~= pending;
			result ~= found;
			if(found.deftag.isEndTag){
				return result;
			}
			if(found.deftag.endTag!==null){
				// Treat everything up to the end tag as text,
				// but still parse tags that are allowed to be children.
				DefTag[] inner;
				for(int i=0;i<defTags.length;i++){
					if(defTags[i].child)inner ~= defTags[i];
				}
				inner ~= found.deftag.endTag;
				if(found.deftag.script){
					inner ~= new DefTag("<!--","-->",DefTag.HASCHILD);
				}
				Node[] body = parse(str,false,inner);
				result = result~body;
			}
			pending = new TextNode();
		}
		if(pending.text.length!=0)result~=pending;
		return result;
	}
 
	DefTag[] dts;	// tag definitions used by parse(char[] str)

	/// States used while scanning text and tags.
	enum{
		TEXT,
		TAG_START,
		TAG_NAME,
		ATTR_SPACE0,
		ATTR_NAME,
		ATTR_SPACE1_SPACE1,
		ATTR_SPACE1_EQUAL,
		ATTR_SPACE1_SPACE2,
		ATTR_SPACE1_QUOT,
		ATTR_VALUE,
		ATTR_SPACE2_QUOT,
		ATTR_VALUE2,
	};

	int pos;	// current read offset into the string being parsed
 
	/**
	 * Creates a parser with the built-in tag definitions, in this order:
	 * plaintext, textarea, script, comment, "</", "<.../>" and "<...>".
	 */
	this(){
		// Flag sets shared by several definitions.
		int raw   = DefTag.IGNORE|DefTag.HASATTR|DefTag.HASCHILD;
		int named = DefTag.HASNAME|DefTag.HASATTR|DefTag.HASCHILD;

		// The end tag of plaintext has empty delimiters.
		DefTag plaintextEnd = new DefTag("","",0);
		dts ~= new DefTag("<plaintext",">",raw,plaintextEnd);

		DefTag textareaEnd = new DefTag("</textarea",">",raw);
		dts ~= new DefTag("<textarea",">",raw,textareaEnd);

		DefTag scriptEnd = new DefTag("</script",">",raw);
		dts ~= new DefTag("<script",">",raw|DefTag.SCRIPT,scriptEnd);

		dts ~= new DefTag("<!--","-->",DefTag.HASCHILD);
		dts ~= new DefTag("</",">",named);
		dts ~= new DefTag("<","/>",named);
		dts ~= new DefTag("<",">",named);
	}
 
	/// Creates a parser that uses the caller-supplied tag definitions.
	this(DefTag[] defTags){
		dts = defTags;
	}
 
	/// Parses str from its beginning with this parser's tag definitions.
	Node[] parse(char[] str){
		pos = 0;
		Node[] nodes = parse(str,false,dts);
		return nodes;
	}
 
	/**
	 * Tries every definition in defTags at the current pos and returns the
	 * complete tag that ends earliest, or null when none matches.
	 *
	 * A definition is skipped when its start string is shorter than that of
	 * the most recent complete match, or when hasChild is set and the
	 * definition is not marked as a child. On success pos is left just past
	 * the returned tag; otherwise pos is unchanged.
	 */
	private TagNode getShortNode(char[] str,bool hasChild,DefTag[] defTags){
		int origin=pos;			// every attempt restarts from here
		int bestEnd=int.max;	// end position of the best match so far
		long minStart=-1;		// start length of the most recent complete match
		TagNode best;
		for(int i=0;i<defTags.length;i++){
			DefTag cand = defTags[i];
			if(minStart<=cand.start.length && (!hasChild || cand.child)){
				TagNode got = tagParse(str,cand,defTags);
				// A tag whose end is empty ran into the end of the input.
				if(got!==null && got.end!=""){
					minStart = got.deftag.start.length;
					if(pos<bestEnd){
						best=got;
						bestEnd=pos;
					}
				}
				pos = origin;
			}
		}
		if(best!==null)pos=bestEnd;
		return best;
	}
 
	/**
	 * Tries to read one tag of kind `tag` from str, starting at the current pos.
	 *
	 * Runs a character-driven state machine over the tag's opening delimiter,
	 * name, attributes and closing delimiter. When tag.hasChild is set, nested
	 * tags found via getShortNode() are copied verbatim (through toString())
	 * into the piece currently being collected.
	 *
	 * Returns the tag with pos just past its closing delimiter. Returns null
	 * when str does not begin with tag.start at pos, or when a required name
	 * is missing. When the input ends before the closing delimiter is seen,
	 * the tag is returned with an empty `end` (or null, see the last lines).
	 * pos is not restored on failure; getShortNode() does that.
	 */
	private TagNode tagParse(char[] str,DefTag tag,DefTag[] defTags) 
	{
		TagNode tagnode = new TagNode();
		tagnode.deftag=tag;
		Attr attr = null;	// attribute currently being collected
		int state=TAG_START;
		int len;
		while(pos<str.length){
			char c=str[pos];
			switch(state){
			case TAG_START:
				// opening delimiter
				if(checkString(str,pos,tag.start,tag.ignore)){
					tagnode.start=tag.start;
					if(tag.hasName){
						state=TAG_NAME;
					}else
					if(tag.hasAttr){
						// no name: attributes follow the delimiter directly
						state=ATTR_SPACE0;
						attr = new Attr();
					}else{
						state=TAG_NAME;
					}
					pos+=tag.start.length;
					break;
				}
				return null;
				break;
			case TAG_NAME:
				if(tag.hasAttr){
					// whitespace ends the name and starts the attribute list
					if(c==' '|| c=='\r' || c=='\n' || c=='\t'){
						if(tag.hasName && tagnode.name=="")return null;// a tag name is required but missing
						state = ATTR_SPACE0;
						attr = new Attr();
						break;
					}
				}
				// closing delimiter
				if(checkString(str,pos,tag.end,tag.ignore)){
					if(tag.hasAttr && tagnode.name=="")return null;// a tag name is required but missing
					tagnode.end=tag.end;
					//state=TEXT;
					pos+=tag.end.length;
					return tagnode;
				}
				if(tag.hasChild){
					// a nested tag becomes part of the name text
					TagNode child=getShortNode(str,true,defTags);
					if(child!==null){
						tagnode.name ~= child.toString();
						break;
					}
				}
				tagnode.name ~= c;
				pos++;
				break;
			case ATTR_SPACE0:
				// whitespace in front of an attribute name
				if(c==' '|| c=='\r' || c=='\n' || c=='\t'){
					attr.space0 ~= c;
					pos++;
					break;
				}
				// closing delimiter
				if(checkString(str,pos,tag.end,tag.ignore)){
					// the whitespace collected belongs to the tag, not to an attribute
					tagnode.endspace=attr.space0;
					tagnode.end=tag.end;
					pos+=tag.end.length;
					return tagnode;
				}
				// anything else starts an attribute name; c is re-read in ATTR_NAME
				state = ATTR_NAME;
				tagnode.attr ~= attr;
				break;
			case ATTR_NAME:
				if(c==' '|| c=='\r' || c=='\n' || c=='\t'){
					state = ATTR_SPACE1_SPACE1;
					break;
				}
				if(c=='='){
					state = ATTR_SPACE1_EQUAL;
					break;
				}
				// closing delimiter
				if(checkString(str,pos,tag.end,tag.ignore)){
					tagnode.end = tag.end;
					pos+=tag.end.length;
					return tagnode;
				}

				if(tag.hasChild){
					// a nested tag becomes part of the attribute name
					TagNode child=getShortNode(str,true,defTags);
					if(child!==null){
						attr.name ~= child.toString();
						break;
					}
				}
				attr.name ~= c;
				pos++;
				break;
			case ATTR_SPACE1_SPACE1:
				// whitespace after the attribute name, before a possible '='
				if(c==' '|| c=='\r' || c=='\n' || c=='\t'){
					if(tag.hasChild){
						TagNode child=getShortNode(str,true,defTags);
						if(child!==null){
							attr.space1 ~= child.toString();
							break;
						}
					}
					attr.space1 ~= c;
					pos++;
					break;
				}
				if(c == '='){
					state = ATTR_SPACE1_EQUAL;
					break;
				}
				// closing delimiter
				if(checkString(str,pos,tag.end,tag.ignore)){
					tagnode.end = tag.end;
					pos+=tag.end.length;
					return tagnode;
				}
				// No '=' followed: the attribute has no value. The whitespace
				// collected so far becomes the leading space of the next attribute.
				char[] tmp=attr.space1;
				attr.space1="";
				attr = new Attr();
				attr.space0=tmp;
				state=ATTR_SPACE0;
				break;
			case ATTR_SPACE1_EQUAL:
				// consume the '=' itself
				attr.space1 ~= c;
				pos++;
				state=ATTR_SPACE1_SPACE2;
				break;
			case ATTR_SPACE1_SPACE2:
				// whitespace between '=' and the value
				if(c==' '|| c=='\r' || c=='\n' || c=='\t'){
					if(tag.hasChild){
						TagNode child=getShortNode(str,true,defTags);
						if(child!==null){
							attr.space1 ~= child.toString();
							break;
						}
					}
					attr.space1 ~= c;
					pos++;
					break;
				}
				if(c=='\"'){
					state=ATTR_SPACE1_QUOT;
					break;
				}
				// closing delimiter
				if(checkString(str,pos,tag.end,tag.ignore)){
					tagnode.end=tag.end;
					pos+=tag.end.length;
					return tagnode;
				}
				state = ATTR_VALUE2;
				break;
			case ATTR_SPACE1_QUOT:
				// consume the opening double quote
				attr.space1 ~= c;
				pos++;
				state=ATTR_VALUE;
				break;
			case ATTR_VALUE:
				// value enclosed in double quotes: runs up to the next '"'
				if(c=='\"'){
					state=ATTR_SPACE2_QUOT;
					break;
				}
				if(tag.hasChild){
					TagNode child=getShortNode(str,true,defTags);
					if(child!==null){
						attr.value ~= child.toString();
						break;
					}
				}
				attr.value ~= c;
				pos++;
				break;
			case ATTR_SPACE2_QUOT:
				// consume the closing double quote
				attr.space2 ~= c;
				pos++;
				// closing delimiter
				if(checkString(str,pos,tag.end,tag.ignore)){
					tagnode.end=tag.end;
					pos+=tag.end.length;
					return tagnode;
				}
				attr = new Attr();
				state=ATTR_SPACE0;
				break;
			case ATTR_VALUE2:// value not enclosed in double quotes
				if(c==' '|| c=='\r' || c=='\n' || c=='\t'){
					// whitespace ends the value; it is collected as the next space0
					attr = new Attr();
					state=ATTR_SPACE0;
					break;
				}
				// closing delimiter
				if(checkString(str,pos,tag.end,tag.ignore)){
					tagnode.end=tag.end;
					pos+=tag.end.length;
					return tagnode;
				}
				if(tag.hasChild){
					TagNode child=getShortNode(str,true,defTags);
					if(child!==null){
						attr.value ~= child.toString();
						break;
					}
				}
				attr.value ~= c;
				pos++;
				break;
			}
		}
		// End of input reached without a closing delimiter.
		if(tag.hasAttr && tagnode.name=="")return null;// a tag name is required but missing
		return tagnode;
	}
 
	/**
	 * Tells whether str contains str2 at offset pos.
	 *
	 * With ignore set, both sides are lower-cased before they are compared.
	 * An empty str2 never matches, and neither does a str2 that would reach
	 * past the end of str.
	 */
	private bool checkString(char[]str,int pos,char[] str2,bool ignore){
		int len=str2.length;
		if(len==0)return false;
		if(str.length<pos+len)return false;

		char[] window=str[pos..pos+len];
		if(ignore){
			if(tolower(window)==tolower(str2))return true;
			return false;
		}
		if(window==str2)return true;
		return false;
	}
 
} 
  
version(HTML_TEST){ 
	/// Parses a tiny document and prints every resulting node with its index.
	void main()
	{
		HtmlParser p = new HtmlParser();
		Node[] parsed = p.parse("<html></html>");
		int i=0;
		while(i<parsed.length){
			printf("%d:%.*s\n",i,parsed[i].toString());
			i++;
		}
	}
}
  
