#!/usr/bin/env python
# -*- coding: utf-8 -*-
#----------------------------------------------------------------------
# Copyright (c) 2001 New Information Paradigms Ltd
#
# This Software is released under the MIT License:
# http://www.opensource.org/licenses/mit-license.html
#
#----------------------------------------------------------------------
import sys, os , codecs
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
sys.stderr = codecs.getwriter('utf-8')(sys.stderr)
if os.path.dirname(__file__) == '':
    sys.path.append('..')
else:
    sys.path.append(os.path.dirname(__file__)+'/..')
from webmonitor import *
#
import BaseHTTPServer, SimpleHTTPServer , CGIHTTPServer
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO
import urlparse


# Request headers used for the recorded session.  Starts as an empty string
# and is rebound in do_GET to a list of (name, value) tuples when a case is
# started.
headers = ''
# Upstream proxies keyed by scheme ('http' / 'https'); filled by the option
# parser below from the -p options.
proxies={}

import getopt

def usage():
    """
    Write the command-line help text to standard output, one line per entry.
    """
    help_lines = (
        'Usage: wmserver.py [OPTIONS]',
        '  Monitor script generator as working HTTP proxy.',
        'Options: ',
        '  -h,--help   : This message.',
        '  -p,--proxy  : Specified proxy. ( ex > -p http://foobar.com -p https://barbaz.com',
    )
    for text in help_lines:
        sys.stdout.write(text + '\n')
# Option parser
# Recognised options: -h/--help and -p/--proxy.  -p may be given more than
# once; each value is stored under its URL scheme, so at most one http and
# one https proxy are kept.
# NOTE: 'v' is accepted by getopt but nothing below acts on it.
try:
    opts,args = getopt.getopt(sys.argv[1:],'hp:v',['help','proxy='])
except getopt.GetoptError:
    usage()
    sys.exit(1)
for opt,arg in opts:
    if opt in ('-h','--help'):
        usage()
        sys.exit(0)
    # getopt reports long options with two leading dashes, so the long form
    # has to be matched as '--proxy' (it was '-proxy', which never matched).
    if opt in ('-p','--proxy'):
        if arg.startswith('http://'):
            proxies['http'] = arg
        elif arg.startswith('https://'):
            proxies['https'] = arg
        else:
            # Only http:// and https:// proxy URLs are supported.
            usage()
            sys.exit(2)

def html_escape ( html ):
    """
    Replace the HTML special characters & ' " < > in html with entities.

    The ampersand is substituted first so that the entities produced by the
    later substitutions are not escaped a second time.
    """
    substitutions = (
        ('&', '&amp;'),
        ('\'', '&#039;'),
        ('"', '&quot;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
    )
    for pattern, entity in substitutions:
        html = re.sub(pattern, entity, html)
    return html

def urldecode(query):
    """
    Parse a URL-encoded query string into a dict.

    query is split on '&'; each 'key=value' piece is split on its first '='
    and both halves are decoded with urllib.unquote_plus.  Pieces without
    '=' are ignored.  When a key occurs more than once the last value wins.

    Returns a dict mapping decoded keys to decoded values.
    """
    d = {}
    for pair in query.split('&'):
        if '=' not in pair:
            continue
        # Split only once: a value may itself contain '=' (for example
        # base64 padding), which used to break the two-name unpacking.
        key, value = pair.split('=', 1)
        d[urllib.unquote_plus(key)] = urllib.unquote_plus(value)
    return d

def escape_quoted_string(input):
    """
    Escape text so it can be placed inside a single-quoted Python literal.

    Backslash, carriage return, line feed and the single quote are replaced
    by their backslash escape sequences.

    Returns the escaped string.
    """
    replacements = (
        # Backslashes must go first: otherwise the backslashes added by the
        # following replacements would be doubled, and an original backslash
        # could change the meaning of (or unterminate) the literal.
        ('\\', '\\\\'),
        ('\r', '\\r'),
        ('\n', '\\n'),
        ('\'', '\\\''),
    )
    for raw, escaped in replacements:
        input = input.replace(raw, escaped)
    return input

def join_url(urischeme,urihost,uripath,uricomment,uriquery,urifrag):
    """
    Assemble a URL string from its six components.

    Empty components are left out.  The host is only emitted together with
    a non-empty scheme.  In the path every './' is removed and '//' pairs
    are replaced by '/'.  Parameters, query and fragment are prefixed with
    ';', '?' and '#' respectively.
    """
    pieces = []
    if urischeme != '':
        pieces.append(urischeme + '://' + urihost)
    if uripath != '':
        cleaned = re.sub(r'\./', '', uripath)
        pieces.append(re.sub('//', '/', cleaned))
    trailing = ((';', uricomment), ('?', uriquery), ('#', urifrag))
    for marker, component in trailing:
        if component != '':
            pieces.append(marker + component)
    return ''.join(pieces)
#------------------
# Global variables
#------------------
# <A> tags: faked link path -> (real link, anchor text, index of the anchor
# among anchors with the same text).  Rebuilt for every page served.
atag_mapping = {}
# <form> tags: faked action path -> (real action URL, params/query/fragment
# part of the action, index of the form in the page).  Rebuilt for every page.
form_mapping = {}
# Monitor case instance; None until a case is started from the entry page.
case = None
# Page titles, one appended for each step that has been executed.
titles = []
#------------------
# Proxy ( HTTP Server request handler )
#------------------
class MyHandler(CGIHTTPServer.CGIHTTPRequestHandler):
    """

    HTTP Server request handler class
    """
    def log_message(self, format, *args):
        """
        Request-log hook, overriding the BaseHTTPRequestHandler method.

        The message is handed to the parent implementation only while the
        debug level is above 1; otherwise it is dropped.
        """
        verbose = get_debuglv() > 1
        if not verbose:
            return
        CGIHTTPServer.CGIHTTPRequestHandler.log_message(self,format,*args)
    
    def __get_join_basepath(self,path):
        """
        Resolve path against the current page's <base> URL.

        Returns path unchanged when self.base is empty; otherwise returns
        eval_relative_url(self.base, path).
        """
        if self.base == '':
            return path
        # Resolve once and reuse the value for the debug line and the result
        # (the URL used to be evaluated twice).
        resolved = eval_relative_url(self.base,path)
        debug_out(5, '<base> tag falsification : ' + path + ' => ' + resolved)
        return resolved
                    
    def __get_fake_link(self,real_link):
        """
        Build the proxy-local path for a link. ( /path-to => /<host>/path-to )

        The URL-quoted host of real_link ('-' when it has none) becomes the
        first path segment, followed by the link path evaluated against
        self.path.  Parameters, query and fragment are carried over.
        """
        parsed = urlparse.urlparse(real_link)
        host = parsed[1]
        link_path = parsed[2]
        if host == '':
            # Placeholder segment for links that carry no host.
            host = '-'
        fake_path = '/' + urllib.quote(host) + '/' + eval_relative_path(self.path,link_path)
        return join_url('','',fake_path,parsed[3],parsed[4],parsed[5])

    def __base_fake(self,doc):
        """
        Record the document's <base> href, or add a <base> tag when none exists.

        When doc contains a <base> element, self.base is set to the href of
        the first one ('' if it has no href).  Otherwise self.base stays ''
        and a <base href=self.request_url target="_top"> element is
        prepended to the first <head> element, if the document has one.
        """
        nodes = find('base',None,None,doc)
        self.base = ''
        if len(nodes) > 0:
            # Base tag found
            attr = getAttributeNode(nodes[0],'href')
            if attr is not None:
                self.base = attr.nodeValue
                debug_out(4,'Find base : ' + self.base)
        else:
            # Base tag not found
            head_nodes = find('head',None,None,doc)
            if len(head_nodes) > 0:
                # Prepend base tag node.  Log the URL that is actually
                # written; self.base is always '' in this branch.
                debug_out(4,'Prepend base : ' + self.request_url)
                base_node = doc.createElement('base')
                base_node.setAttribute('href',self.request_url)
                base_node.setAttribute('target','_top')
                prependChild(head_nodes[0],base_node)
                
    def __atag_fake(self,doc):
        """
        Point every <A> href at this proxy and rebuild atag_mapping.

        atag_mapping is reset and then filled with
        faked path -> (real link, anchor text, occurrence index), where the
        occurrence index counts earlier anchors that had the same text.
        Anchors without an href are registered under the empty path.
        """
        global atag_mapping
        atag_mapping = {}
        # anchor text -> number of anchors seen so far with that text
        seen_texts = {}
        for anchor in find('a',None,None,doc):
            real_link = ''
            fals_link = ''
            txt = getNodeText(anchor)
            href = getAttributeNode(anchor,'href')
            if href != None:
                # Including query-string.
                real_link = href.nodeValue
                fals_link = self.__get_fake_link(real_link)
                href.nodeValue = 'http://' + self.headers.getheader('host') + fals_link
            occurrence = seen_texts.get(txt, 0)
            real_link = self.__get_join_basepath(real_link)
            atag_mapping[fals_link] = (real_link,txt,occurrence)
            seen_texts[txt] = occurrence + 1
            debug_out(5,'A tag : ' + real_link + ' => ' + fals_link)
        
    def __form_fake(self,doc):
        """
        Point every <form> action at this proxy and rebuild form_mapping.

        form_mapping is reset and then filled with
        faked path -> (real action URL, params/query/fragment of the action,
        index of the form in the document).  Forms without an action are
        registered under the empty path.
        """
        global form_mapping
        form_mapping = {}
        for form_no, form in enumerate(find('form',None,None,doc)):
            real_link = ''
            fals_link = ''
            query_string = ''
            action = getAttributeNode(form,'action')
            if action != None:
                # Not including query-string.
                parsed = urlparse.urlparse(action.nodeValue)
                real_link = join_url(parsed[0],parsed[1],parsed[2],'','','')
                query_string = join_url('','','',parsed[3],parsed[4],parsed[5])
                fals_link = self.__get_fake_link(real_link)
                action.nodeValue = 'http://' + self.headers.getheader('host') + fals_link
            real_link = self.__get_join_basepath(real_link)
            form_mapping[fals_link] = (real_link,query_string,form_no)
            debug_out(5,'Form tag : ' + real_link + ' => ' + fals_link)

    def __charset_fake(self,doc):
        """
        Rewrite the document's charset declaration to utf-8.

        get_charset(doc) yields a (charset, node) pair; when a node is
        returned its value is overwritten with 'text/html; charset=utf-8'.
        The charset part of the pair is not used here.
        """
        _declared, charset_node = get_charset(doc)
        if charset_node != None:
            charset_node.nodeValue = 'text/html; charset=utf-8'
        
    def __next_step(self,step):
        """
        Record step, execute it and send the rewritten page to the browser.

        The step is appended to the global case and executed, the resulting
        HTML is parsed into a DOM, its title is appended to the global
        titles list, and the <base>, <A>, <form> and charset rewrites are
        applied before the page is returned as a 200 text/html response.
        """
        case.add_step(step)
        case.execute_tail()
        builder = treebuilders.getTreeBuilder('dom')
        doc = html5lib.HTMLParser(tree=builder).parse(case.html)
        # Remember the page title; it is used when the script is generated.
        page_title = get_title(doc)
        if page_title == None:
            page_title = 'None title'
        titles.append(page_title)
        # URL of the page that was just fetched.
        self.request_url = case.req.get_full_url()
        # Rewrite the document so that navigation comes back to this proxy.
        self.__base_fake(doc)
        self.__atag_fake(doc)
        self.__form_fake(doc)
        self.__charset_fake(doc)
        # Serialise the document and force utf-8.
        body = to_string(gen_html(doc))
        # Buffer the body so Content-length can be read from the position.
        buf = StringIO()
        buf.write(body)
        length = buf.tell()
        buf.seek(0)
        self.send_response(200)
        self.send_header('Content-type','text/html')
        self.send_header('Content-length',str(length))
        self.end_headers()
        self.copyfile(buf, self.wfile)
        buf.close()
        
    def do_GET(self):
        """

        GET method handler.
        """
        try:
            global case,headers,atag_mapping,form_mapping,titles
            debug_out (1,'*********** GET ****************')
            debug_out (1,'PATH : ' + self.path)
            urischeme,urihost,uripath,uricomment,uriquery,urifrag = urlparse.urlparse(self.path)
            queries=urldecode(uriquery)
            if uripath == '/':
                # Root path.
                if queries.has_key('do'):
                    if queries['do'] == 'start':
                        # Case generator start.
                        headers = []
                        for header in urllib.unquote_plus(queries['headers']).splitlines():
                            p,v = header.split(': ')
                            headers.append((p,v))
                        case = WebMonitorCase(WebMonitorHTTPWrapper(headers,proxies))
                        case.reset()
                        titles = []
                        self.__next_step(WebMonitorStepUrl(queries['url']))
                    else:
                        # Case generator end. ( Generate python source )
                        str_script = ''
                        str_script = str_script +  '#!/usr/bin/env python' + '\n'
                        str_script = str_script +  '# -*- coding: utf-8 -*-' + '\n'
                        str_script = str_script +  'import sys , os , codecs' + '\n'
                        str_script = str_script +  'sys.stdout = codecs.getwriter(\'utf-8\')(sys.stdout)' + '\n'
                        str_script = str_script +  'sys.stderr = codecs.getwriter(\'utf-8\')(sys.stderr)' + '\n'
                        str_script = str_script +  'import webmonitor' + '\n'
                        str_script = str_script +  'webmonitor.set_debuglv(1)' + '\n'
                        str_script = str_script +  'headers = ' + str(headers) + '\n'
                        str_script = str_script +  'proxies = ' + str(proxies) + '\n'
                        str_script = str_script +  'client = webmonitor.WebMonitorHTTPWrapper(headers,proxies)' + '\n'
                        str_script = str_script +  'case = webmonitor.WebMonitorCase(client)' + '\n'
                        i = 0
                        for step in case.steps:
                            str_script = str_script +  '# ' + str(i) + ' --------- ' + re.sub(re.compile('(?:\r)?\n',re.M),' ',titles[i]) + ' ------------' + '\n'
                            if isinstance(step,WebMonitorStepUrl):
                                str_script = str_script +  'case.add_step(webmonitor.WebMonitorStepUrl(\'' + to_unicode(step.url) + '\'),[webmonitor.WebMonitorValidatorTitle(u\'' + escape_quoted_string(titles[i]) + '\')])' + '\n'
                                str_script = str_script +  '#case.add_step(webmonitor.WebMonitorStepUrl(\'' + to_unicode(step.url) + '\'),[webmonitor.WebMonitorValidatorTitle(u\'' + escape_quoted_string(titles[i]) + '\'),webmonitor.WebMonitorValidatorImg(client),webmonitor.WebMonitorValidatorLink(client),webmonitor.WebMonitorValidatorScript(client),webmonitor.WebMonitorValidatorATag(client)])' + '\n'
                            elif isinstance(step,WebMonitorStepATag):
                                str_script = str_script +  'case.add_step(webmonitor.WebMonitorStepATag(u\'' + escape_quoted_string(to_unicode(step.txt)) + '\',' + str(step.no) + '),[webmonitor.WebMonitorValidatorTitle(u\'' + escape_quoted_string(titles[i]) + '\')])' + '\n'
                                str_script = str_script +  '#case.add_step(webmonitor.WebMonitorStepATag(u\'' + escape_quoted_string(to_unicode(step.txt)) + '\',' + str(step.no) + '),[webmonitor.WebMonitorValidatorTitle(u\'' + escape_quoted_string(titles[i]) + '\'),webmonitor.WebMonitorValidatorImg(client),webmonitor.WebMonitorValidatorLink(client),webmonitor.WebMonitorValidatorScript(client),webmonitor.WebMonitorValidatorATag(client)])' + '\n'
                            elif isinstance(step,WebMonitorStepForm):
                                str_script = str_script +  'parms = {}' + '\n'
                                if step.data != None:
                                    for p,v in step.data.iteritems():
                                        str_script = str_script +  'parms[u\'' + to_unicode(p) + '\']= u\'' + escape_quoted_string(to_unicode(v)) + '\'' + '\n'
                                str_script = str_script +  'case.add_step(webmonitor.WebMonitorStepForm(parms,'+ str(step.no) +'),[webmonitor.WebMonitorValidatorTitle(u\'' + escape_quoted_string(titles[i]) + '\')])' + '\n'
                                str_script = str_script +  '#case.add_step(webmonitor.WebMonitorStepForm(parms,'+ str(step.no) +'),[webmonitor.WebMonitorValidatorTitle(u\'' + escape_quoted_string(titles[i]) + '\'),webmonitor.WebMonitorValidatorImg(client),webmonitor.WebMonitorValidatorLink(client),webmonitor.WebMonitorValidatorScript(client),webmonitor.WebMonitorValidatorATag(client)])' + '\n'
                            i = i + 1
                        str_script = str_script +  '#----- Exec once -----------------------------------' + '\n'
                        str_script = str_script +  'try:' + '\n'
                        str_script = str_script +  '    case.execute_all()' + '\n'
                        str_script = str_script +  'except webmonitor.WebMonitorError,(ex): ' + '\n'
                        str_script = str_script +  '    print >>sys.stderr, ex.get_message()' + '\n'
                        str_script = str_script +  '#----- Loop exec callback --------------------------' + '\n'
                        str_script = str_script +  'import time' + '\n'
                        str_script = str_script +  'class LoopCallback(webmonitor.WebMonitorLoopCallback):' + '\n'
                        str_script = str_script +  '    def __init__(self,interval,err_interval):' + '\n'
                        str_script = str_script +  '        self.interval = interval' + '\n'
                        str_script = str_script +  '        self.err_interval = err_interval' + '\n'
                        str_script = str_script +  '    def success(self):' + '\n'
                        str_script = str_script +  '        print time.strftime(\'%Y/%m/%d-%H:%M:%S Loop is aliving !\', time.localtime(time.time()))' + '\n'
                        str_script = str_script +  '        time.sleep(self.interval)' + '\n'
                        str_script = str_script +  '    def error(self,ex):' + '\n'
                        str_script = str_script +  '        if isinstance(ex,webmonitor.WebMonitorValidatorTitleError):' + '\n'
                        str_script = str_script +  '            pass' + '\n'
                        str_script = str_script +  '        elif isinstance(ex,webmonitor.WebMonitorValidatorImgError):' + '\n'
                        str_script = str_script +  '            pass' + '\n'
                        str_script = str_script +  '        elif isinstance(ex,webmonitor.WebMonitorValidatorLinkError):' + '\n'
                        str_script = str_script +  '            pass' + '\n'
                        str_script = str_script +  '        elif isinstance(ex,webmonitor.WebMonitorValidatorScriptError):' + '\n'
                        str_script = str_script +  '            pass' + '\n'
                        str_script = str_script +  '        elif isinstance(ex,webmonitor.WebMonitorValidatorATagError):' + '\n'
                        str_script = str_script +  '            pass' + '\n'
                        str_script = str_script +  '        else:' + '\n'
                        str_script = str_script +  '            pass' + '\n'
                        str_script = str_script +  '        print time.strftime(\'%Y/%m/%d-%H:%M:%S Loop is died ! : \', time.localtime(time.time())) + ex.get_message()' + '\n'
                        str_script = str_script +  '        time.sleep(self.err_interval)' + '\n'
                        str_script = str_script +  '    def fatal(self):' + '\n'
                        str_script = str_script +  '        print time.strftime(\'%Y/%m/%d-%H:%M:%S Script is died ! (UNKNOWN) \', time.localtime(time.time()))' + '\n'
                        str_script = str_script +  '#----- Loop exec start---- --------------------------' + '\n'
                        str_script = str_script +  '# webmonitor.web_monitor_loop(case,LoopCallback(600,3600))' + '\n'
                        # output to stdout
                        print str_script
                        # output to browser
                        f = StringIO()
                        f.write('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"><html><title>Generated script</title><body><pre>'+to_string(str_script)+'</pre></body></html>')
                        length = f.tell()
                        f.seek(0)
                        self.send_response(200)
                        self.send_header('Content-type','text/html')
                        self.send_header('Content-length',str(length))
                        self.end_headers()
                        self.copyfile(f, self.wfile)
                        f.close()
                else:
                    # Initial page.
                    headers_out = ''
                    for hn in ('User-agent','Accept','Accept-Language','Accept-Charset'):
                        hv = self.headers.getheader(hn)
                        if hv != None:
                            headers_out = headers_out + hn + ': ' + hv + '\n'
                    headers_out = headers_out + 'Keep-Alive: closed' + '\n'
                    headers_out = html_escape(headers_out)
                    f = StringIO()
                    f.write('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"><html><head><title>Entry page</title></head><body><a href="/?do=finish">gen-script-link</a><br><form method="GET">First URL : <br><input type="submit" name="do" value="start"></input><input name="url" type="text" size="80"></input><br>Headers : <br><textarea name="headers" cols="80" rows="7">'+headers_out+'</textarea><br></form></body></html>')
                    length = f.tell()
                    f.seek(0)
                    self.send_response(200)
                    self.send_header('Content-type','text/html')
                    self.send_header('Content-length',str(length))
                    self.end_headers()
                    self.copyfile(f, self.wfile)
                    f.close()
            else:
                # Proxy mode.
                if atag_mapping.has_key(self.path):
                    # <A> tag click.
                    url,txt,no = atag_mapping[self.path]
                    debug_out (4, 'A tag match : ' + self.path + ' => ' + to_unicode(txt) + ' ' + str(no) + ' => ' + to_unicode(url))
                    self.__next_step(WebMonitorStepATag(txt,no))
                elif form_mapping.has_key(uripath):
                    # Submit <form> tag.
                    url,q,no = form_mapping[uripath]
                    debug_out (4, 'Form tag match : ' + self.path + ' => ' + q + ' ' + str(no) + ' => ' + to_unicode(url))
                    self.__next_step(WebMonitorStepForm(queries,no))
                else:
                    # Some error...
                    debug_out (1, 'Not found ...')
                    self.send_response(404)
                    self.end_headers()
        except WebMonitorError,(ex):
            print >>sys.stderr, ex.get_message()
    def do_POST(self):
        """ 

        GET method handler.
        """
        try:
            # Proxy mode.
            debug_out (1,'*********** POST ****************')
            debug_out (1,'PATH : ' + self.path)
            global form_mapping
            urischeme,urihost,uripath,uricomment,uriquery,urifrag = urlparse.urlparse(self.path)
            length = self.headers.getheader('content-length')
            q = self.rfile.read(int(length))
            debug_out (3,'Read : ' + to_unicode(q))
            queries=urldecode(q)
            debug_out (3,'Unquoted-decoded : ' + to_unicode(queries))
            urischeme,urihost,uripath,uricomment,uriquery,urifrag = urlparse.urlparse(self.path)
            if form_mapping.has_key(uripath):
                # Submit <form> tag.
                url,q,no = form_mapping[uripath]
                debug_out (4, 'Form tag match : ' + self.path + ' => ' + q + ' ' + str(no) + ' => ' + to_unicode(url))
                self.__next_step(WebMonitorStepForm(queries,no))
            else:
                debug_out (1, 'Not found ...')
                self.send_response(404)
                self.end_headers()
        except WebMonitorError,(ex):
            print >>sys.stderr, ex.get_message()

# Start server
# Bind to every interface on port 8899 and report the bound address on
# stderr before entering the request loop.
httpd = BaseHTTPServer.HTTPServer(('', 8899),MyHandler)
sa = httpd.socket.getsockname()
sys.stderr.write('Serving HTTP on %s port %s ...\n' % (sa[0], sa[1]))

# Debug output is switched off before requests are served.
set_debuglv(0)
httpd.serve_forever()

# wm_server.py -p https://localhost:20080 -p http://localhost:20080
