# -*- coding: UTF-8 -*-
#
#  script.py - a Sakura Script parser
#  Copyright (C) 2001, 2002 by Tamito KAJIYAMA
#  Copyright (C) 2004 by Shyouzou Sugitani <shy@debian.or.jp>
#
#  This program is free software; you can redistribute it and/or modify it
#  under the terms of the GNU General Public License (version 2) as
#  published by the Free Software Foundation.  It is distributed in the
#  hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
#  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
#  PURPOSE.  See the GNU General Public License for more details.
#
# $Id: script.py,v 1.1 2004/04/10 12:14:04 atzm Exp $
#

import re
import string
import sys

# Token kinds produced by Parser.tokenize().
(TOKEN_TAG,
 TOKEN_META,
 TOKEN_OPENED_SBRA,
 TOKEN_CLOSED_SBRA,
 TOKEN_NUMBER,
 TOKEN_STRING) = range(1, 7)

# Ordered (token kind, compiled regex) table.  Parser.tokenize() tries
# the entries from top to bottom at the current position and takes the
# first one that matches, so the order is significant.
patterns = [
    # Backslash tags.
    # NOTE(review): "+---" in the first character class is a range from
    # "+" to "-", so it also admits ","; confirm whether that is wanted.
    (TOKEN_TAG, re.compile(
        r"\\[ehunjcxtqzy*v0123456789fmia!&+---]"
        r"|\\[sb][0-9]?"
        r"|\\w[0-9]"
        r"|\\_[wqslvVbe+cumna]"
        r"|\\__[ct]"
        r"|\\URL")),
    # Percent-prefixed meta strings.
    (TOKEN_META, re.compile(
        r"%month|%day|%hour|%minute|%second"
        r"|%username|%selfname2?|%keroname|%friendname|%songname"
        r"|%screen(width|height)"
        r"|%exh|%et|%m[szlchtep?]|%dms|%j|%c")),
    (TOKEN_NUMBER, re.compile(r"[0-9]+")),
    (TOKEN_OPENED_SBRA, re.compile(r"\[")),
    (TOKEN_CLOSED_SBRA, re.compile(r"\]")),
    # Ordinary text, including the escapes \\, \% and \].
    (TOKEN_STRING, re.compile(r"(\\\\|\\%|\\\]|[^\\\[\]%0-9])+")),
    # Fallback: a lone "%" or "\" that none of the patterns above
    # accepted.  Parser.parse() reports these as unknown meta/tag.
    (TOKEN_STRING, re.compile(r"[%\\]")),
]

# Node kinds of the script tree returned by Parser.parse().
SCRIPT_TAG, SCRIPT_TEXT = 1, 2

# Chunk kinds inside a text tuple.
TEXT_META, TEXT_STRING = 1, 2

class ParserError(Exception):
    """Error raised when a script cannot be parsed.

    Attributes:
      message -- human readable description of the problem
      column  -- offset in the script where the problem was detected,
                 or None when no position was given
      length  -- length of the offending token, or None when not given
    """

    def __init__(self, message, column=None, length=None):
        self.message, self.column, self.length = message, column, length

    def __str__(self):
        # An unknown position is rendered as "??".
        where = self.column
        if where is None:
            where = "??"
        return "ParserError: column %s: %s" % (where, self.message)

class Parser:
    r"""Tokenizer and parser for Sakura Script strings.

    parse() turns a script into a list of nodes:

      (SCRIPT_TAG, name, arg, ...)  -- a tag followed by its arguments
      (SCRIPT_TEXT, (chunk, ...))   -- a run of text

    A text chunk is (TEXT_STRING, string) or (TEXT_META, name[, arg]).
    A tag argument is either a plain string or a tuple of text chunks.

    While parse() runs, the working state (the remaining tokens, the
    column and length of the current token) is stored on the instance.

    The `error` scheme only affects a stray "\" or "%" in the text:
    "strict" raises ParserError, "loose" writes a warning to stderr and
    keeps the character as text.  Every other problem always raises.
    """

    # Tags without argument that do not change the current scope.
    _PLAIN_TAGS = ("\\a", "\\c", "\\e", "\\t", "\\_e",
                   "\\v", "\\x", "\\y", "\\z", "\\_q",
                   "\\4", "\\5", "\\6", "\\7", "\\_s",
                   "\\2", "\\*", "\\-", "\\+", "\\_+",
                   "\\_n", "\\_V", "\\__c", "\\__t")
    # Tags followed by "[" ID "]".
    _ID_TAGS = ("\\i", "\\j", "\\&", "\\_u", "\\_m")
    # Tags followed by "[" Text "]".
    _TEXT_TAGS = ("\\_b", "\\_c", "\\_l", "\\_v", "\\m",
                  "\\3", "\\8", "\\9")

    def __init__(self, error="strict"):
        """Create a parser; `error` must be "strict" or "loose".

        Raises ValueError for any other value.
        """
        if error not in ("strict", "loose"):
            raise ValueError("unknown error scheme: %s" % str(error))
        self.error = error

    def tokenize(self, s):
        """Split `s` into a list of (token kind, lexeme) pairs.

        At each position the first entry of `patterns` that matches
        wins.  RuntimeError is raised if no entry matches.
        """
        tokens = []
        pos = 0
        end = len(s)
        while pos < end:
            for token, pattern in patterns:
                match = pattern.match(s, pos)
                if match:
                    break
            else:
                raise RuntimeError("should not reach here")
            tokens.append((token, s[pos:match.end()]))
            pos = match.end()
        return tokens

    def next_token(self):
        """Consume and return the next (token kind, lexeme) pair.

        Advances self.column past the previous token and records the
        length of the new one.  Raises ParserError when no token is
        left.
        """
        try:
            token, lexeme = self.tokens.pop(0)
        except IndexError:
            raise ParserError("unexpected end of script",
                              self.column + self.length)
        self.column += self.length
        self.length = len(lexeme)
        return token, lexeme

    def _complain(self, message):
        """Raise in strict mode, otherwise warn on stderr and return."""
        if self.error == "strict":
            raise ParserError(message, self.column)
        sys.stderr.write("Warning: column %d: %s\n" % (self.column, message))

    def parse(self, s):
        """Parse the script `s` and return its list of nodes.

        An empty script yields an empty list.  Raises ParserError on a
        syntax error (see the class docstring for the effect of the
        error scheme).
        """
        if not s:
            return []
        # tokenize the script
        self.tokens = self.tokenize(s)
        self.column = 0
        self.length = 0
        # parse the sequence of tokens
        script = []
        text = []           # chunks of the text node being built
        string_chunks = []  # pieces of the TEXT_STRING being built
        scope = 0           # 0 after \0 or \h, 1 after \1 or \u
        anchor = None       # column of a \_a that is still open
        while self.tokens:
            token, lexeme = self.next_token()
            if token == TOKEN_STRING and lexeme == "\\":
                self._complain("unknown tag")
            elif token == TOKEN_STRING and lexeme == "%":
                self._complain("unknown meta string")
            # Outside of tag arguments numbers and brackets are text.
            if token in (TOKEN_NUMBER, TOKEN_OPENED_SBRA,
                         TOKEN_STRING, TOKEN_CLOSED_SBRA):
                lexeme = lexeme.replace(r"\\", "\\")
                lexeme = lexeme.replace(r"\%", "%")
                string_chunks.append(lexeme)
                continue
            if string_chunks:
                text.append((TEXT_STRING, "".join(string_chunks)))
                string_chunks = []
            if token == TOKEN_META:
                if lexeme == "%j":
                    argument = self.read_sbra_id()
                    text.append((TEXT_META, lexeme, argument))
                else:
                    text.append((TEXT_META, lexeme))
                continue
            # Only tags are left; a tag ends the current text node.
            if text:
                script.append((SCRIPT_TEXT, tuple(text)))
                text = []
            if lexeme in self._PLAIN_TAGS:
                script.append((SCRIPT_TAG, lexeme))
            elif lexeme in ("\\0", "\\h"):
                script.append((SCRIPT_TAG, lexeme))
                scope = 0
            elif lexeme in ("\\1", "\\u"):
                script.append((SCRIPT_TAG, lexeme))
                scope = 1
            elif lexeme in ("\\s", "\\b"):
                argument = self.read_sbra_id()
                script.append((SCRIPT_TAG, lexeme, argument))
            elif lexeme[:2] in ("\\s", "\\b", "\\w"):
                # Short form with a single digit glued to the tag.
                num = lexeme[2]
                if lexeme[:2] == "\\s" and scope == 1:
                    # In scope 1 the short-form \s number is offset by 10.
                    num = str(int(num) + 10)
                script.append((SCRIPT_TAG, lexeme[:2], num))
            elif lexeme == "\\_w":
                argument = self.read_sbra_number()
                script.append((SCRIPT_TAG, lexeme, argument))
            elif lexeme in self._ID_TAGS:
                argument = self.read_sbra_id()
                script.append((SCRIPT_TAG, lexeme, argument))
            elif lexeme in self._TEXT_TAGS:
                argument = self.read_sbra_text()
                script.append((SCRIPT_TAG, lexeme, argument))
            elif lexeme == "\\n":
                # The bracketed argument of \n is optional.
                if self.tokens and self.tokens[0][0] == TOKEN_OPENED_SBRA:
                    argument = self.read_sbra_text()
                    script.append((SCRIPT_TAG, lexeme, argument))
                else:
                    script.append((SCRIPT_TAG, lexeme))
            elif lexeme == "\\URL":
                # One mandatory argument, then any number of pairs.
                arguments = [self.read_sbra_text()]
                while self.tokens and self.tokens[0][0] == TOKEN_OPENED_SBRA:
                    arguments.append(self.read_sbra_text())
                    arguments.append(self.read_sbra_text())
                script.append((SCRIPT_TAG, lexeme) + tuple(arguments))
            elif lexeme == "\\!":
                args = self.split_params(self.read_sbra_text())
                script.append((SCRIPT_TAG, lexeme) + tuple(args))
            elif lexeme == "\\q":
                if self.tokens and self.tokens[0][0] == TOKEN_OPENED_SBRA:
                    # \q[text,id]
                    args = self.split_params(self.read_sbra_text())
                    if len(args) != 2:
                        raise ParserError("wrong number of arguments",
                                          self.column, self.length)
                    if len(args[1]) != 1 or len(args[1][0][1]) == 0:
                        raise ParserError("syntax error (expected an ID)",
                                          self.column, self.length)
                    arg1 = args[0]
                    arg2 = args[1][0][1]
                    script.append((SCRIPT_TAG, lexeme, arg1, arg2))
                else:
                    # \qN[id][text]
                    arg1 = self.read_number()
                    arg2 = self.read_sbra_id()
                    arg3 = self.read_sbra_text()
                    script.append((SCRIPT_TAG, lexeme, arg1, arg2, arg3))
            elif lexeme == "\\_a":
                # \_a[id] opens an anchor, the next bare \_a closes it.
                if anchor is None:
                    anchor = self.column
                    script.append((SCRIPT_TAG, lexeme, self.read_sbra_id()))
                else:
                    anchor = None
                    script.append((SCRIPT_TAG, lexeme))
            else:
                raise ParserError("unknown tag (%s)" % lexeme,
                                  self.column, self.length)
        if anchor is not None:
            raise ParserError(r"syntax error (unbalanced \_a tag)", anchor)
        # Flush whatever text is still pending at the end of the script.
        if string_chunks:
            text.append((TEXT_STRING, "".join(string_chunks)))
        if text:
            script.append((SCRIPT_TEXT, tuple(text)))
        return script

    def read_number(self):
        """Consume a number token and return its lexeme."""
        token, number = self.next_token()
        if token != TOKEN_NUMBER:
            raise ParserError("syntax error (expected a number)",
                              self.column, self.length)
        return number

    def read_sbra_number(self):
        """Consume "[" Number "]" and return the number lexeme."""
        token, lexeme = self.next_token()
        if token != TOKEN_OPENED_SBRA:
            raise ParserError("syntax error (expected a square bracket)",
                              self.column, self.length)
        token, number = self.next_token()
        if token != TOKEN_NUMBER:
            raise ParserError("syntax error (expected a number)",
                              self.column, self.length)
        token, lexeme = self.next_token()
        if token != TOKEN_CLOSED_SBRA:
            raise ParserError("syntax error (expected a square bracket)",
                              self.column, self.length)
        return number

    def read_sbra_id(self):
        """Consume "[" ... "]" holding exactly one chunk; return its value.

        Raises ParserError when the brackets hold no chunk or several.
        """
        text = self.read_sbra_text()
        if len(text) != 1:
            raise ParserError("syntax error (expected a single ID)",
                              self.column, self.length)
        return text[0][1]

    def read_sbra_text(self):
        r"""Consume "[" ... "]" and return its content as a chunk tuple.

        Inside the brackets numbers, strings, "[" and tags are plain
        text (with \\, \% and \] unescaped); meta strings become
        TEXT_META chunks.  Raises ParserError if the opening bracket is
        missing or the script ends before the closing one.
        """
        token, lexeme = self.next_token()
        if token != TOKEN_OPENED_SBRA:
            raise ParserError("syntax error (expected a square bracket)",
                              self.column, self.length)
        text = []
        string_chunks = []
        while self.tokens:
            token, lexeme = self.next_token()
            if token in (TOKEN_NUMBER, TOKEN_STRING, TOKEN_OPENED_SBRA,
                         TOKEN_TAG):
                lexeme = lexeme.replace(r"\\", "\\")
                lexeme = lexeme.replace(r"\%", "%")
                lexeme = lexeme.replace(r"\]", "]")
                string_chunks.append(lexeme)
                continue
            if string_chunks:
                text.append((TEXT_STRING, "".join(string_chunks)))
                string_chunks = []
            if token == TOKEN_CLOSED_SBRA:
                break
            elif token == TOKEN_META:
                text.append((TEXT_META, lexeme))
            else:
                raise ParserError("syntax error (wrong type of argument)",
                                  self.column, self.length)
        else:
            # Ran out of tokens without seeing the closing bracket.
            raise ParserError("unexpected end of script",
                              self.column + self.length)
        return tuple(text)

    # One parameter: any mix of double-quoted runs and non-comma chars.
    re_param = re.compile('("[^"]*"|[^,])*')
    # A double-quoted run; group 1 is the content without the quotes.
    re_quote = re.compile('"([^"]*)"')

    def split_params(self, text):
        """Split a chunk tuple into comma separated parameters.

        `text` is a sequence of chunks as returned by read_sbra_text().
        Commas inside TEXT_STRING chunks separate the parameters unless
        they are enclosed in double quotes; the quotes are removed.
        Returns a list with one tuple of chunks per parameter.

        A parameter is closed only by a comma (or by the end of `text`),
        so it may span several chunks, e.g. meta strings interleaved
        with literal text.
        """
        params = []
        buf = []  # chunks of the parameter being collected
        for token, lexeme in text:
            if token != TEXT_STRING:
                # Non-string chunks never contain separators.
                if lexeme:
                    buf.append((token, lexeme))
                continue
            i = 0
            j = len(lexeme)
            while i < j:
                # re_param can match the empty string, so this always
                # succeeds and stops at a comma or at the end.
                match = self.re_param.match(lexeme, i)
                param = self.re_quote.sub(r"\1", match.group())
                # Keep an empty piece only when it is the whole parameter.
                if param or not buf:
                    buf.append((token, param))
                i = match.end()
                if i < j:
                    assert lexeme[i] == ","
                    params.append(tuple(buf))
                    buf = []
                    i += 1
        if buf:
            params.append(tuple(buf))
        return params

# Tests

# Sample scripts fed to test_tokenizer() and test_parser().  The list is
# split into three groups by the comments below.
testcases = [
    # Legal scripts: meant to be accepted by the parser.
    r"\s[4]ちゃんと選んでよう〜っ。\w8\uまあ、ユーザさんも忙しいんやろ‥‥\e",
    r"%selfnameと%keroname\e",
    r"エスケープのテスト \\, \%, [, ], \] どーかな?\e",
    r"\j[http://www.asahi.com]\e",
    r"\j[http://www.asahi.com/[escape\]/\%7Etest]\e",
    r"\j[http://www.asahi.com/%7Etest/]\e",
    r"\h\s[0]%usernameさんは今どんな感じ？\n\n\q0[#temp0][まあまあ]\q1[#temp1][今ひとつ]\z",
    r"\q0[#temp0][今日は%month月%day日だよ]\e",
    r"\q0[#cancel][行かない]\q1[http://www.asahi.com/%7Etest/][行く]\e",
    r"\q[テスト,test]\q[%month月%day日,date]\e",
    r"\q[テスト,http://www.asahi.com/]\e",
    r"\q[テスト,http://www.asahi.com/%7Etest/]\e",
    r"\h\s[0]%j[#temp0]\e",
    r"\URL[http://www.asahi.com/]\e",
    r"\URL[http://www.asahi.com/%7Etest/]\e",
    r"\URL[行かない][http://www.asahi.com/][トップ][http://www.asahi.com/%7Etest/][テスト]\e",
    r"\_s\s5\w44えんいー%c\e",
    r"\h%m?\e",
    r"\URL[http://www.foo.jp/%7Ebar/]",
    r"\b[0]\b[normal]\i[0]\i[eyeblink]",
    r"\c\x\t\_q\*\1\2\4\5\-\+\_+\a\__c\__t\_n",
    r"\_l[0,0]\_v[test.wav]\_V\_c[test]",
    r"\h\s0123\u\s0123\h\s1234\u\s1234",
    r"\s[-1]\b[-1]",
    r"\_u[0x0010]\_m[0x01]\&[Uuml]\&[uuml]",
    r"\n\n[half]\n",
    r"\![open,teachbox]\e",
    r'\![raise,OnUserEvent,"0,100"]\e',
    r'\![raise,"On"User"Event",%username,,"",a"","""","foo,bar"]\e',
    r'\_a[http://www.asahi.com/]Asahi.com\_a\_s\_a[test]foo\_a\e',
    r'\_a[test]%j[http://www.asahi.com]%hour時%minute分%second秒\_a',
    r"\![raise,OnWavePlay,voice\hello.mp3]\e",
    r"\q[Asahi.com,新聞を読む]",
    r"\j[\s4]\e",
    # Malformed but tolerated: a stray "%" or an unknown tag is an error
    # in "strict" mode and only a warning in "loose" mode.
    r"20%終了 (%hour時%minute分%second秒)",
    r"\g",
    # Malformed scripts: meant to be rejected with a ParserError.
    r"\j[http://www.asahi",
    r"\s\e",
    r"\j4\e",
    r"\q0[#temp0]\e",
    r"\q[test]\e",
    r"\q[foo,bar,test]\e",
    r"\q[起動時間,%exh時間]\e",
    r"\q[,]\e",
    r"\URL[しんぶーん][http://www.asahi.com/]\e",
    r"\_atest\_a",
    r"\_a[test]",
    ]

def test_tokenizer():
    """Print the token list produced for every entry of `testcases`.

    Uses the single-argument print(...) form and `except ... as`, which
    behave the same on Python 2.6+ and Python 3.
    """
    parser = Parser()
    for test in testcases:
        try:
            print(parser.tokenize(test))
        except ParserError as e:
            print(e)

def test_parser(error="strict"):
    """Parse every entry of `testcases` and print the result.

    `error` is the error scheme handed to Parser ("strict" or "loose").
    For each script the source is printed, followed by either its
    script tree or, when parsing fails, a caret line marking the
    reported position and the error message.
    """
    parser = Parser(error)
    for test in testcases:
        print("*" * 60)
        print(test)
        try:
            print_script_tree(parser.parse(test))
        except ParserError as e:
            # column/length may be None on a ParserError; fall back to
            # a single caret at the start instead of raising TypeError.
            print(" " * (e.column or 0) + "^" * (e.length or 1))
            print(e)

def print_script_tree(tree):
    """Print a script tree, as returned by Parser.parse(), to stdout.

    Each tag is printed with its arguments numbered from 1.  An
    argument that is a tuple of text chunks is expanded with
    print_text(); any other argument is printed as a string.  Testing
    for the tuple (rather than for the exact `str` type) keeps unicode
    arguments from being mistaken for text tuples on Python 2.
    """
    for node in tree:
        kind = node[0]
        if kind == SCRIPT_TAG:
            name, args = node[1], node[2:]
            print("TAG %s" % (name,))
            for n, arg in enumerate(args):
                if isinstance(arg, tuple):
                    print("\tARG#%d\tTEXT" % (n + 1))
                    print_text(arg, 2)
                else:
                    print("\tARG#%d\t%s" % (n + 1, arg))
        elif kind == SCRIPT_TEXT:
            print("TEXT")
            print_text(node[1], 1)

def print_text(text, indent):
    """Print the chunks of a text tuple, one per line, to stdout.

    `text` is a sequence of (TEXT_STRING, string) and
    (TEXT_META, name, arg, ...) chunks; `indent` is the number of
    leading tabs.  Arguments of a meta string are numbered from 1 and
    indented one level deeper.
    """
    prefix = "\t" * indent
    for chunk in text:
        if chunk[0] == TEXT_STRING:
            print(prefix + 'STRING\t"%s"' % (chunk[1],))
        elif chunk[0] == TEXT_META:
            name, args = chunk[1], chunk[2:]
            print(prefix + "META\t" + name)
            for n, arg in enumerate(args):
                print(prefix + "\tARG#%d\t%s" % (n + 1, arg))

# Command line driver for the self tests:
#   script.py tokenizer       -- dump the tokens of every test case
#   script.py parser SCHEME   -- parse every test case (strict or loose)
if __name__ == "__main__":
    import os
    import sys
    if len(sys.argv) == 2 and sys.argv[1] == "tokenizer":
        test_tokenizer()
    elif len(sys.argv) == 3 and sys.argv[1] == "parser":
        test_parser(sys.argv[2])
    else:
        # Single formatted argument so the output is the same with the
        # Python 2 print statement and the Python 3 print function.
        print("Usage: %s [tokenizer|parser [strict|loose]]"
              % os.path.basename(sys.argv[0]))

# Syntax of Sakura Script tags (partial list; Parser.parse accepts more tags):
#   "\e"
#   "\h"
#   "\u"
#   "\s" OpenedSbra ID ClosedSbra
#   "\s" Digit
#   "\b" OpenedSbra ID ClosedSbra
#   "\b" Digit
#   "\n" (OpenedSbra Text ClosedSbra)?
#   "\w" Number
#   "\_w" OpenedSbra Number ClosedSbra
#   "\j" OpenedSbra ID ClosedSbra
#   "\c"
#   "\x"
#   "\t"
#   "\_q"
#   "\_s"
#   "\_n"
#   "\q" Number OpenedSbra ID ClosedSbra OpenedSbra Text ClosedSbra
#   "\q" OpenedSbra Text "," ID ClosedSbra
#   "\z"
#   "\y"
#   "\*"
#   "\v"
#   "\8" OpenedSbra ID ClosedSbra
#   "\m" OpenedSbra ID ClosedSbra
#   "\i" OpenedSbra ID ClosedSbra
#   "\_e"
#   "\a"
#   "\!" OpenedSbra Text ClosedSbra
#   "\_c" OpenedSbra Text ClosedSbra
#   "\__c"
#   "\URL" OpenedSbra Text ClosedSbra [ OpenedSbra Text ClosedSbra OpenedSbra Text ClosedSbra ]*
#   "\&" OpenedSbra ID ClosedSbra
#   "\_u" OpenedSbra ID ClosedSbra
#   "\_m" OpenedSbra ID ClosedSbra
#   "\_a" OpenedSbra ID ClosedSbra Text "\_a"
