#!/usr/local/bin/python

# gen-jis-table.py (c) 2008-2011 exeal
#
# Generates the mapping tables between JIS and UCS and writes them to
# src/encodings/generated/jis.ipp. Two source files are required:
# - jisx0213-2004-std.txt (from http://x0213.org/codetable/index.en.html)
# - JIS0212.txt (from Unicode.org)

import sys
import datetime
import re
import os

NATIVE_REPLACEMENT_BYTES = '0x0000'
UNICODE_REPLACEMENT_CHARACTER = '0xfffd'

def reverse_dictionary(d):
	"""Return a new dictionary mapping every value of d back to its key.

	d itself is left untouched. When several keys share the same value, the
	key visited last while iterating over d is the one that is kept.
	"""
	return dict((value, key) for key, value in d.items())

def dump_table(out, table, type_string, replacement):
	"""Write the body of a 256-entry, two-level lookup table for table to out.

	One entry is written per high byte 0x00-0xFF, separated by commas:
	- '0' when table holds no key with that high byte;
	- otherwise '<type_string>Wire<...>::VALUES', made of 16 rows of 16 cells.
	  A row without any key is written as 'Empty<type_string>Line'; any other
	  row is '<type_string>Line<...>' listing each cell as '0x%04x' of the
	  mapped value, or replacement where the key is absent.
	Three trailing comment lines report how many keys were written and the
	range of high and low bytes seen.

	out           -- object with a write(str) method
	table         -- dictionary keyed by integers; only keys 0x0000-0xFFFF are looked up
	type_string   -- prefix of the emitted 'Wire' / 'Line' names
	replacement   -- text written in place of a missing cell
	"""
	total = 0
	lead_first, lead_last, trail_first, trail_last = 0xff, 0x00, 0xff, 0x00
	for high in range(0x100):
		base = high << 8
		# low bytes that are mapped under this high byte, in ascending order
		hits = [low for low in range(0x100) if (base | low) in table]
		if not hits:
			out.write('0')
		else:
			total += len(hits)
			if high != 0x00:
				# entries under high byte 0x00 are counted but excluded from the reported ranges
				lead_first, lead_last = min(high, lead_first), max(high, lead_last)
				trail_first, trail_last = min(hits[0], trail_first), max(hits[-1], trail_last)
			occupied_rows = set(low >> 4 for low in hits)
			rows = []
			for row in range(16):
				if row not in occupied_rows:
					rows.append('\tEmpty' + type_string + 'Line')
					continue
				cells = []
				for low in range(row << 4, (row << 4) + 16):
					code = base | low
					if code in table:
						cells.append('0x%04x' % table[code])
					else:
						cells.append(replacement)
				rows.append('\t' + type_string + 'Line<' + ','.join(cells) + '>')
			out.write(type_string + 'Wire<\n' + ',\n'.join(rows) + '\n>::VALUES')
		# every entry but the last one is followed by a comma
		out.write(',\n' if high != 0xff else '\n')

	# report number of characters
	out.write('// # of characters : %d\n' % total)
	# report bytes range
	out.write('// lead octet range : [\\x%02X-\\x%02X]\n' % (lead_first, lead_last))
	out.write('// trail octet range : [\\x%02X-\\x%02X]\n' % (trail_first, trail_last))

def remove_bmp(table):
	"""Return a copy of table restricted to keys at or above 0x20000.

	Keys below 0x20000 are dropped. Keys from 0x20000 through 0x10ffff are
	rebased by subtracting 0x20000, and keys above 0x10ffff are carried over
	unchanged. table itself is not modified.
	"""
	rebased = {}
	for code, value in table.items():
		if code < 0x20000:
			continue
		target = code if code > 0x10ffff else code - 0x20000
		rebased[target] = value
	return rebased


# exactly two arguments are expected: the paths of the two source tables
if len(sys.argv) != 3:
	# sys.exit() is used rather than the exit() helper, which is only
	# installed when the site module has been loaded
	sys.exit('usage: python gen-jis-table.py <jisx0213-2004-std.txt> <JIS0212.TXT>')

# one dictionary per character set, each filled as JIS code -> UCS value
x0208_to_ucs, x0212_to_ucs, x0213_p1_to_ucs, x0213_p2_to_ucs = {}, {}, {}, {}

# read the JIS X 0208 / JIS X 0213 mappings from the first input file
basic_pattern = re.compile(r'^([34])\-([0-9A-Fa-f]{4})\tU\+([0-9A-Fa-f]{4,5})')
second_pattern = re.compile(r'^\+([0-9A-Fa-f]{4,5})')
fullwidth_pattern = re.compile(r'Fullwidth:\s?U\+([0-9A-Fa-f]{4,5})')
# open() is inside the try block so that a missing or unreadable file is
# reported with the same message as a failure while reading; only I/O errors
# are caught, anything else propagates with its traceback
try:
	with open(sys.argv[1]) as source:
		for line in source:
			m = basic_pattern.search(line)
			if m is None:
				continue
			# obtain plane, JIS value, first UCS value
			# (a leading '3-' yields plane 1, a leading '4-' yields plane 2)
			plane = int(m.group(1)) - 2
			jis = int(m.group(2), 16)
			ucs = int(m.group(3), 16)
			is_x0213 = plane == 2
			# obtain additional information
			tail = line[m.end():]
			if not is_x0213:
				# a plane 1 line carrying '[200' in its remainder is filed under X 0213
				# NOTE(review): presumably an edition marker such as '[2000]' / '[2004]' -- confirm
				is_x0213 = tail.find('[200') != -1
			m = fullwidth_pattern.search(tail)
			if m is not None:
				ucs = int(m.group(1), 16)	# 'Fullwidth: U+xxxx'
			elif is_x0213:
				m = second_pattern.search(tail)
				if m is not None:
					# two code points are packed as (first << 16) | second
					ucs = ucs << 16 | int(m.group(1), 16)	# 'U+xxxx+xxxx'
			# write into the table
			if not is_x0213:
				x0208_to_ucs[jis] = ucs
			elif plane == 1:
				x0213_p1_to_ucs[jis] = ucs
			else:
				x0213_p2_to_ucs[jis] = ucs
except (IOError, OSError):
	sys.exit('error: I/O error occured during reading %s.' % sys.argv[1])

# read the JIS X 0212 mappings from the second input file; every line that
# starts with '0xHHHH<TAB>0xHHHH' contributes one JIS code -> UCS entry
basic_pattern = re.compile(r'^0x([0-9A-Fa-f]{4})\t0x([0-9A-Fa-f]{4})')
# open() is inside the try block so that a missing or unreadable file is
# reported with the same message as a failure while reading; only I/O errors
# are caught, anything else propagates with its traceback
try:
	with open(sys.argv[2]) as source:
		for line in source:
			m = basic_pattern.search(line)
			if m is not None:
				x0212_to_ucs[int(m.group(1), 16)] = int(m.group(2), 16)
except (IOError, OSError):
	sys.exit('error: I/O error occured during reading %s.' % sys.argv[2])

# create the output file (path is relative to the current directory) and write its header
out = open('../src/encodings/generated/jis.ipp', 'w')
out.write('// jis.ipp\n')
out.write('// automatically generated by gen-jis-table.py at %s\n\n' % datetime.datetime.now())

# reversed (UCS -> JIS) views of the dictionaries filled while reading the inputs
ucs_to_x0212 = reverse_dictionary(x0212_to_ucs)
ucs_to_x0208 = reverse_dictionary(x0208_to_ucs)
all_ucs_to_x0213_p1 = reverse_dictionary(x0213_p1_to_ucs)
all_ucs_to_x0213_p2 = reverse_dictionary(x0213_p2_to_ucs)
# these two names are read again by the code that follows the table output
ucs_to_x0213_p1 = remove_bmp(all_ucs_to_x0213_p1)
ucs_to_x0213_p2 = remove_bmp(all_ucs_to_x0213_p2)

# one tuple per generated table:
# (progress message, opening line of the array, mapping, type prefix, filler for unmapped cells)
sections = [
	('generating JIS X 0212:1990 to UCS mapping table...',
		'\nconst Char** JIS_X_0212_TO_UCS[256] = {\n',
		x0212_to_ucs, 'Char', UNICODE_REPLACEMENT_CHARACTER),
	('generating UCS to JIS X 0212:1990 mapping table...',
		'\nconst uint16_t** UCS_TO_JIS_X_0212[256] = {\n',
		ucs_to_x0212, 'DBCS', NATIVE_REPLACEMENT_BYTES),
	('generating JIS X 0208:1997 to UCS mapping table...',
		'const Char** const JIS_X_0208_TO_UCS[256] = {\n',
		x0208_to_ucs, 'Char', UNICODE_REPLACEMENT_CHARACTER),
	('generating UCS to JIS X 0208:1997 mapping table...',
		'const uint16_t** const UCS_TO_JIS_X_0208[256] = {\n',
		ucs_to_x0208, 'DBCS', NATIVE_REPLACEMENT_BYTES),
	('generating JIS X 0213:2004 plane 1 to UCS mapping table...',
		'const CodePoint** const JIS_X_0213_PLANE_1_TO_UCS[256] = {\n',
		x0213_p1_to_ucs, 'CodePoint', UNICODE_REPLACEMENT_CHARACTER),
	('generating UCS BMP to JIS X 0213:2004 plane 1 mapping table...',
		'const uint16_t** const UCS_BMP_TO_JIS_X_0213_PLANE_1[256] = {\n',
		all_ucs_to_x0213_p1, 'DBCS', NATIVE_REPLACEMENT_BYTES),
	('generating UCS SIP to JIS X 0213:2004 plane 1 mapping table...',
		'const uint16_t** const UCS_SIP_TO_JIS_X_0213_PLANE_1[256] = {\n',
		ucs_to_x0213_p1, 'DBCS', NATIVE_REPLACEMENT_BYTES),
	('generating JIS X 0213:2004 plane 2 to UCS mapping table...',
		'const CodePoint** const JIS_X_0213_PLANE_2_TO_UCS[256] = {\n',
		x0213_p2_to_ucs, 'CodePoint', UNICODE_REPLACEMENT_CHARACTER),
	('generating UCS BMP to JIS X 0213:2004 plane 2 mapping table...',
		'const uint16_t** const UCS_BMP_TO_JIS_X_0213_PLANE_2[256] = {\n',
		all_ucs_to_x0213_p2, 'DBCS', NATIVE_REPLACEMENT_BYTES),
	('generating UCS SIP to JIS X 0213:2004 plane 2 mapping table...',
		'const uint16_t** const UCS_SIP_TO_JIS_X_0213_PLANE_2[256] = {\n',
		ucs_to_x0213_p2, 'DBCS', NATIVE_REPLACEMENT_BYTES),
]
for message, opening, mapping, type_prefix, filler in sections:
	print(message)
	out.write(opening)
	dump_table(out, mapping, type_prefix, filler)
	out.write('};\n\n')

# double UCS to JIS X 0213 mappings
# keys above 0x10ffff are two code points packed as (first << 16) | second;
# each one is listed in a comment line, and the distinct first halves are
# collected into the LEADING_BYTES_TO_JIS_X_0213 array
out.write('// the following mappings can\'t include the mapping tables.\n')
leading_bytes = set()
for plane, mapping in ((1, ucs_to_x0213_p1), (2, ucs_to_x0213_p2)):
	# sorted() accepts any iterable; keys().sort() only works where keys() returns a list
	for ucs in sorted(mapping):
		if ucs > 0x10ffff:
			leading_bytes.add(ucs >> 16)
			out.write('// <U+%04X, U+%04X> => %d-0x%04X\n' % (ucs >> 16, ucs & 0xffff, plane, mapping[ucs]))
leading_bytes = sorted(leading_bytes)
out.write('const Char LEADING_BYTES_TO_JIS_X_0213[] = {')
for c in leading_bytes:
	out.write('0x%04x,' % c)
out.write('};\n')

out.close()
print('** completed **')
