#
# referenciate.rb:
# a library to markup text with HTML references
#
# $Id: ucs-transformation.rb,v 1.6 2007/01/26 03:15:17 zunda Exp $
#
# Copyright:: Copyright (C) 2007 zunda <zunda at freeshell.org>
# License:: GPL
#

module EntityReference
	# Conversions between single UTF-8 encoded characters and their UCS
	# code points. All methods are available as module functions, e.g.
	# EntityReference::UcsTransformation.utf8_to_ucs(str).
	module UcsTransformation
		# Raised for malformed UTF-8 input and for code points that are
		# rejected by the range check.
		class UcsTransformationError < StandardError; end

		# Raises UcsTransformationError unless +n+ is an acceptable code
		# point. Rejected are: negative values, the UTF-16 surrogate range
		# U+D800..U+DFFF, the values U+FFFE and U+FFFF, and anything above
		# U+10FFFF. Returns nil otherwise.
		def raise_if_invalid_ucs(n)
			surrogate = 0xd800 <= n && n <= 0xdfff
			# NOTE(review): the original upper bound read `0xdffe <= n`, which
			# also rejected U+E000..U+FFFD; it is assumed to have been a typo
			# for the U+FFFE/U+FFFF pair.
			reserved = 0xfffe == n || 0xffff == n
			if n < 0 || surrogate || reserved || 0x10ffff < n
				raise UcsTransformationError, "UCS code out of range"
			end
		end
		module_function :raise_if_invalid_ucs

		# Decodes +char+, a string holding exactly one UTF-8 encoded
		# character, and returns its code point as an Integer.
		#
		# When the first byte is ASCII it is returned as is, without looking
		# at any following bytes. Sequences of up to six bytes are decoded.
		# With +secure+ set, the decoded value is additionally passed through
		# raise_if_invalid_ucs.
		#
		# Raises UcsTransformationError when the input is empty, has an
		# invalid first byte, has too few or too many bytes for the length
		# announced by the first byte, contains a byte that is not a
		# continuation byte, or uses an overlong encoding.
		def utf8_to_ucs(char, secure = true)
			# Work on raw bytes: String#[] with an Integer index returns a
			# one-character String on Ruby 1.9 and later, not a byte value.
			bytes = char.unpack('C*')
			if bytes.empty?
				raise UcsTransformationError, "UTF-8 bytes too short"
			end
			return bytes[0] if 0x7f >= bytes[0]	# ASCII

			uc_bits = 0b00011111	# payload bits of the first byte
			tf_bits = 0b11100000	# bits of the first byte holding the length flag
			tf_flag = 0b11000000	# expected value of the length flag
			utf8_length = 2

			# try longer and longer sequences until the flag matches
			until tf_flag == bytes[0] & tf_bits
				uc_bits = uc_bits >> 1
				if 0 == uc_bits
					raise UcsTransformationError, "Invalid first byte of a UTF-8 character"
				end
				tf_bits = 0b10000000 | (tf_bits >> 1)
				tf_flag = 0b10000000 | (tf_flag >> 1)
				utf8_length += 1
			end
			if utf8_length > bytes.length
				raise UcsTransformationError, "UTF-8 bytes too short"
			end
			if utf8_length < bytes.length
				raise UcsTransformationError, "UTF-8 bytes too long"
			end

			result = uc_bits & bytes.shift
			bytes.each do |byte|
				# every following byte has to look like 10xxxxxx
				unless 0b10000000 == byte & 0b11000000
					raise UcsTransformationError, "Malformed UTF-8 bytes"
				end
				result = (result << 6) | (0b00111111 & byte)
			end

			# Reject overlong forms: the value must not fit into a shorter
			# sequence. Shorter sequences carry 7, 11, 16, 21 and 26 bits.
			if 2 == utf8_length
				min_code = 0x80
			else
				min_code = 1 << (5 * utf8_length - 4)
			end
			if result < min_code
				raise UcsTransformationError, "Malformed UTF-8 bytes"
			end

			raise_if_invalid_ucs(result) if secure
			return result
		end
		module_function :utf8_to_ucs

		# Encodes the code point +number+ and returns the resulting string.
		#
		# Values up to 0x7F are returned through Integer#chr. Larger values
		# are encoded into two to six bytes; the result is tagged as UTF-8
		# where strings carry an encoding. With +secure+ set, +number+ is
		# first passed through raise_if_invalid_ucs.
		#
		# Raises UcsTransformationError when +number+ needs more than six
		# bytes.
		def ucs_to_utf8(number, secure = true)
			raise_if_invalid_ucs(number) if secure
			return number.chr if 0x7F >= number	# ASCII

			uc_bits = 0b00011111	# bits in the first byte to be used for character
			tf_flag = 0b11000000	# bits in the first byte to show length
			cur_number = number
			tail = []	# continuation bytes, most significant first
			loop do
				tail.unshift(0x80 | (cur_number & 0x3f))
				cur_number = cur_number >> 6
				# Done as soon as the rest fits into the first byte. The whole
				# remainder is compared, not just a few of its bits, so that
				# higher bits can not be silently dropped.
				break if cur_number <= uc_bits

				uc_bits = uc_bits >> 1
				if 0 == uc_bits
					raise UcsTransformationError, "Too many bytes for UTF-8"
				end
				tf_flag = 0b10000000 | (tf_flag >> 1)
			end

			result = ([tf_flag | cur_number] + tail).pack('C*')
			# String#force_encoding does not exist before Ruby 1.9
			result.force_encoding('UTF-8') if result.respond_to?(:force_encoding)
			return result
		end
		module_function :ucs_to_utf8
	end
end
