#!/usr/bin/ruby
#
# blazer-jp-proxy: a proxy server for Palm/Blazer
#
# $Id: blazer-ja-proxy,v 1.25 2006/04/18 22:19:33 zunda Exp $
#
# Copyright:: Copyright (C) 2006 zunda <zunda at freeshell.org>
# License:: GPL
#

require 'webrick'
require 'webrick/httpproxy'

require 'nkf'
require 'uri'
require 'zlib'
require 'stringio'
require 'thread'
require 'cgi'

require 'socket'

# Settings used when this file is executed directly as a script.
if $0 == __FILE__
	# address and port the proxy listens on
	bind_address, bind_port = Socket.gethostname, 8080

	# Credentials for proxy authentication. The password is stored as a
	# crypt(3) hash; a new one can be generated with:
	#   salt = [rand(64),rand(64)].pack("C*").tr("\x00-\x3f","A-Za-z0-9./")
	#   "((%password%))".crypt(salt)
	proxy_user_name, proxy_user_pass = 'palm', 'shSIlTjodlY/6'

	# passed to the proxy as its gzip flag
	gzip = true

	# file names for the development log and the server log (none by default)
	devel_log = server_log = nil
end

module BlazerProxy

	#
	# configuration file
	#
	# raised when a configuration file contains a keyword which is not allowed
	class ConfigurationError < StandardError; end

	# Settings taken from the first configuration file that exists.
	#
	# Each line of a file has the form `keyword: value'. Keywords are matched
	# case-insensitively against the keys of the default table and are stored
	# in lower case; a keyword which is not in the table raises
	# ConfigurationError. A keyword with an empty value is set to nil and
	# blank lines are skipped. Values read from a file are kept as strings.
	class Configuration
		# path (as given) of the file which was read, or nil if none was found
		attr_reader :path

		@@defaults = {
			'bind address' => Socket.gethostname,
			'bind port' => 8080,
			'proxy username' => 'palm',
			'proxy userpass' => 'shSIlTjodlY/6',
			# salt = [rand(64),rand(64)].pack("C*").tr("\x00-\x3f","A-Za-z0-9./")
			# "((%password%))".crypt(salt)
			'gzip' => true,
			'devel log' => nil,
			'server log' => nil,
		}

		# candidate files, tried in this order
		@@paths = [
			'/etc/blazer-ja-proxy.conf',
			'~/.blazer-ja-proxy.conf',
			'./blazer-ja-proxy.conf',
		]

		# returns the value for the lower-case keyword +entry+
		def [](entry)
			@confs[entry]
		end

		# reads the first existing file among +paths+
		def initialize(paths = @@paths)
			# start from the defaults so that they apply even when no file exists
			@confs = @@defaults.dup
			_read(paths)
		end

		# parses the first file in +paths+ which can be opened and remembers
		# its path; files which do not exist are skipped
		def _read(paths = @@paths)
			@path = nil
			paths.each do |p|
				begin
					File.open(File.expand_path(p)) do |f|
						_parse(f)
					end
					@path = p
					break
				rescue Errno::ENOENT
					# not there: try the next candidate
				end
			end
		end

		# replaces the current settings with the defaults overridden by the
		# entries in +file+; raises ConfigurationError for an unknown keyword
		def _parse(file)
			@confs = @@defaults.dup
			file.each_line do |l|
				next if l.strip.empty?
				k, v = l.chomp.split(/\s*:\s*/, 2)
				# the keyword is looked up and stored in lower case so that
				# `Bind Port' really overrides the default for 'bind port'
				key = k.to_s.downcase
				unless @confs.has_key?(key) then
					raise ConfigurationError, "#{file.path}:#{file.lineno}: keyword `#{k}' not allowed"
				end
				if v and not v.empty? then
					@confs[_untaint(key)] = _untaint(v)
				else
					@confs[_untaint(key)] = nil
				end
			end
		end

		# untaints +str+ on rubies which still have String#untaint
		# (the method is missing from recent releases)
		def _untaint(str)
			str.respond_to?(:untaint) ? str.untaint : str
		end
		private :_untaint

	end

	#
	# character encodings
	#
	# patterns of charset names and the NKF::* const each of them stands for
	CHARSET_TO_NKF = {
		# \d+ so that two-digit parts such as iso-8859-15 are also accepted
		/\Aiso-8859-\d+\Z/i => NKF::ASCII,
		/\Aeuc-jp\Z/i => NKF::EUC,
		/\Ax-sjis\Z/i => NKF::SJIS,
		/\Ashift[_-]jis\Z/i => NKF::SJIS,
		/\Aiso-2022-jp\Z/i => NKF::JIS,
		/\Autf-8\Z/i => NKF::UTF8,
		/\Autf-16\Z/i => NKF::UTF16,
	}.freeze

	# returns NKF::* const for charset string, or nil when the name is not
	# known (or is nil)
	def self::charset_to_nkf(charset_string)
		CHARSET_TO_NKF.each_pair do |regex, result|
			return result if regex =~ charset_string
		end
		nil
	end

	# charset names for the NKF::* consts; nil for those which have no name
	NKF_TO_CHARSET = {
		NKF::ASCII => 'ISO-8859-1',
		NKF::EUC => 'EUC-JP',
		NKF::JIS => 'ISO-2022-JP',
		NKF::SJIS => 'Shift-JIS',
		NKF::UTF8 => 'UTF-8',
		NKF::UTF16 => 'UTF-16',
		NKF::BINARY => nil,
		nil => nil,
	}.freeze

	# returns a string showing charset (nil for NKF::BINARY and nil);
	# raises RuntimeError for a value which is not in the table
	def self::nkf_to_charset(nkf_const)
		NKF_TO_CHARSET.fetch(nkf_const) do
			raise RuntimeError, "Unknown NKF:: constant #{nkf_const}"
		end
	end

	# returns NKF::* const for content type
	#
	# The first charset parameter decides the result (nil when its value is
	# not a known charset). Without a charset parameter NKF::BINARY is
	# returned for image types and nil for anything else, including nil.
	def self::content_type_to_nkf(content_type)
		if content_type then
			content_type.split(/\s*;\s*/).each do |p|
				name, value = p.split(/\s*=\s*/)
				next unless value and /\Acharset\z/i =~ name.strip
				# the value may be written as a quoted string: charset="utf-8"
				return charset_to_nkf(value.strip.sub(/\A(["'])(.*)\1\z/, '\2'))
			end
			#return charset_to_nkf('iso-8859-1') if /\Atext/i =~ content_type
			# violation of RFC-2616 Section 3.7.1 but we want to look in the body
			return NKF::BINARY if /\Aimage/i =~ content_type
		end
		return nil
	end

	# detects multibyte chars
	REGEXP_MULTIBYTE = Regexp.union(
		/[\x81-\x9f\xe0-\xef][\x40-\x7e\x80-\xfc]/n,	# shift-jis
		/[\xa1-\xfe][\xa1-\xfe]/n,	# euc-jp
		/[\xc0-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf][\x80-\xbf]/n	# utf-8
	)	# Ruby recipe book
	REGEXP_7BIT_JIS = /\x1b\$B|\x1b\$\(D|\x0e\x1b\(I/	# jiskanji (5)

	# returns a true value if +string+ seems to have multibyte characters
	# (Shift-JIS, EUC-JP, UTF-8 or 7 bit JIS), nil otherwise
	def self::have_multibyte?(string)
		return nil if string.nil?
		# The patterns above are byte patterns: match them against a binary
		# copy so that strings tagged with another encoding can be examined.
		# String#b is missing on old rubies where strings are plain bytes.
		bytes = string.respond_to?(:b) ? string.b : string
		REGEXP_MULTIBYTE =~ bytes || REGEXP_7BIT_JIS =~ bytes
	end

	# converts a string
	#
	# +string+ is returned as it is when it has no multibyte characters or
	# when +charset_out+ is not a charset nkf is asked to produce here.
	# +charset_in+ adds an input option when it is one of the charsets
	# listed below; MIME decoding is always turned off.
	def self::convert(string, charset_out, charset_in = nil)
		return string unless have_multibyte?(string)

		# output code
		output = [
			[NKF::EUC, '-e'],
			[NKF::SJIS, '-s'],
			[NKF::JIS, '-j'],
			[NKF::UTF8, '-w'],
			[NKF::UTF16, '-w16'],
		].find{|const, _flag| const === charset_out}
		return string unless output

		# input code
		input = [
			[NKF::EUC, '-E'],
			[NKF::SJIS, '-S'],
			[NKF::JIS, '-J'],
			[NKF::UTF8, '-W'],
			[NKF::UTF16, '-W16'],
		].find{|const, _flag| const === charset_in}

		flags = [output[1]]
		flags.push(input[1]) if input
		# no mime decoding
		flags.push('-m0')

		# convert
		NKF.nkf(flags.join(' '), string)
	end

	# converts the names and values of a URL encoded string (a query or a
	# form body) to +charset_out+
	#
	# The separators (& and ;) and the order of the parameters are kept.
	# A component which the conversion leaves unchanged is passed on exactly
	# as it was received instead of being unescaped and escaped again, so
	# that requests needing no conversion are not rewritten.
	def self::convert_urlencoded(str, charset_out)
		# HTTPUtils::parse_query can't be used. Sequence has to be conserved.
		reencode = lambda{|raw|
			plain = CGI.unescape(raw)
			converted = BlazerProxy::convert(plain, charset_out)
			converted == plain ? raw : CGI.escape(converted)
		}
		str.split(/([&;])/).map{|chunk|
			if /\A[&;]\z/ =~ chunk then
				chunk
			elsif m = chunk.match(/=/) then
				[m.pre_match, m.post_match].map{|q| reencode.call(q)}.join('=')
			else
				reencode.call(chunk)
			end
		}.join('')
	end

	CRLF = "\x0d\x0a"

	# converts the text/plain parts of a multipart/form-data body to
	# +charset_out+
	#
	# +str+ is split at the lines made of +boundary+; for each part the
	# content-type is looked up in its header (text/plain when missing) and
	# only the bodies of text/plain parts are converted. Everything else,
	# including the boundary lines and their order, is returned as it was.
	def self::convert_form_data(str, boundary, charset_out)
		# HTTPUtils::parse_form_data can't be used. Sequence has to be conserved.
		# The boundary is arbitrary text from the request: escape it so that
		# characters such as + or ? are not taken as regexp operators.
		delimiter = /^(--#{Regexp.escape(boundary)}(?:--)?#{CRLF})/
		str.split(delimiter).map{|chunk|
			m = chunk.match(/#{CRLF}#{CRLF}/)
			if m then
				head = m.pre_match
				body = m.post_match

				# extract content-type
				headers = head.split(/#{CRLF}/).map{|l| l.split(/:\s*/, 2)}
				if a = headers.find{|e| /\Acontent-type\z/i =~ e[0]} then
					content_type = a[1]
				else
					content_type = 'text/plain'
					# RFC2388: content type defaults to text/plain
				end

				# convert
				case content_type
				when /text\/plain/i
					body = BlazerProxy::convert(body, charset_out)
					chunk = head + CRLF + CRLF + body
				end
			end
			chunk
		}.join('')
	end

	#
	# URL
	#

	# returns canonical URL for URI: the normalized URI as a string with
	# its query, opaque part and fragment removed. nil is returned for nil
	# and for a URI which has userinfo or a registry part. The argument
	# itself is not changed.
	def self::canonical_url(uri)
		return nil unless uri
		return nil if uri.userinfo || uri.registry

		stripped = uri.dup
		[:query=, :opaque=, :fragment=].each{|setter| stripped.send(setter, nil)}
		stripped.normalize.to_s
	end

	#
	# registry of character encodings
	#

	# Remembers which charset was seen for which URL.
	#
	# Entries are kept under the canonical URL (BlazerProxy::canonical_url)
	# as a String. URLs come from the network, so they are deliberately not
	# turned into Symbols: each new Symbol would stay in the symbol table
	# even after the entry itself has been dropped from the registry.
	class CharsetRegistry
		# charsets worth remembering; anything else is ignored by []=
		@@need_register = {
			NKF::ASCII => true,
			NKF::EUC => true,
			NKF::JIS => true,
			NKF::SJIS => true,
			NKF::UTF8 => true,
			NKF::UTF16 => true,
		}

		# Hash from canonical URL (String) to NKF::* const
		attr_reader :registry

		# remembers charsets for latest about max URLs
		def initialize(max = 1000)
			@registry = Hash.new
			@sequence = Array.new	# canonical URLs, oldest first
			@max = max
		end

		# returns the charset recorded for +uri+, or nil if there is none or
		# the URI has no canonical URL
		def [](uri)
			canonical = BlazerProxy::canonical_url(uri)
			canonical ? @registry[canonical] : nil
		end

		# records +charset+ for +uri+ and returns it; returns nil without
		# recording when the URI has no canonical URL or the charset is not
		# one of those to be remembered
		def []=(uri, charset)
			canonical = BlazerProxy::canonical_url(uri)
			return nil unless canonical and charset and @@need_register[charset]

			# register, moving the URL to the newest position
			@sequence.delete(canonical) if @registry.has_key?(canonical)
			@registry[canonical] = charset
			@sequence.push(canonical)

			# clean up old entries: once over max, keep the newest half
			if @sequence.size > @max then
				halfmax = @max / 2
				while @sequence.size > halfmax
					@registry.delete(@sequence.shift)
				end
			end

			charset
		end

	end

	#
	# proxy instance
	#
	# A proxy server built on WEBrick::HTTPProxyServer which converts the
	# charset of what passes through it.
	#
	# Requests are converted to the charset recorded for the referring page
	# or for the requested URL. Responses of HTML, XHTML and plain text are
	# decoded, converted to the charset of the browser and, if asked for,
	# compressed with gzip; their charsets are recorded on the way.
	class Proxy
		# Proc given to WEBrick as :ProxyAuthProc; nil when no password is set
		attr_reader :auth_proc

		# bind_address, bind_port:: given to WEBrick as :BindAddress and :Port
		# proxy_user_name, proxy_user_pass:: user name and crypt(3) hash of the
		#   password for proxy basic authentication; no authentication is set
		#   up when the password is nil
		# gzip:: compress converted responses when true
		# devel_log_filename:: file to dump requests and responses into,
		#   opened for writing here; nil for no dump
		# server_log_filename, server_log_depth:: file and level for
		#   WEBrick::Log; with a nil file name :Logger is left nil
		# proxy_via:: given to WEBrick as :ProxyVia
		# browser_charset:: NKF::* const of the charset responses are
		#   converted to
		def initialize(
			bind_address = Socket.gethostname,
			bind_port = 8080,
			proxy_user_name = nil,
			proxy_user_pass = nil,
			gzip  = true,
			devel_log_filename = nil,
			server_log_filename = nil,
			server_log_depth = WEBrick::Log::DEBUG,
			proxy_via = false,
			browser_charset = NKF::EUC
		)
			@bind_address = bind_address
			@bind_port = bind_port
			@proxy_user_name = proxy_user_name
			@proxy_user_pass = proxy_user_pass
			@gzip = gzip
			@devel_log_filename = devel_log_filename
			@server_log_filename = server_log_filename
			@server_log_depth = server_log_depth
			@proxy_via = proxy_via
			@browser_charset = browser_charset

			# charset registry
			@charsets = BlazerProxy::CharsetRegistry.new
			@charsets_lock = Mutex.new

			# development log file
			@devel_log_file = @devel_log_filename ? File.open(@devel_log_filename, 'w') : nil

			@auth_proc = @proxy_user_pass ? Proc.new{|req, res|
				WEBrick::HTTPAuth.proxy_basic_auth(req, res, File.basename($0)) {|user, pass|
					user == @proxy_user_name && pass.crypt(@proxy_user_pass) == @proxy_user_pass
				}
			} : nil

			@request_filter = Proc.new{|req, res|
				# record the request for debug; https requests are not recorded.
				# `&&' is needed here: `=' binds tighter than `and', which would
				# leave only @devel_log_file in the variable.
				logging = (@devel_log_file && req.request_uri && 'https' != req.request_uri.scheme)
				if logging then
					@devel_log_file.puts "\nREQUEST:"
					@devel_log_file.puts "original: #{req.request_uri}"
					@devel_log_file.puts "content-type: #{req.content_type}"
					@devel_log_file.puts "body: #{req.body[0..65].inspect}" if req.body
				end

				# remember charset
				referer = begin
					req['referer'] ? URI.parse(req['referer']) : nil
				rescue URI::InvalidURIError
					nil
				end
				charset_out = nil
				@charsets_lock.synchronize do
					charset_out = (referer and @charsets[referer]) || @charsets[req.request_uri]
				end
				if logging then
					@devel_log_file.puts "charset recorded in proxy: #{BlazerProxy::nkf_to_charset(charset_out) || 'none'}"
				end

				# convert
				req.convert_to!(charset_out)

				# record the result
				if logging then
					@devel_log_file.puts "converted: #{req.request_uri}"
					@devel_log_file.puts "content-type: #{req.content_type}"
					@devel_log_file.puts "body: #{req.body[0..65].inspect}" if req.body
				end
			}

			@response_filter = Proc.new{|req, res|
				# record the response header for debug (see the request filter
				# for the reason `&&' is used)
				logging = (@devel_log_file && req.request_uri && 'https' != req.request_uri.scheme)
				if logging then
					@devel_log_file.puts "\nORIGINAL RESPONSE HEADER for\n  #{res.request_uri}"
					res.each do |head, val|
						@devel_log_file.puts "#{head}: #{val}"
					end
				end

				case res.content_type
				when %r{\A(?:text/html|application/.*xhtml\+xml|text/plain)}
					# record charset
					@charsets_lock.synchronize do
						@charsets[res.request_uri] = res.charset
					end
					if logging then
						@devel_log_file.puts "charset detected in proxy: #{BlazerProxy::nkf_to_charset(res.charset)}"
					end
					# decode
					res.decode!
					# convert charset
					res.convert_to!(@browser_charset)
					# gzip
					res.gzip! if @gzip
				when /\Aimage\//
					# shrink to fit display (nothing is done yet)
				end

				# record the converted response for debug
				if logging then
					@devel_log_file.puts "\nCONVERTED RESPONSE:"
					res.each do |head, val|
						@devel_log_file.puts "#{head}: #{val}"
					end
					@devel_log_file.puts "body:"
					if res.body then
						@devel_log_file.puts res.body[0..65].inspect + '...'
					end
					@devel_log_file.flush
				end
			}

			@server_config = {
				:BindAddress => @bind_address,
				:Port => @bind_port,
				:Logger => @server_log_filename ? WEBrick::Log::new(@server_log_filename, @server_log_depth) : nil,
				:ProxyAuthProc => @auth_proc,
				:RequestHandler => @request_filter,
				:ProxyContentHandler => @response_filter,
				:ProxyVia => @proxy_via,
			}
		end

		# start the server and block; on SIGINT the server is shut down and
		# the development log is closed
		def start
			s = WEBrick::HTTPProxyServer.new(@server_config)
			Signal.trap('INT') do
				s.shutdown
				@devel_log_file.close if @devel_log_file
			end
			s.start
		end
	end

end

#
# extensions to WEBrick
#
module WEBrick
	# Additions to WEBrick::HTTPResponse for finding out the charset of a
	# response and for decoding, converting and compressing its body.
	class HTTPResponse

		# returns the body with gzip or deflate content-encoding undone;
		# the result of the first call is remembered
		def decoded_body
			unless defined?(@decoded_body)
				@decoded_body = body
				if body and not body.empty? then
					case self['content-encoding']
					when 'gzip'
						gzfile = Zlib::GzipReader.new(StringIO.new(body))
						begin
							@decoded_body = gzfile.read
						ensure
							# close the reader even when the data is broken
							gzfile.close
						end
					when 'deflate'
						@decoded_body = Zlib::Inflate.inflate(body)
					end
				end
			end
			@decoded_body
		end

		# returns NKF::* const; detected on the first call and remembered
		def charset
			@charset = _charset unless defined?(@charset)
			@charset
		end

		# looks for the charset in the Content-type: header, the meta tags,
		# the XML declaration or byte order mark, and finally lets NKF guess;
		# returns nil if nothing is found
		def _charset
			# check Content-type: header
			r = ::BlazerProxy::content_type_to_nkf(content_type)
			return r if r
			if body and not body.empty? then
				# check HTML header
				decoded_body.scan(/<meta\s+(.*?)>/i).map{|e| e[0]}.each do |meta|
					if /http-equiv="?content-type"?/i =~ meta and /content="(.*?)"/i =~ meta then
						r = ::BlazerProxy::content_type_to_nkf($1)
						return r if r
					end
				end
				# check XML header
				# http://suika.fam.cx/~wakaba/-temp/wiki/wiki?XML%2F%2Fcharset
				if /\A<\?xml/i =~ decoded_body then
					if /\A<\?xml[^>]*\bencoding="(.*?)"[^>]*>/i =~ decoded_body then
						r = ::BlazerProxy::charset_to_nkf($1)
						return r if r
					end
					return NKF::UTF8
				end
				if content_type and /\bxml\b/ =~ content_type then
					# byte order mark
					bom16 = decoded_body.unpack('S1')[0]
					return NKF::UTF16 if 0xFEFF == bom16
					return NKF::UTF16 if 0xFFFE == bom16
					return NKF::UTF8 if [0xEF, 0xBB, 0xBF] == decoded_body.unpack('C3')
				end
				# guess from body
				r = NKF::guess(decoded_body)
				return r unless NKF::UNKNOWN == r || NKF::BINARY == r
			end
			return nil
		end
		private :_charset

		# number of bytes in +str+ (0 for nil). Content-Length counts bytes,
		# which String#size does not on rubies with multibyte strings.
		def _bytesize(str)
			return 0 unless str
			str.respond_to?(:bytesize) ? str.bytesize : str.size
		end
		private :_bytesize

		# decode: replaces the body with the decoded one
		def decode!
			self.body = decoded_body
			self['content-encoding'] = nil
			self['content-length'] = _bytesize(@body)
			self
		end

		# encode with gzip, unless a content-encoding is already set
		def gzip!
			if not self['content-encoding'] or self['content-encoding'].empty? then
				bodyio = StringIO.new
				gzfile = Zlib::GzipWriter.new(bodyio)
				gzfile.print body
				gzfile.close
				self.body = bodyio.string
				self['content-encoding'] = 'gzip'
				self['content-length'] = _bytesize(@body)
			end
		end

		# convert character set
		#
		# Rewrites the charset in the Content-type: header, in the meta tag
		# (one is inserted into text/html which has none) and in the XML
		# declaration, then converts the body unless it already is in
		# +charset_out+, and updates Content-length:. Returns self.
		def convert_to!(charset_out)
			# rewrite internal state
			charset_in = charset
			@charset = charset_out
			charset_str = BlazerProxy::nkf_to_charset(charset_out)

			# HTTP header
			if content_type then
				self.content_type = content_type.sub(/;\s*charset\s*=\s*[^\s;]*/i, '') + "; charset=#{charset_str}"
			end
			return self if not body or body.empty?

			s = self.body

			# HTML header
			have_meta = false
			s.gsub!(/(<meta\s+(.*?)>)/i) do
				tag = $1
				meta = $2
				if /http-equiv="?content-type"?/i =~ meta and /(content=".*?;(\s*.*?)")/i =~ meta then
					have_meta = true
					content_in = $1
					charset_from = $2
					content_out = content_in.sub(charset_from, " charset=#{charset_str}")
					tag = tag.sub(content_in, content_out)
				end
				tag
			end

			# insert meta tag if missing
			if /\Atext\/html/i =~ content_type and not have_meta then
				# create a space for <head>...</head>
				unless /<head.*<\/head>/im =~ s then
					s.sub!(/(<body>)/i){"<head></head>#{$1}"} or
					s.sub!(/(<html>)/i){"#{$1}<head></head>"} or
					s.sub!(/\A/){'<head></head>'}
				end
				# insert the meta tag
				s.sub!(/(<\/head>)/i){
					end_head = $1
					need_post = (/\A<\?xml/ =~ s) || (/\A<!DOCTYPE.*XHTML/ =~ s)
					%Q|<meta http-equiv="content-type" content="text/html; charset=#{charset_str}"#{need_post ? '/' : ''}>#{end_head}|
				}
			end

			# XML header
			# The declared name gets its own variable: charset_in has to
			# keep the NKF::* const for the conversion below.
			s.gsub!(/(\A<\?xml[^>]*\bencoding="(.*?)"[^>]*>)/i) do
				declaration = $1
				declared = $2
				declaration.sub(declared, charset_str)
			end

			# convert body
			unless charset_in == charset_out then
				s = BlazerProxy::convert(s, charset_out, charset_in)
			end
			self.body = s
			# the header rewriting above may have changed the length as well
			self['content-length'] = _bytesize(s).to_s

			self
		end
	end

	# Additions to WEBrick::HTTPRequest for converting the charset of the
	# query and of the form data in the body of a request.
	class HTTPRequest
		# replaces the request URI and everything derived from it;
		# raises HTTPStatus::BadRequest when +uri+ can not be parsed
		def unparsed_uri=(uri)
			@unparsed_uri = uri
			# The code below is copied from webrick/httprequest.rb,v 1.64
			# came with ruby 1.8.3 (2005-09-21)
			# Copyright (c) 2000, 2001 TAKAHASHI Masayoshi, GOTOU Yuuzou
			# Copyright (c) 2002 Internet Programming with Ruby writers.
			begin
				@request_uri = parse_uri(@unparsed_uri)
				@path = HTTPUtils::unescape(@request_uri.path)
				@path = HTTPUtils::normalize_path(@path)
				@host = @request_uri.host
				@port = @request_uri.port
				@query_string = @request_uri.query
				@script_name = ""
				@path_info = @path.dup
			rescue
				raise HTTPStatus::BadRequest, "bad URI `#{@unparsed_uri}'."
			end
		end

		# converts the query of the request URI, if there is one
		def convert_query_to!(charset_out)
			a = URI.split(self.unparsed_uri)
			# a[7] is the query component
			if a[7] and not a[7].empty? then
				a[7] = BlazerProxy::convert_urlencoded(a[7], charset_out)
				self.unparsed_uri = URI::Generic.build(a).to_s
			end
		end

		# converts a body of application/x-www-form-urlencoded or
		# multipart/form-data and updates Content-length: to the new number
		# of bytes; other bodies are left alone
		def convert_body_to!(charset_out)
			return if not body or body.empty?

			# convert body
			content_type = self['content-type']
			if /^application\/x-www-form-urlencoded/i =~ content_type then
				@body = BlazerProxy::convert_urlencoded(body, charset_out)
			elsif /^multipart\/form-data;\s*boundary=(.+)/i =~ content_type then
				boundary = HTTPUtils::dequote($1)
				@body = BlazerProxy::convert_form_data(body, boundary, charset_out)
			else
				return
			end

			# update content-length; it counts bytes, not characters
			length = (@body.respond_to?(:bytesize) ? @body.bytesize : @body.size).to_s
			@raw_header = @raw_header.map do |h|
				if /\Acontent-length\s*:/i =~ h then
					# only the value is replaced, the line end is kept
					h.sub(/:([ \t]*)[^\r\n]*/){':' + $1 + length}
				else
					h
				end
			end
			# NOTE(review): the lines are joined because parse_header of
			# current WEBrick releases reads a String line by line; the
			# release this code was written for iterated over its argument
			# with each, which also accepts a String on that ruby - confirm
			# against the WEBrick in use
			@header = HTTPUtils::parse_header(@raw_header.join)
		end

		# converts both the query and the body
		def convert_to!(charset_out)
			self.convert_query_to!(charset_out)
			self.convert_body_to!(charset_out)
		end
	end
end

# Run as a script: build the proxy from the settings at the top of this
# file and serve until the server is stopped.
if $0 == __FILE__
	settings = [
		bind_address,
		bind_port,
		proxy_user_name,
		proxy_user_pass,
		gzip,
		devel_log,
		server_log,
	]
	BlazerProxy::Proxy.new(*settings).start
end
