// Encoder.h
// (c) 2004-2006 exeal

#ifndef ENCODER_H_
#define ENCODER_H_

#include "../UnicodeUtils.h"
#include <cassert>
#include <map>
#include <memory>	// std::auto_ptr
#include <set>
#include <stdexcept>	// std::invalid_argument


namespace Ascension {

/// Encoding-related definitions
namespace Encodings {

/// Windows code page identifier
typedef uint CodePage;

//	Code page identifiers for encodings that have no Windows code page,
//	or that this library implements by itself
const CodePage
	CPEX_UNICODE_UTF16LE		= 1200,		///< UTF-16
	CPEX_UNICODE_UTF16BE		= 1201,		///< UTF-16 big endian
	CPEX_UNICODE_UTF32LE		= 12000,	///< UTF-32
	CPEX_UNICODE_UTF32BE		= 12001,	///< UTF-32 big endian
	CPEX_AUTODETECT				= 50001,	///< Auto-detect
	CPEX_JAPANESE_AUTODETECT	= 50932,	///< Japanese (auto-detect)
	CPEX_KOREAN_AUTODETECT		= 50949,	///< Korean (auto-detect)
	CPEX_AUTODETECT_SYSTEMLANG	= 70000,	///< Auto-detect (system language)
	CPEX_AUTODETECT_USERLANG	= 70001,	///< Auto-detect (user language)
	CPEX_UNICODE_AUTODETECT		= 70010,	///< Unicode (auto-detect)
	CPEX_UNICODE_UTF5			= 70011,	///< UTF-5
	CPEX_ARMENIAN_AUTODETECT	= 70020,	///< Armenian (auto-detect)
	CPEX_ARMENIAN_ARMSCII7		= 70021,	///< Armenian (ARMSCII-7)
	CPEX_ARMENIAN_ARMSCII8		= 70022,	///< Armenian (ARMSCII-8)
	CPEX_ARMENIAN_ARMSCII8A		= 70023,	///< Armenian (ARMSCII-8A)
	CPEX_VIETNAMESE_AUTODETECT	= 70030,	///< Vietnamese (auto-detect)
	CPEX_VIETNAMESE_TCVN		= 70031,	///< Vietnamese (TCVN)
	CPEX_VIETNAMESE_VISCII		= 70032,	///< Vietnamese (VISCII)
	CPEX_VIETNAMESE_VPS			= 70033,	///< Vietnamese (VPS)
	CPEX_JAPANESE_ISO2022JP		= 70040,	///< Japanese (ISO-2022-JP)
	CPEX_JAPANESE_SHIFTJIS		= 70041,	///< Japanese (Shift JIS)
	CPEX_JAPANESE_ISO2022JP1	= 70042,	///< Japanese (ISO-2022-JP-1)
	CPEX_JAPANESE_ISO2022JP2	= 70043,	///< Japanese (ISO-2022-JP-2)
	CPEX_JAPANESE_EUC			= 70044,	///< Japanese (EUC)
	CPEX_JAPANESE_ISO2022JP2004				= 70045,	///< Japanese (ISO-2022-JP-2004)
	CPEX_JAPANESE_ISO2022JP2004_STRICT		= 70046,	///< Japanese (ISO-2022-JP-2004-strict)
	CPEX_JAPANESE_ISO2022JP2004_COMPATIBLE	= 70047,	///< Japanese (ISO-2022-JP-2004-compatible)
	CPEX_JAPANESE_ISO2022JP3			= 70048,	///< Japanese (ISO-2022-JP-3)
	CPEX_JAPANESE_ISO2022JP3_STRICT		= 70049,	///< Japanese (ISO-2022-JP-3-strict)
	CPEX_JAPANESE_ISO2022JP3_COMPATIBLE	= 70050,	///< Japanese (ISO-2022-JP-3-compatible)
	CPEX_JAPANESE_SHIFTJIS2004	= 70051,	///< Japanese (Shift_JIS-2004)
	CPEX_JAPANESE_EUCJIS2004	= 70052,	///< Japanese (EUC-JIS-2004)
	CPEX_MULTILINGUAL_ISO2022_7BIT		= 70060,	///< Multilingual (ISO-2022, 7-bit)
	CPEX_MULTILINGUAL_ISO2022_7BITSS2	= 70061,	///< Multilingual (ISO-2022, 7-bit, SS2)
	CPEX_MULTILINGUAL_ISO2022_7BITSISO	= 70062,	///< Multilingual (ISO-2022, 7-bit, SI/SO)
	CPEX_MULTILINGUAL_ISO2022_8BITSS2	= 70063,	///< Multilingual (ISO-2022, 8-bit, SS2)
	CPEX_UNCATEGORIZED_BINARY	= 70070,	///< Binary
	CPEX_UNCATEGORIZED_NEXTSTEP	= 70071,	///< NEXTSTEP
	CPEX_UNCATEGORIZED_ATARIST	= 70072,	///< Atari ST/TT
	CPEX_THAI_TIS620	= 70080,	///< Thai (TIS 620-2533:1990)
	CPEX_LAO_MULELAO	= 70090,	///< Lao (MuleLao)
	CPEX_LAO_CP1133		= 70091,	///< Lao (ibm-1133)
	CPEX_IRISH_IS434	= 70100,	///< Irish (I.S. 434:1999)
	CPEX_TAMIL_TAB		= 70110,	///< Tamil (TAB)
	CPEX_TAMIL_TAM		= 70111,	///< Tamil (TAM)
	CPEX_TAMIL_TSCII	= 70112,	///< Tamil (TSCII 1.7)
	CPEX_HINDI_MACINTOSH	= 70115,	///< Hindi (Macintosh, Devanagari)
	CPEX_GUJARATI_MACINTOSH	= 70116,	///< Gujarati (Macintosh)
	CPEX_PANJABI_MACINTOSH	= 70117,	///< Panjabi (Macintosh, Gurmukhi)
	CPEX_CYRILLIC_MACINTOSH							= 10007,	///< Cyrillic (Macintosh)
	CPEX_CYRILLIC_KOI8R								= 20866,	///< Russian (KOI8-R)
	CPEX_CYRILLIC_RUSSIANSUPPORTFORDOS3				= 70120,	///< Russian (Russian support for DOS 3)
	CPEX_CYRILLIC_RUSSIANSUPPORTFORDOS4ACADEMIC		= 70121,	///< Russian (academic Russian support for DOS 4)
	CPEX_CYRILLIC_RUSSIANSUPPORTFORDOS3NONACADEMIC	= 70122,	///< Russian (non-academic Russian support for DOS; NOTE(review): the name says DOS 3, confirm the version)
	CPEX_CYRILLIC_SOVIETKOI8BASIC					= 70123,	///< Russian (Soviet KOI-8 basic)
	CPEX_CYRILLIC_SOVIETKOI8ALTERNATIVE				= 70124,	///< Russian (Soviet KOI-8 alternative)
	CPEX_CYRILLIC_SOVIETKOI7						= 70125,	///< Russian (Soviet KOI-7)
	CPEX_CYRILLIC_ECMA								= 70126,	///< Cyrillic (ISO-IR-111, ECMA)
	CPEX_CYRILLIC_KOI8RU							= 70127,	///< Cyrillic (KOI8-RU)
	CPEX_CYRILLIC_KOI8UNIFIED						= 70128,	///< Cyrillic (KOI8 unified)
	CPEX_ISO8859_1	= 28591,	///< Western European (ISO-8859-1)
	CPEX_ISO8859_2	= 28592,	///< Central European (ISO-8859-2)
	CPEX_ISO8859_3	= 28593,	///< Southern European (ISO-8859-3)
	CPEX_ISO8859_4	= 28594,	///< Baltic (ISO-8859-4)
	CPEX_ISO8859_5	= 28595,	///< Cyrillic (ISO-8859-5)
	CPEX_ISO8859_6	= 28596,	///< Arabic (ISO-8859-6)
	CPEX_ISO8859_7	= 28597,	///< Greek (ISO-8859-7)
	CPEX_ISO8859_8	= 28598,	///< Hebrew (ISO-8859-8)
	CPEX_ISO8859_9	= 28599,	///< Turkish (ISO-8859-9)
	CPEX_ISO8859_10	= 28600,	///< Nordic (ISO-8859-10)
	CPEX_ISO8859_11	= 28601,	///< Thai (ISO-8859-11)
	CPEX_ISO8859_13	= 28603,	///< Baltic (ISO-8859-13)
	CPEX_ISO8859_14	= 28604,	///< Celtic (ISO-8859-14)
	CPEX_ISO8859_15	= 28605,	///< Western European (ISO-8859-15)
	CPEX_ISO8859_16	= 28606;	///< South-Eastern European (ISO-8859-16)

// Byte order marks (BOMs).
// Each array is initialized from a string literal and therefore also holds the
// literal's implicit terminating NUL. The UTF-32 marks contain NUL bytes
// themselves, so compare them with an explicit byte count (2, 4 or 3 bytes)
// rather than with string functions.
const uchar	UTF16LE_BOM[] = "\xFF\xFE";			///< UTF-16 little endian BOM (FF FE)
const uchar	UTF16BE_BOM[] = "\xFE\xFF";			///< UTF-16 big endian BOM (FE FF)
const uchar	UTF32LE_BOM[] = "\xFF\xFE\x00\x00";	///< UTF-32 little endian BOM (FF FE 00 00)
const uchar	UTF32BE_BOM[] = "\x00\x00\xFE\xFF";	///< UTF-32 big endian BOM (00 00 FE FF)
const uchar	UTF8_BOM[] = "\xEF\xBB\xBF";		///< UTF-8 BOM (EF BB BF)


// Encoder class definition
/////////////////////////////////////////////////////////////////////////////

// Default native character used when a character cannot be converted
// (it might be better to obtain this from the user's language)
const uchar NATIVE_DEFAULT_CHARACTER = '?';

// Character used for code points that are not mapped in a character set
const wchar_t	REPLACEMENT_CHARACTER	= 0xFFFD;
// Short alias of REPLACEMENT_CHARACTER
const wchar_t	RP__CH					= REPLACEMENT_CHARACTER;
// NOTE(review): presumably the "no mapping" marker for native-side tables -- confirm against the tables that use it
const uchar		N__A					= 0x00;

template<typename Ch> void setDefaultChar(Ch& ch);
template<> inline void setDefaultChar(char& ch) {ch = NATIVE_DEFAULT_CHARACTER;}
template<> inline void setDefaultChar(uchar& ch) {ch = NATIVE_DEFAULT_CHARACTER;}
template<> inline void setDefaultChar(ushort& ch) {ch = NATIVE_DEFAULT_CHARACTER;}
template<> inline void setDefaultChar(wchar_t& ch) {ch = REPLACEMENT_CHARACTER;}
template<> inline void setDefaultChar(ulong& ch) {ch = REPLACEMENT_CHARACTER;}

// Masking helpers: each keeps only the low bits of (c) selected by the mask
// and casts the result to the type named in the static_cast.
// BIT7_MASK keeps the low 7 bits, BIT8_MASK the low 8 bits,
// BIT16_MASK and UTF16_MASK the low 16 bits.
#define BIT7_MASK(c)	static_cast<uchar>((c) & 0x7F)
#define BIT8_MASK(c)	static_cast<uchar>((c) & 0xFF)
#define BIT16_MASK(c)	static_cast<ushort>((c) & 0xFFFF)
#define UTF16_MASK(c)	static_cast<wchar_t>((c) & 0xFFFF)

// Handles an unconvertable character at the expansion site.
// Requires a variable named `callback` (a pointer with an
// onFoundUnconvertableCharacter() member) to be in scope.
// If `callback` is null, or the callback returns true, the default character
// is stored into `lhs` with setDefaultChar and `callback` is set to null so
// that it is not consulted again. Otherwise the enclosing function returns 0.
// The expansion is a braced compound statement, not a single expression.
#define CONFIRM_ILLEGAL_CHAR(lhs)											\
	{																		\
		if(callback == 0 || callback->onFoundUnconvertableCharacter()) {	\
			setDefaultChar(lhs);											\
			callback = 0;													\
		} else																\
			return 0;														\
	}

// Parameter list for definitions of fromUnicode (UTF-16 source, byte destination).
// The default arguments are shown only as comments; they are declared on
// Encoder::fromUnicode.
#define CFU_ARGLIST											\
	uchar* dest, std::size_t destLength,					\
	const wchar_t* src, std::size_t srcLength /* = -1 */,	\
	IUnconvertableCharCallback* callback /* = 0 */

// Parameter list for definitions of toUnicode (byte source, UTF-16 destination).
// The default arguments are shown only as comments; they are declared on
// Encoder::toUnicode.
#define CTU_ARGLIST										\
	wchar_t* dest, std::size_t destLength,				\
	const uchar* src, std::size_t srcLength /* = -1 */,	\
	IUnconvertableCharCallback* callback /* = 0 */

// Argument check for the start of a fromUnicode implementation.
// Asserts that `dest` and `src` are non-null, then replaces a `srcLength`
// of -1 with the length of the NUL-terminated wide string `src`.
// NOTE: expands to two statements without a trailing semicolon; do not use it
// as the sole body of an unbraced if/else/loop.
#define CFU_CHECKARGS()				\
	assert(dest != 0 && src != 0);	\
	if(srcLength == -1)				\
		srcLength = wcslen(src)

// Argument check for the start of a toUnicode implementation.
// Asserts that `dest` and `src` are non-null, then replaces a `srcLength`
// of -1 with the length of the NUL-terminated byte string `src`.
// NOTE: expands to two statements without a trailing semicolon; do not use it
// as the sole body of an unbraced if/else/loop.
#define CTU_CHECKARGS()				\
	assert(dest != 0 && src != 0);	\
	if(srcLength == -1)				\
		srcLength = strlen(reinterpret_cast<const char*>(src))

// Table-lookup helpers for conversion loops.
// All of them expect `src` and `dest` arrays and an index `i` at the expansion
// site; the non-SB variants additionally write through a separate destination
// index `j`, while the SB variants write to dest[i].
// Each macro expands to an `if` (or `else if`) statement without a trailing
// semicolon so that several of them can be chained and closed by a final `else`.
// The `offset` and `table` arguments are parenthesized in the expansion, so
// expressions such as `BASE + 0x20` may be passed safely.

// Continues a chain: `else` followed by MAP_TABLE_START.
#define MAP_TABLE(offset, table)	\
	else MAP_TABLE_START(offset, table)

// If src[i] lies in [offset, offset + countof(table)), stores table[src[i] - offset] into dest[j].
#define MAP_TABLE_START(offset, table)	\
	if(src[i] >= (offset) && src[i] < (offset) + countof(table)) dest[j] = (table)[src[i] - (offset)]

// If src[i] lies in [0, countof(table)), stores table[src[i]] into dest[j].
#define MAP_TABLE_0(table)	\
	if(src[i] < countof(table)) dest[j] = (table)[src[i]]

// Continues a chain: `else` followed by MAP_TABLE_SB_START.
#define MAP_TABLE_SB(offset, table)	\
	else MAP_TABLE_SB_START(offset, table)

// If src[i] lies in [offset, offset + countof(table)), stores table[src[i] - offset] into dest[i].
#define MAP_TABLE_SB_START(offset, table)	\
	if(src[i] >= (offset) && src[i] < (offset) + countof(table)) dest[i] = (table)[src[i] - (offset)]

// If src[i] lies in [0, countof(table)), stores table[src[i]] into dest[i].
#define MAP_TABLE_SB_0(table)	\
	if(src[i] < countof(table)) dest[i] = (table)[src[i]]

/// Callback interface for handling characters that cannot be converted
class IUnconvertableCharCallback {
public:
	/// Destructor
	virtual ~IUnconvertableCharCallback() {}
	/**
	 *	Called when a character cannot be converted to Unicode while reading a file,
	 *	or cannot be converted to the native encoding while saving.
	 *	The return value decides how that character is handled.
	 *	This method is called at most once per conversion.
	 *	@retval true	replace unconvertable characters with the default character and continue
	 *	@retval false	abort the reading/saving immediately (the conversion method returns 0)
	 */
	virtual bool onFoundUnconvertableCharacter() = 0;
};

/// Encoder: abstract interface for converting between UTF-16 and a native encoding
class Encoder : public Manah::Noncopyable {
	// constructors
protected:
	Encoder() {}
public:
	virtual ~Encoder() {}

	// methods
public:
	/**
	 *	Converts from UTF-16 to the native encoding
	 *	@param dest			[out] the destination buffer
	 *	@param destLength	the length of the destination buffer
	 *	@param src			the source string
	 *	@param srcLength	the number of characters in the source (-1 is the default)
	 *	@param callback		the callback used for unconvertable characters. can be null
	 *	@return				the length of the converted result
	 */
	virtual std::size_t fromUnicode(uchar* dest, std::size_t destLength,
		const wchar_t* src, std::size_t srcLength = -1, IUnconvertableCharCallback* callback = 0) = 0;
	/**
	 *	Converts from the native encoding to UTF-16
	 *	@param dest			[out] the destination buffer
	 *	@param destLength	the number of characters in the destination buffer
	 *	@param src			the source bytes
	 *	@param srcLength	the length of the source (-1 is the default)
	 *	@param callBack		the callback used for unconvertable characters. can be null
	 *	@return				the length of the converted result
	 */
	virtual std::size_t toUnicode(wchar_t* dest, std::size_t destLength,
		const uchar* src, std::size_t srcLength = -1, IUnconvertableCharCallback* callBack = 0) = 0;
	/// Returns the maximum number of bytes needed to convert one UCS character to the native encoding
	virtual uchar getMaxNativeCharLength() const = 0;
	/// Returns the maximum length needed to convert one native byte to UCS (in UTF-16 units)
	virtual uchar getMaxUCSCharLength() const = 0;
};


/// Factory of encoders: keeps the registered encoder producers, code page
/// detectors and the set of code pages registered as read-only
class EncoderFactory {
	// data types
public:
	/// Function that creates a new encoder
	typedef std::auto_ptr<Encoder>(*EncoderProducer)();
	/// Function that performs code page detection on a byte buffer
	typedef void(*CodePageDetector)(const uchar*, std::size_t, CodePage&, std::size_t&);

	// methods
public:
	std::auto_ptr<Encoder>	createEncoder(CodePage cp);
	CodePage				detectCodePage(const uchar* src, std::size_t length, CodePage cp);
	void					enumCodePages(std::set<CodePage>& codePages) const;
	static EncoderFactory&	getInstance();
	CodePageDetector		getUnicodeDetector() const;
	bool					isCodePageForAutoDetection(CodePage cp) const;
	bool					isCodePageForReadOnly(CodePage cp) const;
	bool					isValidCodePage(CodePage cp) const;

	bool	registerCodePageForReadOnly(CodePage cp);
	bool	registerDetector(CodePage cp, CodePageDetector factoryMethod);
	bool	registerEncoder(CodePage cp, EncoderProducer factoryMethod);

	// data members
private:
	typedef std::map<CodePage, EncoderProducer>		EncoderMap;
	typedef std::map<CodePage, CodePageDetector>	DetectorMap;
	EncoderMap			registeredEncoders_;	// code page -> encoder producer
	DetectorMap			registeredDetectors_;	// code page -> detector function
	std::set<CodePage>	codePagesForReadOnly_;	// code pages registered as read-only
};


// Opens the unnamed namespace that encloses encoder definitions.
#define BEGIN_ENCODER_DEFINITION()	namespace {

// Closes the namespace opened by BEGIN_ENCODER_DEFINITION.
#define END_ENCODER_DEFINITION()	}

// Declares the class Encoder_<name>, derived from Encoder, with a private
// constructor, the four overridden methods (declared only) and a static
// create() that returns a new instance. Also defines the constant res<name>,
// whose initializer registers create() with the EncoderFactory for code page `cp`.
// The constructor and the member functions must be defined separately.
#define DEFINE_ENCODER_CLASS_(cp, name)							\
	class Encoder_##name : public Encoder {						\
	private:													\
		Encoder_##name();										\
	public:														\
		std::size_t	fromUnicode(CFU_ARGLIST);					\
		std::size_t	toUnicode(CTU_ARGLIST);						\
		uchar		getMaxNativeCharLength() const;				\
		uchar		getMaxUCSCharLength() const;				\
		static std::auto_ptr<Encoder> create() {				\
			return std::auto_ptr<Encoder>(new Encoder_##name);}	\
	};															\
	const bool res##name =										\
		EncoderFactory::getInstance().registerEncoder(cp, &Encoder_##name::create);

// Same as DEFINE_ENCODER_CLASS_, and additionally defines an empty constructor,
// getMaxNativeCharLength() returning `cch` and getMaxUCSCharLength() returning `ccp`.
// fromUnicode and toUnicode still have to be defined by the user of the macro.
#define DEFINE_ENCODER_CLASS(cp, name, cch, ccp)						\
	DEFINE_ENCODER_CLASS_(cp, name)										\
	Encoder_##name::Encoder_##name() {}									\
	uchar Encoder_##name::getMaxNativeCharLength() const {return cch;}	\
	uchar Encoder_##name::getMaxUCSCharLength() const {return ccp;}

// Declares the function detectCodePage_<name> inside an unnamed namespace and
// defines the constant res<name>, whose initializer registers that function
// with the EncoderFactory as the detector for code page `cp`.
// The function itself must be defined separately.
#define DEFINE_DETECTOR(cp, name)														\
	namespace {																			\
		void detectCodePage_##name(const uchar* buffer,									\
			std::size_t length, CodePage& result, std::size_t& convertableLength);		\
		const bool res##name =															\
			EncoderFactory::getInstance().registerDetector(cp, &detectCodePage_##name);	\
	}

// Defines the constant res<cp>, whose initializer registers `cp` with the
// EncoderFactory as a read-only code page. The argument is pasted into the
// constant's name as written, and the expansion has no trailing semicolon.
#define REGISTER_READONLY_CODEPAGE(cp)	\
	const bool res##cp = EncoderFactory::getInstance().registerCodePageForReadOnly(cp)


// Encoder that uses the conversion tables of Windows as they are
// (::WideCharToMultiByte and ::MultiByteToWideChar) for one code page.
class WindowsEncoder : public Encoder {
private:
	/**
	 *	Constructor. Private: instances are created by the friend class EncoderFactory.
	 *	@param cp	the code page to convert with
	 *	@throw std::invalid_argument	::IsValidCodePage rejected @a cp
	 */
	WindowsEncoder(CodePage cp) : codePage_(cp) {
		if(!toBoolean(::IsValidCodePage(cp)))
			throw std::invalid_argument("Specified code page is not supported.");
	}
public:
	/**
	 *	Converts from UTF-16 to the code page.
	 *	@return	the value returned by ::WideCharToMultiByte, or 0 if the input
	 *			is empty or the callback asked to abort
	 */
	std::size_t fromUnicode(CFU_ARGLIST) {
		// The Win32 conversion fails for zero-length input. Without this guard the
		// failure would be reported to the callback as an unconvertable character.
		if(srcLength == 0)
			return 0;
		// First attempt; a non-zero result is returned as is.
		if(const int result = ::WideCharToMultiByte(codePage_, 0,
				src, static_cast<int>(srcLength), reinterpret_cast<char*>(dest), static_cast<int>(destLength), 0, 0))
			return result;
		// The first attempt failed: retry with WC_DEFAULTCHAR unless the callback refuses.
		return (callback == 0 || callback->onFoundUnconvertableCharacter()) ?
			::WideCharToMultiByte(codePage_, WC_DEFAULTCHAR,
				src, static_cast<int>(srcLength), reinterpret_cast<char*>(dest), static_cast<int>(destLength), 0, 0) : 0;
	}
	/**
	 *	Converts from the code page to UTF-16.
	 *	@return	the value returned by ::MultiByteToWideChar, or 0 if the input
	 *			is empty or the callback asked to abort
	 */
	std::size_t toUnicode(CTU_ARGLIST) {
		// The Win32 conversion fails for zero-length input. Without this guard the
		// failure would be reported to the callback as an unconvertable character.
		if(srcLength == 0)
			return 0;
		// First attempt is strict: MB_ERR_INVALID_CHARS makes invalid input fail.
		if(const int result = ::MultiByteToWideChar(codePage_, MB_ERR_INVALID_CHARS,
				reinterpret_cast<const char*>(src), static_cast<int>(srcLength), dest, static_cast<int>(destLength)))
			return result;
		// The strict attempt failed: retry without the flag unless the callback refuses.
		return (callback == 0 || callback->onFoundUnconvertableCharacter()) ?
			::MultiByteToWideChar(codePage_, 0,
				reinterpret_cast<const char*>(src), static_cast<int>(srcLength), dest, static_cast<int>(destLength)) : 0;
	}
	/// Returns MaxCharSize reported by ::GetCPInfo for the code page, or 0 if that call fails
	uchar getMaxNativeCharLength() const {CPINFO cpi; return toBoolean(::GetCPInfo(codePage_, &cpi)) ? cpi.MaxCharSize : 0;}
	/// Always returns 1
	uchar getMaxUCSCharLength() const {return 1;}
	friend class EncoderFactory;
private:
	const CodePage codePage_;	// the code page passed to the constructor
};


/// Returns the sole instance, which is created on the first call
inline EncoderFactory& EncoderFactory::getInstance() {
	static EncoderFactory theFactory;
	return theFactory;
}

/// Returns the detector registered for CPEX_UNICODE_AUTODETECT, or null if none is registered
inline EncoderFactory::CodePageDetector EncoderFactory::getUnicodeDetector() const {
	const DetectorMap::const_iterator found = registeredDetectors_.find(CPEX_UNICODE_AUTODETECT);
	if(found == registeredDetectors_.end())
		return 0;
	return found->second;
}

/// Returns true if a detector is registered for the code page, i.e. it is a code page for auto-detection
inline bool EncoderFactory::isCodePageForAutoDetection(CodePage cp) const {
	return registeredDetectors_.count(cp) != 0;
}

/// Returns true if the code page has been registered as read-only
/// (NOTE(review): presumably a code page that supports only conversion from the native encoding to UCS -- confirm)
inline bool EncoderFactory::isCodePageForReadOnly(CodePage cp) const {
	return codePagesForReadOnly_.count(cp) != 0;
}

/// Returns true if the code page is valid: accepted by ::IsValidCodePage,
/// usable for auto-detection, or covered by a registered encoder
inline bool EncoderFactory::isValidCodePage(CodePage cp) const {
	if(toBoolean(::IsValidCodePage(cp)))
		return true;
	if(isCodePageForAutoDetection(cp))
		return true;
	return registeredEncoders_.count(cp) != 0;
}

/**
 *	Registers a code page as read-only
 *	@param cp	the code page
 *	@return		true if the code page was newly registered, false if it was already registered
 */
inline bool EncoderFactory::registerCodePageForReadOnly(CodePage cp) {
	const std::pair<std::set<CodePage>::iterator, bool> outcome = codePagesForReadOnly_.insert(cp);
	return outcome.second;
}

/**
 *	Registers a code page detector
 *	@param cp				the code page
 *	@param factoryMethod	the function that performs the detection. must not be null
 *	@return					true if the detector was registered, false if the code page already had one
 */
inline bool EncoderFactory::registerDetector(CodePage cp, CodePageDetector factoryMethod) {
	assert(factoryMethod != 0);
	if(registeredDetectors_.find(cp) != registeredDetectors_.end())
		return false;	// an existing registration is kept untouched
	registeredDetectors_.insert(DetectorMap::value_type(cp, factoryMethod));
	return true;
}

/**
 *	Registers an encoder
 *	@param cp				the code page
 *	@param factoryMethod	the function that creates the encoder. must not be null
 *	@return					true if the encoder was registered, false if the code page already had one
 */
inline bool EncoderFactory::registerEncoder(CodePage cp, EncoderProducer factoryMethod) {
	assert(factoryMethod != 0);
	const EncoderMap::value_type entry(cp, factoryMethod);
	const bool inserted = registeredEncoders_.insert(entry).second;
	return inserted;
}

} // namespace Encodings
} // namespace Ascension

#endif /* ENCODER_H_ */

/* [EOF] */