/*
 * File: encoding.c
 *
 * Copyright (C) 2003 Kiyo <kiyo@teki.jpn.ph>
 * This code referred to Mr. Robert Thomson's code. and created it.
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

/*
 * guess encoding charset and convert
 */

#include <config.h>
#include <ctype.h>
#include <string.h>
#include <gtk/gtk.h>
#include <errno.h>    /* for iconv error codes */
#include <iconv.h>

#include "intl.h"
#include "misc.h"
#include "encoding.h"

//#define DEBUG_LEVEL 10
#include "debug.h"

/* Upper bound on how many leading bytes of a document are copied and
 * searched for a <meta> charset declaration in a_Encoding_get_encoding(). */
#define MAX_HEADER_LENGTH 10240

/* Charset name chosen by a_Encoding_set_DW_lang() and handed out by
 * a_Encoding_get_DW_charset(). */
static gchar *dw_charset;
/* TRUE when the language given to a_Encoding_set_DW_lang() starts with "JA";
 * makes a_Encoding_get_encoding() fall back to Japanese charset guessing. */
gboolean detect_ja_charset;
/* Value reported by a_Encoding_has_latin1_charset(); cleared for "JA"
 * languages by a_Encoding_set_DW_lang(). */
gboolean has_latin1_charset;

/* One row of the language -> charset lookup table. */
typedef struct {
   gchar *lang;      /* upper-case language/locale prefix, e.g. "ZH_TW" */
   gchar *charset;   /* charset name returned for that prefix */
} lang_charset;

/*
 * Language prefix -> default charset table.
 *
 * Entries are matched as *prefixes* of the upper-cased language string, in
 * table order, so a longer key must appear before any shorter key that is a
 * prefix of it (e.g. "SR_SP" before "SR", "RU_UA" before "RU").
 *
 * The array is left unsized and LANG_TBL_N is derived from it, so the count
 * can never get out of step with the initializer list.
 */
static lang_charset lang_tbl[] = {
   {"ZH_TW",         "BIG5"},
   {"ZH_HK",         "BIG5HKSCS"},
   {"ZH_CN",         "GBK"},
   {"YI",            "CP1255"},
   {"VI",            "TCVN"},
   {"UR",            "CP1256"},
   {"UK",            "KOI8-U"},
   {"TURKISH",       "ISO8859-9"},
   {"TT",            "TATAR-CYR"},
   {"TR",            "ISO8859-9"},
   {"TG",            "KOI8-C"},
   {"TA",            "TSCII-0"},
   {"SR_SP",         "ISO8859-2"},
   {"SR",            "ISO8859-5"},
   {"SQ",            "ISO8859-2"},
   {"SP",            "ISO8859-5"},
   {"SL",            "ISO8859-2"},
   {"SK",            "ISO8859-2"},
   {"SH",            "ISO8859-2"},
   {"SE_NO",         "UTF-8"},
   {"SERBOCROATIAN", "ISO8859-2"},
   {"RU_UA",         "KOI8-U"},
   {"RU_RU",         "KOI8-R"},
   {"RUSSIAN",       "ISO8859-5"},
   {"RUMANIAN",      "ISO8859-2"},
   {"RU",            "KOI8-R"},
   {"RO",            "ISO8859-2"},
   {"POSIX",         "C"},
   {"POLISH",        "ISO8859-2"},
   {"PL",            "ISO8859-2"},
   {"MT",            "ISO8859-3"},
   {"MK",            "ISO8859-5"},
   {"LO",            "MULELAO-1"},
   {"KO",            "EUCKR"},
   {"KA",            "GEORGIAN-ACADEMY"},
   {"JP_JP",         "EUCJP"},
   {"JA",            "EUCJP"},
   {"IW",            "ISO8859-8"},
   {"IU",            "NUNACOM-8"},
   {"HY",            "ARMSCII-8"},
   {"HU",            "ISO8859-2"},
   {"HR",            "ISO8859-2"},
   {"HI",            "ISCII-DEV"},
   {"HE",            "ISO8859-8"},
   {"GREEK",         "ISO8859-7"},
   {"FA",            "UTF-8"},
   {"EO",            "ISO8859-3"},
   {"EL",            "ISO8859-7"},
   {"EE",            "ISO8859-4"},
   {"CZ",            "ISO8859-2"},
   {"CS",            "ISO8859-2"},
   {"CROATIAN",      "ISO8859-2"},
   {"CHINESE-T",     "EUCTW"},
   {"CHINESE-S",     "EUCCN"},
   {"BE",            "CP1251"},
   {"AZ",            "ISO8859-9E"},
   {"AR",            "ISO8859-6"},
   {"AM",            "UTF-8"},
   {"A3",            "KOI8-C"},
};
/* Number of rows in lang_tbl; cast to int because it is compared with an
 * int loop index. */
#define LANG_TBL_N ((int)(sizeof(lang_tbl) / sizeof(lang_tbl[0])))

/*
 * Map a language/locale name (without an explicit ".charset" suffix) to a
 * default charset name.
 *
 * lang   : language string, e.g. "ja_JP" or "russian"; must not be NULL.
 * return : newly allocated charset name; the caller owns it.
 *
 * The name is upper-cased into a local buffer (names longer than the buffer
 * are truncated), then:
 *   - anything containing "EURO" yields "ISO8859-15";
 *   - otherwise the first lang_tbl row whose key is a prefix of the name wins;
 *   - otherwise "ISO8859-1" is returned and has_latin1_charset is set TRUE.
 */
static gchar *Encoding_get_charset_from_lang(const gchar *lang) {
   int i;
   gchar u_lang[64];

   /* strncpy() does not terminate the destination when the source fills it,
    * so copy one byte less than the buffer and terminate explicitly. */
   strncpy(u_lang, lang, sizeof(u_lang) - 1);
   u_lang[sizeof(u_lang) - 1] = '\0';
   g_strup(u_lang);

   if(strstr(u_lang, "EURO")) return g_strdup("ISO8859-15");

   for(i = 0; i < LANG_TBL_N; i++) {
      /* prefix match: only the first strlen(key) characters are compared */
      if(strncmp(u_lang, lang_tbl[i].lang, strlen(lang_tbl[i].lang)) == 0)
         return g_strdup(lang_tbl[i].charset);
   }

   has_latin1_charset = TRUE;
   return g_strdup("ISO8859-1");
}

/*
 * Select the widget charset (dw_charset) from a language/locale string.
 *
 * lang : locale-like string such as "ja_JP.eucJP" or "ru"; may be NULL.
 *
 * - NULL selects "ISO8859-1".
 * - A name starting with "JA" (any case) switches on Japanese charset
 *   detection and clears the latin1 flag.
 * - If the name contains a '.', the text after the first '.' is taken as the
 *   charset and normalized by a_Encoding_fix_charset(); otherwise the
 *   language table is consulted.
 *
 * Note: the previous dw_charset value is overwritten without being freed.
 */
void a_Encoding_set_DW_lang(const gchar *lang) {
   const gchar *dot;

   /* defaults, possibly overridden below */
   detect_ja_charset = FALSE;
   has_latin1_charset = TRUE;

   if (lang == NULL) {
      dw_charset = g_strdup("ISO8859-1");
      return;
   }

   if (g_strncasecmp(lang, "JA", 2) == 0) {
      detect_ja_charset = TRUE;
      has_latin1_charset = FALSE;
   }

   dot = strchr(lang, '.');
   if (dot != NULL)
      dw_charset = a_Encoding_fix_charset(g_strdup(dot + 1));
   else
      dw_charset = Encoding_get_charset_from_lang(lang);
}

/* Return the charset name stored by a_Encoding_set_DW_lang().
 * The pointer is the module's own copy, not a duplicate. */
gchar *a_Encoding_get_DW_charset()
{
   return dw_charset;
}

/* Report the latin1 flag: TRUE unless a_Encoding_set_DW_lang() was given a
 * language starting with "JA". */
gboolean a_Encoding_has_latin1_charset()
{
   return has_latin1_charset;
}

/*
 * Normalize an HTML charset name into the spelling handed to iconv,
 * e.g. "x-sjis" becomes "SJIS".
 *
 * charset : heap-allocated name (or NULL). Ownership passes to this
 *           function: the string is upper-cased in place and may be freed.
 * return  : heap-allocated normalized name (possibly the same pointer),
 *           or NULL when charset is NULL.
 */
gchar *a_Encoding_fix_charset(gchar *charset) {
   gchar *name = charset;
   const gchar *canonical = NULL;

   if (name == NULL)
      return NULL;

   g_strup(name);

   /* drop a leading "X-" by replacing the string with a copy of its tail */
   if (name[0] == 'X' && name[1] == '-') {
      gchar *stripped = g_strdup(name + 2);
      g_free(name);
      name = stripped;
   }

   /* known aliases, matched by prefix */
   if (strncmp(name, "SHIFTJIS", 8) == 0)
      canonical = "SJIS";
   else if (strncmp(name, "UNICODE-1-1-UTF-8", 17) == 0)
      canonical = "UTF-8";

   if (canonical == NULL)
      return name;

   g_free(name);
   return g_strdup(canonical);
}

/*
 * Guess which Japanese encoding a buffer uses: SJIS, EUC-JP or ISO-2022-JP.
 *
 * buf     : text to inspect; it is searched with strstr(), so it must be
 *           NUL-terminated. NULL yields "SJIS".
 * bufsize : number of bytes of buf to scan.
 * return  : newly allocated charset name; the caller owns it.
 *
 * The scan keeps one score per encoding and returns early once a score is
 * decisive; otherwise the final scores (and, as a last resort, the presence
 * of a carriage return) decide.
 */
gchar* Encoding_detect_japanese_charset(gchar *buf, gint bufsize) {
   int i, got_sjis = 0, got_euc = 0, got_jis = 0;
   if (!buf) return  g_strdup("SJIS");

   /* A carriage return anywhere in the text counts slightly towards SJIS. */
   if (strstr(buf, "\r")) got_sjis += 2;

   for (i=0; i<bufsize; i++) {
      /* early exit as soon as one candidate is clearly ahead */
      if (got_sjis > got_euc && got_sjis >= 40) return g_strdup("SJIS");
      else if (got_euc > got_sjis && got_euc >= 40) return g_strdup("EUC-JP");
      else if (got_jis > 3) return g_strdup("ISO-2022-JP");

      if (buf[i] == 0x1b) {
         /* ESC followed by '$' or '(' looks like an ISO-2022-JP escape */
         if (buf[i+1] == '$' || buf[i+1] == '(') got_jis++;
         else got_jis--;
      } else if (IS_ASCII_CHAR(buf[i])) continue;

      DEBUG_MSG(10, "e=%d s=%d j=%d %d\n", got_euc, got_sjis, got_jis, i);

      if (IS_SJIS_CHAR1(buf[i])) {
         if(IS_SJIS_CHAR2(buf[i+1])) {
            i++;
            got_sjis += 2;
            continue;
         } else got_euc += 2;
      }
      if (IS_EUC_CHAR(buf[i])) {
         if (IS_EUC_CHAR(buf[i+1])) {
            got_euc += 2;
            i++;
         } else got_sjis += 2;
      }
      /* A run of three half-width kana ending here hints at SJIS.
       * buf[i - 2] is read, so at least two bytes must precede position i
       * (the former "i >= 1" test read buf[-1] when i was 1). */
      if (i >= 2
            && IS_HANKANA_CHAR(buf[i - 2])
            && IS_HANKANA_CHAR(buf[i - 1])
            && IS_HANKANA_CHAR(buf[i])) {
         got_sjis++;
         if (IS_HANKANA_CHAR(buf[i+1])) got_sjis++;
      }
   }

   /* No decisive score: settle for the best guess. */
   DEBUG_MSG(10, _("charset detect may mistake...\n"));
   if (got_jis > 1)        return g_strdup("ISO-2022-JP");
   if (got_sjis > got_euc) return g_strdup("SJIS");
   else if (got_euc >= 1)  return g_strdup("EUC-JP");

   /* Still undecided: use the line-ending style as the tie breaker. */
   if(strstr(buf, "\r")) return g_strdup("SJIS");
   else return g_strdup("EUC-JP");
}

/** Guess the charset of a document buffer.
 * buf     : buffer to inspect.
 * bufsize : number of bytes in buf.
 * usemeta : in:  TRUE to look for a charset in a <meta http-equiv> tag first.
 *           out: left TRUE only when the returned charset came from the
 *                <meta> tag; set to FALSE when the tag was not usable.
 * return  : charset name.
 *
 * Order of attempts: <meta> declaration (if requested), "all ASCII" check,
 * UTF-8 byte-pattern scoring, Japanese detection (when enabled by
 * a_Encoding_set_DW_lang()), and finally DW_CHARSET.
 *
 * NOTE(review): every path except the last returns a freshly allocated
 * string; DW_CHARSET is defined outside this file -- confirm whether callers
 * may free its value.
 */
gchar* a_Encoding_get_encoding(gchar *buf, gint bufsize, gboolean *usemeta) {
   int i, utf_count = 0;

   /* use <meta> tag */
   if(*usemeta){
      gchar *charset = NULL;

      /* An empty or missing buffer has no header to search; skipping it also
       * avoids indexing the copy at a negative offset. */
      if(buf && bufsize > 0) {
         size_t metaBufSize = (bufsize > MAX_HEADER_LENGTH)
                              ? (size_t)MAX_HEADER_LENGTH : (size_t)bufsize;
         /* g_strndup() NUL-terminates the copy itself. */
         char *metaBuf = g_strndup(buf, metaBufSize);

         if(metaBuf && a_Misc_stristr(metaBuf,"<meta")
               && a_Misc_stristr(metaBuf,"http-equiv=\"content-type\"")) {
            char *head_end     = a_Misc_stristr(metaBuf, "</head>");
            char *meta_charset = a_Misc_stristr(metaBuf,"charset=");
            /* only trust a charset= that appears before </head> */
            if(meta_charset && head_end && (meta_charset < head_end)) {
               char *value_end;
               meta_charset += sizeof("charset=") - 1;
               /* the value runs up to the next quote, '>' or space */
               value_end = strpbrk(meta_charset, "\"'> ");
               if(value_end && value_end > meta_charset) {
                  charset = g_strndup(meta_charset, value_end - meta_charset);
               } else {
                  DEBUG_HTML_MSG("not found ' or \" in <meta> tag!\n");
               }
            } else {
               DEBUG_HTML_MSG("<meta> outside of <head>!"
                     " or not found charset.\n");
            }
         }
         /* release the header copy on every path, not just on success */
         g_free(metaBuf);
      }
      if(charset) return a_Encoding_fix_charset(charset);
      *usemeta = FALSE;
   }

   /* skip ascii char */
   {
      int old_bufsize = bufsize;
      for(i=0; i < old_bufsize; i++) {
         if(IS_ASCII_CHAR(buf[0])
               && buf[0] != 0x1b) { //for ISO-2022-JP detect
            buf++;
            bufsize--;
         }else
            break;
      }
      if(bufsize == 0) {
         DEBUG_MSG(10, _("all ascii code\n"));
         return g_strdup("ISO-8859-1");
      }
   }

   /* Check UTF-8 code.
    * Each well-formed multi-byte sequence raises the score, each malformed
    * one lowers it by 8; scanning stops once the score is decisive.
    * NOTE(review): the look-ahead reads up to buf[i+4]; this presumably
    * relies on buf being NUL-terminated -- confirm against callers. */
   for (i=0; i<bufsize; i++) {
      if (IS_UTF8_FIRST_CHAR(buf[i])) {
         if (IS_ASCII_CHAR(buf[i+1])) {
            utf_count -= 8;
         } else if (IS_UTF8_2BYTE_FIRST_CHAR(buf[i])) {
            if (IS_UTF8_NON_FIRST_CHAR(buf[i+1])
                  && (IS_UTF8_FIRST_CHAR(buf[i+2])
                     || IS_ASCII_CHAR(buf[i+2])))
               utf_count++;
            else
               utf_count -= 8;
         } else if (IS_UTF8_3BYTE_FIRST_CHAR(buf[i])) {
            if (IS_UTF8_NON_FIRST_CHAR(buf[i+1])
                  && IS_UTF8_NON_FIRST_CHAR(buf[i+2])
                  && (IS_UTF8_FIRST_CHAR(buf[i+3])
                     || IS_ASCII_CHAR(buf[i+3])))
               utf_count+=2;
            else
               utf_count -= 8;
         } else if (IS_UTF8_4BYTE_FIRST_CHAR(buf[i])
               && IS_UTF8_NON_FIRST_CHAR(buf[i+1])
               && IS_UTF8_NON_FIRST_CHAR(buf[i+2])
               && IS_UTF8_NON_FIRST_CHAR(buf[i+3])
               && (IS_UTF8_FIRST_CHAR(buf[i+4])
                  || IS_ASCII_CHAR(buf[i+4])))
            utf_count+=3;
         else utf_count -= 8;
         if (utf_count <= -16 || utf_count >= 16) break;
      }
   }
   if (utf_count >= 9) {
      /* NOTE(review): prints to stdout without a newline; looks like a
       * leftover debug trace. */
      g_print("UTF-8");
      return  g_strdup("UTF-8");
   }

   if(detect_ja_charset) return Encoding_detect_japanese_charset(buf, bufsize);
   else return DW_CHARSET;
}

/*
 * [hack] iconv() with error recovery.
 *
 * Runs iconv() on the given buffers. When an unconvertible sequence
 * (EILSEQ) is hit, "[]" is written to the output in its place, up to two
 * input bytes are skipped, and the conversion is retried until the input is
 * used up, the output has no room for another marker, or a different error
 * occurs.
 *
 * cd               : conversion descriptor.
 * inbuf/inbytesleft, outbuf/outbytesleft : as for iconv(); updated in place.
 * max_char_growth  : currently unused.
 * return           : number of sequences replaced by "[]".
 *
 * A terminating NUL is always stored at *outbuf on return, even when
 * *outbytesleft is 0, so the caller must reserve one byte beyond the size it
 * passes in *outbytesleft.
 */
size_t a_Encoding_iconv(iconv_t *cd,
      char **inbuf, size_t *inbytesleft,
      char **outbuf, size_t *outbytesleft, const gint *max_char_growth) {
   size_t error_count = 0;
   size_t rc;
   int err;

   (void)max_char_growth;

   rc = iconv(cd, inbuf, inbytesleft, outbuf, outbytesleft);
   /* capture errno at once: later calls (e.g. debug output) may change it */
   err = (rc == (size_t)-1) ? errno : 0;

   if (rc == (size_t)-1 && err != EINVAL) {
      DEBUG_MSG (10,_("unable to fully convert between character encodings\n"));
      /* This just skips past unconvertable characters, putting a marker
       * in the output, then retries the conversion. This is a hack,
       * but it seems like the best course of action in the circumstances.
       * The marker needs two output bytes, so stop when fewer remain. */
      while (err == EILSEQ && *inbytesleft > 0 && *outbytesleft >= 2) {
         /* never skip more input than is actually left */
         size_t skip = (*inbytesleft < 2) ? *inbytesleft : 2;

         error_count++;
         (*outbuf)[0]   = '['; /* white square */
         (*outbuf)[1]   = ']';
         *outbuf       += 2;
         *outbytesleft -= 2;
         *inbuf        += skip;
         *inbytesleft  -= skip;

         if (*inbytesleft == 0 || *outbytesleft == 0)
            break;
         rc = iconv(cd, inbuf, inbytesleft, outbuf, outbytesleft);
         err = (rc == (size_t)-1) ? errno : 0;
      }
   }
   (*outbuf)[0] = '\0';
   return error_count;
}


/*
 * Charset translation with a caller-supplied descriptor, for the parts that
 * give priority to speed.
 *
 * it          : open conversion descriptor. NULL or a failed iconv_open()
 *               result ((iconv_t)-1) is treated as "unsupported encoding";
 *               a temporary DILLO_CHARSET -> DILLO_CHARSET descriptor is
 *               used instead.
 * buf         : input bytes.
 * bufsize     : number of input bytes.
 * char_growth : output space reserved per input byte.
 * return      : newly allocated, NUL-terminated result; the caller owns it.
 */
gchar* a_Encoding_Convert_Raw(iconv_t* it, const gchar* buf, size_t bufsize, gint char_growth)
{
   size_t outLeft = bufsize * char_growth;
   size_t outSize = outLeft + 1;   /* +1: room for the terminating NUL */
   char *outStart = (char *)g_malloc(outSize);
   char *in  = (char *)buf;
   char *out = outStart;
   iconv_t *fallback = NULL;

   /* iconv_open() reports failure as (iconv_t)-1, not NULL: test for both. */
   if(!it || it == (iconv_t *)-1) {
      g_warning(_("no support enc\n"));
      fallback = (iconv_t *)iconv_open(DILLO_CHARSET, DILLO_CHARSET);
      it = fallback;
   }

   if(it != (iconv_t *)-1) {
      a_Encoding_iconv(it, &in, &bufsize, &out, &outLeft, &char_growth);
   } else {
      /* not even the fallback could be opened: return an empty string */
      *out = '\0';
   }

   /* the temporary descriptor belongs to this call; release it */
   if(fallback && fallback != (iconv_t *)-1)
      iconv_close(fallback);

   /* shrink the buffer to what was actually written plus the NUL */
   return g_realloc(outStart, outSize - outLeft);
}

/* Charset Transrate. for simple text or string.*/
gchar* a_Encoding_Convert(const gchar* from, const gchar* to, const gchar* buf, size_t bufsize)
{
   iconv_t *it = iconv_open(to,from);
   char *outStart = a_Encoding_Convert_Raw(it, buf, bufsize,
         (strncmp(from, "UTF-8", 5) == 0)? 3 : 2);
   //DEBUG_MSG(10,"%s\n",from);
   iconv_close(it);
   return outStart;
}

#ifndef USE_MBLEN_MACRO
/*
 * mblen()-like helper for UTF-8 only.
 *
 * str    : points at the first byte of a character.
 * return : 2, 3 or 4 when str starts a 2-, 3- or 4-byte sequence whose
 *          following bytes all pass the continuation-byte test;
 *          0 in every other case.
 */
gint a_Encoding_mblen(gchar *str) {
   gint len = 0;

   if (IS_UTF8_3BYTE_FIRST_CHAR(str[0])) {
      if (IS_UTF8_NON_FIRST_CHAR(str[1])
            && IS_UTF8_NON_FIRST_CHAR(str[2]))
         len = 3;
   } else if (IS_UTF8_2BYTE_FIRST_CHAR(str[0])) {
      if (IS_UTF8_NON_FIRST_CHAR(str[1]))
         len = 2;
   } else if (IS_UTF8_4BYTE_FIRST_CHAR(str[0])) {
      if (IS_UTF8_NON_FIRST_CHAR(str[1])
            && IS_UTF8_NON_FIRST_CHAR(str[2])
            && IS_UTF8_NON_FIRST_CHAR(str[3]))
         len = 4;
   }

   return len;
}
#endif

#if 0
/**
 * a_Encoding_count_japanese_word
 *
 * Return the length, in bytes, of the run of same-class EUC characters at
 * the head of the argument string (the count advances two bytes per
 * character). Returns 0 when the string does not start with a byte accepted
 * by any of the EUC tests below, and 2 when the run is longer than
 * MAX_WORD_LENGTH.
 *
 * This function is currently compiled out (#if 0).
 */
gint a_Encoding_count_japanese_word(gchar *str) {
   int wordnum = 0;

   /* run of hiragana */
   if(IS_EUC_HIRA_CHAR(str[0])){
      while(IS_EUC_HIRA_CHAR(str[wordnum])
            && IS_EUC_CHAR(str[1 + wordnum])) {
         wordnum += 2;
      }
   /* run of full-width katakana, which may include the byte pair 0xa1 0xbc */
   } else if(IS_EUC_ZENKANA_CHAR(str[0])
         || (unsigned char)(str[0]) == 0xa1) {
      while((IS_EUC_ZENKANA_CHAR(str[wordnum])
               && IS_EUC_CHAR(str[1 + wordnum]))
            || ((unsigned char)str[wordnum] == 0xa1//long-vowel mark, 1st byte
               && (unsigned char)str[1 + wordnum] == 0xbc)) {//long-vowel mark, 2nd byte
         wordnum += 2;
      }
      if(wordnum == 0) //an ordinary symbol (0xa1 lead byte): one character
         wordnum = 2;
   /* run of kanji */
   } else if(IS_EUC_KANJI_CHAR(str[0])){
      while(IS_EUC_KANJI_CHAR(str[wordnum])
            && IS_EUC_CHAR(str[1 + wordnum])) {
         wordnum += 2;
      }
   /* run of half-width katakana */
   } else if(IS_EUC_HANKANA_CHAR(str[0])){
      while(IS_EUC_HANKANA_CHAR(str[wordnum])
            && IS_EUC_CHAR(str[1 + wordnum])) {
         wordnum += 2;
      }
   /* run of full-width ASCII */
   } else if(IS_EUC_ZENASCII_CHAR(str[0])) {
      while(IS_EUC_ZENASCII_CHAR(str[wordnum])
            && IS_EUC_CHAR(str[1 + wordnum])) {
         wordnum += 2;
      }
   } else if(IS_EUC_CHAR(str[0]))
      wordnum += 2; //any other EUC code: a single character

   /* over-long runs are reported as a single character */
   if(wordnum > MAX_WORD_LENGTH)
      return 2;
   else return wordnum;
}
#endif

/* vim: set ts=3 sw=3 sts=3 expandtab:*/
