#include "encoding_sensor.h"

#include <cstring>

#include "../../transcoders/encoding_name.h"

using namespace aka2;

namespace {

  struct char_mark ucs2be_cr =        { "\x00\x0d", 2 };
  struct char_mark ucs2be_lf =        { "\x00\x0a", 2 };
  struct char_mark ucs2be_space =     { "\x00\x20", 2 };
  struct char_mark ucs2be_openquote = { "\x00<",    2 };
  struct char_mark ucs2le_cr =        { "\x0d\x00", 2 };
  struct char_mark ucs2le_lf =        { "\x0a\x00", 2 };
  struct char_mark ucs2le_space =     { "\x20\x00", 2 };
  struct char_mark ucs2le_openquote = { "<\x00",    2 };

  struct char_mark ucs4be_cr =        { "\x00\x00\x00\x0d", 4 };
  struct char_mark ucs4be_lf =        { "\x00\x00\x00\x0a", 4 };
  struct char_mark ucs4be_space =     { "\x00\x00\x00\x20", 4 };
  struct char_mark ucs4be_openquote = { "\x00\x00\x00<",    4 };

  struct char_mark ucs4le_cr =        { "\x0d\x00\x00\x00", 4 };
  struct char_mark ucs4le_lf =        { "\x0a\x00\x00\x00", 4 };
  struct char_mark ucs4le_space =     { "\x20\x00\x00\x00", 4 };
  struct char_mark ucs4le_openquote = { "<\x00\x00\x00",    4 };

  /**
   * Tell whether the first mark.length_ bytes at chars are identical to the
   * bytes of the mark.
   *
   * The comparison is byte-wise (memcmp).  A string comparison such as
   * strncmp must not be used here: it stops at the first NUL byte both
   * inputs share, so a mark like "\x00\x0d" would match every buffer that
   * merely starts with a zero byte.
   *
   * @param chars buffer to test; the caller must make sure that at least
   *              mark.length_ bytes are readable.
   * @param mark  byte pattern (mark_) and its length in bytes (length_).
   * @return true when all mark.length_ bytes are equal.
   */
  bool compare_bytes(const char *chars, const struct char_mark &mark) {
    return std::memcmp(chars, mark.mark_, mark.length_) == 0;
  }

  /**
   * Advance over a run of ' ' (0x20) characters.
   *
   * @param ch  position to start from.
   * @param end one past the last readable byte; never dereferenced.
   * @return the first position in [ch, end) that does not hold a space,
   *         or end when only spaces remain (or when ch == end on entry).
   */
  const char *skip_spaces(const char *ch, const char *end) {
    // The range check must come first so that *ch is never read at end.
    while ((ch < end) && (*ch == ' ')) {
      ++ch;
    }
    return ch;
  }
}

/**
 * Guess the encoding of a document from its first bytes.
 *
 * The checks are made in this order:
 *   1. a byte-order mark at the start of the buffer
 *      (the *_bom marks are declared outside this file);
 *   2. a leading CR, LF, space or '<' stored as one UCS-4 / UCS-2 unit;
 *   3. the encoding pseudo-attribute of an XML declaration, reading the
 *      buffer as a sequence of 1-byte characters.
 *
 * @param str    first bytes of the document; need not be NUL-terminated.
 * @param length number of readable bytes at str.
 * @return "UCS-4", "UTF-16", "UTF-8" or "UTF-7" for a BOM;
 *         "UCS-4BE", "UCS-4LE", "UCS-2BE" or "UCS-2LE" for a leading
 *         wide character; the name written in the XML declaration when it
 *         is well-formed; "UTF-8" when there is no XML declaration, no
 *         encoding pseudo-attribute, or the declaration is malformed;
 *         an empty string when the buffer is too short to decide
 *         (no '<', or the declaration is cut off).
 */
std::string aka2::sense_encoding(const char *str, size_t length) {

  const char *end = str + length;

  // Find BOMs.
  if (length >= 4) { // Try to check UCS-4
    if (compare_bytes(str, ucs4be_bom))
      return "UCS-4";
    if (compare_bytes(str, ucs4le_bom))
      return "UCS-4";
  }

  if (length >= 2) { // Try to check UCS-2
    if (compare_bytes(str, ucs2be_bom))
      return "UTF-16";
    if (compare_bytes(str, ucs2le_bom))
      return "UTF-16";
  }

  if (length >= 3) { // Try to check UTF-8 char_mark.
    if (compare_bytes(str, utf8_bom))
      return "UTF-8";
  }

  if (length >= 5) { // Try to check UTF-7 char_mark.
    if (compare_bytes(str, utf7_bom))
      return "UTF-7";
  }

  // Find unicode-specific characters.
  if (length >= 4) { // Try to check UCS-4
    if (compare_bytes(str, ucs4be_cr) ||
        compare_bytes(str, ucs4be_lf) ||
        compare_bytes(str, ucs4be_space) ||
        compare_bytes(str, ucs4be_openquote))
      return "UCS-4BE";

    if (compare_bytes(str, ucs4le_cr) ||
        compare_bytes(str, ucs4le_lf) ||
        compare_bytes(str, ucs4le_space) ||
        compare_bytes(str, ucs4le_openquote))
      return "UCS-4LE";
  }

  if (length >= 2) { // Try to check UCS-2
    if (compare_bytes(str, ucs2be_cr) ||
        compare_bytes(str, ucs2be_lf) ||
        compare_bytes(str, ucs2be_space) ||
        compare_bytes(str, ucs2be_openquote))
      return "UCS-2BE";

    if (compare_bytes(str, ucs2le_cr) ||
        compare_bytes(str, ucs2le_lf) ||
        compare_bytes(str, ucs2le_space) ||
        compare_bytes(str, ucs2le_openquote))
      return "UCS-2LE";
  }

  // char_mark search finished.
  // Any UCS-2/UCS-4 encoding must have been detected by now.
  // From here on the buffer is assumed to consist of 1-byte units.

  // Check if the document has an XML declaration.

  const void *ptrpos = memchr(str, '<', end - str);

  if (ptrpos == 0)
    return std::string(); // Could not tell encoding.

  str = static_cast<const char*>(ptrpos);
  ++str; // points to the character after '<'.

  // '<' may be near the end of the given character sequence.
  // If fewer than 4 characters are left for the following "?xml",
  // the XML declaration is not (completely) in the buffer.  Give up.
  if ((end - str) < 4)
    return std::string();

  // Next, try to find "?xml".
  if (strncmp(str, "?xml", 4) != 0) {
    // The markup starting at '<' is not an XML declaration,
    // so the document has none.  Return "UTF-8".
    return "UTF-8";
  }

  // We are at the beginning of the XML declaration.
  // Move to the character after "?xml".
  str += 4;

  // Find the '?' that closes the XML declaration.
  ptrpos = memchr(str, '?', end - str);
  if (ptrpos == 0) {
    // Closing position of the XML declaration not found.
    // Not enough characters in the buffer.  Give up.
    return std::string();
  }

  // decl_end points at the closing '?'.  It bounds every further lookup,
  // so nothing behind the declaration is ever examined.
  const char *decl_end = static_cast<const char*>(ptrpos);
  const char *closepos = decl_end + 1; // the character after '?'.

  // The next character must be '>' to close the XML declaration.
  // Check that the buffer is long enough to hold it.
  if (closepos == end) {
    // '?' is the last character of the buffer.  Give up.
    return std::string();
  }

  if (*closepos != '>') {
    // Wrong character sequence.
    // The XML parser will report it as a parse error.
    // Just return the encoding as "UTF-8".
    return "UTF-8";
  }

  // The XML declaration is completely inside the buffer.
  // Find "encoding" inside [str, decl_end).
  // The remaining length is tested before each comparison; a declaration
  // body shorter than the keyword (e.g. "<?xml?>") simply yields no match
  // instead of moving the pointer beyond the buffer.
  static const char keyword[] = "encoding";
  const size_t keyword_length = sizeof(keyword) - 1;

  const char *keyword_pos = 0;
  for (const char *pos = str;
       static_cast<size_t>(decl_end - pos) >= keyword_length;
       ++pos) {
    if (strncmp(pos, keyword, keyword_length) == 0) {
      keyword_pos = pos;
      break;
    }
  }

  if (keyword_pos == 0) {
    // "encoding=" is not specified.
    return "UTF-8";
  }

  // Move behind "encoding".
  str = keyword_pos + keyword_length;

  // Skip spaces between "encoding" and "=".
  str = skip_spaces(str, decl_end);
  if (str == decl_end) {
    // "encoding" without "=".  Wrong sequence.
    return "UTF-8";
  }

  // "=" must be here.
  if (*str != '=') {
    // Wrong sequence.
    return "UTF-8";
  }

  // Skip spaces between "=" and the quoted encoding name.
  ++str;
  str = skip_spaces(str, decl_end);
  if (str == decl_end) {
    // "encoding=" without a value.  Wrong sequence.
    return "UTF-8";
  }

  if ((*str != '\'') && (*str != '\"')) {
    // Wrong format.
    // The parser will report it as a parse error.
    // Just return "UTF-8".
    return "UTF-8";
  }

  // Remember which quote character opened the encoding name.
  const char encoding_quote = *str;

  // The encoding name starts right behind the opening quote.
  ++str;

  // Find the end of the encoding name.
  // It must be terminated by the same quote character, before the '?'.
  const void *encoding_end_ptr = memchr(str, encoding_quote, decl_end - str);
  if (encoding_end_ptr == 0) {
    // The quoted value is not closed inside the declaration.
    // The parser will report it as a parse error.
    // Just return "UTF-8".
    return "UTF-8";
  }

  const char *endquote = static_cast<const char *>(encoding_end_ptr);

  size_t encoding_name_length = endquote - str;
  std::string encoding_name = std::string(str, encoding_name_length);

  // The encoding name may only consist of ASCII letters, digits,
  // '-', '_' and '.'.
  for (std::string::size_type pos = 0; pos < encoding_name.size(); ++pos) {
    const char ch = encoding_name[pos];
    if (('A' <= ch) && (ch <= 'Z'))
      continue;
    if (('a' <= ch) && (ch <= 'z'))
      continue;
    if (('0' <= ch) && (ch <= '9'))
      continue;
    if ((ch == '-') || (ch == '_') || (ch == '.'))
      continue;
    // Wrong encoding name.
    // Tell the parser to try parsing as UTF-8.
    return "UTF-8";
  }
  return encoding_name;
}
