require 'cgi'

module HTML
  # Whitelist-based HTML sanitizer.
  #
  # Sanitize.sanitize tokenizes its input into comments, tags, character
  # references and text.  Tags and attributes listed in a rule table (Basic by
  # default) are re-emitted in a normalized form; everything else is escaped
  # with CGI.escapeHTML so that it is displayed as literal text.
  module Sanitize
    # Entity names (without the leading "&" and trailing ";") that are passed
    # through unchanged when they appear as named character references.
    CharRefs = %w(
      AElig   Aacute  Acirc   Agrave  Alpha   Aring   Atilde  Auml
      Beta    Ccedil  Chi     Dagger  Delta   ETH     Eacute  Ecirc
      Egrave  Epsilon Eta     Euml    Gamma   Iacute  Icirc   Igrave
      Iota    Iuml    Kappa   Lambda  Mu      Ntilde  Nu      OElig
      Oacute  Ocirc   Ograve  Omega   Omicron Oslash  Otilde  Ouml
      Phi     Pi      Prime   Psi     Rho     Scaron  Sigma   THORN
      Tau     Theta   Uacute  Ucirc   Ugrave  Upsilon Uuml    Xi
      Yacute  Yuml    Zeta    aacute  acirc   acute   aelig   agrave
      alefsym alpha   amp     and     ang     apos    aring   asymp
      atilde  auml    bdquo   beta    brvbar  bull    cap     ccedil
      cedil   cent    chi     circ    clubs   cong    copy    crarr
      cup     curren  dArr    dagger  darr    deg     delta   diams
      divide  eacute  ecirc   egrave  empty   emsp    ensp    epsilon
      equiv   eta     eth     euml    euro    exist   fnof    forall
      frac12  frac14  frac34  frasl   gamma   ge      gt      hArr
      harr    hearts  hellip  iacute  icirc   iexcl   igrave  image
      infin   int     iota    iquest  isin    iuml    kappa   lArr
      lambda  lang    laquo   larr    lceil   ldquo   le      lfloor
      lowast  loz     lrm     lsaquo  lsquo   lt      macr    mdash
      micro   middot  minus   mu      nabla   nbsp    ndash   ne
      ni      not     notin   nsub    ntilde  nu      oacute  ocirc
      oelig   ograve  oline   omega   omicron oplus   or      ordf
      ordm    oslash  otilde  otimes  ouml    para    part    permil
      perp    phi     pi      piv     plusmn  pound   prime   prod
      prop    psi     quot    rArr    radic   rang    raquo   rarr
      rceil   rdquo   real    reg     rfloor  rho     rlm     rsaquo
      rsquo   sbquo   scaron  sdot    sect    shy     sigma   sigmaf
      sim     spades  sub     sube    sum     sup     sup1    sup2
      sup3    supe    szlig   tau     there4  theta   thetasym thinsp
      thorn   tilde   times   trade   uArr    uacute  uarr    ucirc
      ugrave  uml     upsih   upsilon uuml    weierp  xi      yacute
      yen     yuml    zeta    zwj     zwnj
    )

    # Attribute value patterns shared by the Basic rule table.

    # One or more ASCII letters, digits, underscores or spaces.
    WordsPattern = /\A[a-zA-Z0-9_ ]+\z/
    # One or more ASCII digits.
    NumberPattern = /\A[0-9]+\z/
    # The alternation is grouped so that \A and \z apply to every alternative,
    # not only to the first and last one.
    AlignPattern = /\A(?:left|center|justify|middle|right)\z/
    ValignPattern = /\A(?:top|center|middle|bottom)\z/
    ImageAlignPattern = /\A(?:top|middle|bottom|left|right)\z/
    # Image location: either an http/https/ftp URL or a relative reference
    # (no ":" before the first "/", "?" or "#", which rules out any other
    # scheme such as "javascript:"), ending in a known image extension.
    ImageSourcePattern =
      /\A(?:(?i:https?|ftp):|(?![^\/?#]*:)).+\.(gif|jpe?g|png|mng)\z/m

    # Default rule table: tag name => { attribute name => pattern the decoded
    # attribute value must match }.  The special key :empty_tag marks tags
    # that are emitted in self-closing form and never pushed on the open-tag
    # stack.
    Basic = {
      'a' => {
        'class'    => WordsPattern,
        'href'     => /\A(http:|https:|ftp:|mailto:)/,
        'target'   => WordsPattern,
      },
      'blockquote' => { 'class' => WordsPattern },
      'dl' => { 'class' => WordsPattern },
      'dt' => { 'class' => WordsPattern },
      'dd' => { 'class' => WordsPattern },
      'h1' => { 'class' => WordsPattern },
      'h2' => { 'class' => WordsPattern },
      'h3' => { 'class' => WordsPattern },
      'h4' => { 'class' => WordsPattern },
      'h5' => { 'class' => WordsPattern },
      'h6' => { 'class' => WordsPattern },
      'p'  => { 'class' => WordsPattern },
      'ul' => { 'class' => WordsPattern },
      'ol' => { 'class' => WordsPattern },
      'li' => { 'class' => WordsPattern },
      'table' => {
        'border'      => NumberPattern,
        'width'       => NumberPattern,
        'cellpadding' => NumberPattern,
        'cellspacing' => NumberPattern,
        'class'       => WordsPattern,
      },
      'thead' => {
        'align'    => AlignPattern,
        'valign'   => ValignPattern,
        'class'    => WordsPattern,
      },
      'tfoot' => {
        'align'    => AlignPattern,
        'valign'   => ValignPattern,
        'class'    => WordsPattern,
      },
      'tbody' => {
        'align'    => AlignPattern,
        'valign'   => ValignPattern,
        'class'    => WordsPattern,
      },
      'tr' => {
        'align'    => AlignPattern,
        'valign'   => ValignPattern,
        'class'    => WordsPattern,
      },
      'th' => {
        'nowrap'   => /\Anowrap\z/,
        'width'    => NumberPattern,
        'align'    => AlignPattern,
        'valign'   => ValignPattern,
        'colspan'  => NumberPattern,
        'rowspan'  => NumberPattern,
        'class'    => WordsPattern,
      },
      'td' => {
        'nowrap'   => /\Anowrap\z/,
        'width'    => NumberPattern,
        'align'    => AlignPattern,
        'valign'   => ValignPattern,
        'colspan'  => NumberPattern,
        'rowspan'  => NumberPattern,
        'class'    => WordsPattern,
      },
      'col' => {
        :empty_tag => true,
        'width'    => NumberPattern,
        'align'    => AlignPattern,
        'valign'   => ValignPattern,
        'class'    => WordsPattern,
      },
      'colgroup' => {
        'width'    => NumberPattern,
        'align'    => AlignPattern,
        'valign'   => ValignPattern,
        'class'    => WordsPattern,
      },
      'pre' => { 'class' => WordsPattern },
      'div' => { 'class' => WordsPattern },
      'hr' => {
        :empty_tag => true,
        'class'    => WordsPattern,
      },
      'del' => { 'class' => WordsPattern },
      'ins' => { 'class' => WordsPattern },
      'img' => {
        :empty_tag => true,
        'id'       => /\A[a-zA-Z0-9_]+\z/,
        'class'    => WordsPattern,
        'src'      => ImageSourcePattern,
        'align'    => ImageAlignPattern,
        'border'   => NumberPattern,
        'width'    => NumberPattern,
        'height'   => NumberPattern,
        'alt'      => /\A.+\z/,
        'title'    => /\A.+\z/,
      },
      'span'   => { 'class' => WordsPattern },
      'b'      => { 'class' => WordsPattern },
      'i'      => { 'class' => WordsPattern },
      'u'      => { 'class' => WordsPattern },
      's'      => { 'class' => WordsPattern },
      'strong' => { 'class' => WordsPattern },
      'em'     => { 'class' => WordsPattern },
      'code'   => { 'class' => WordsPattern },
      'br' => {
        :empty_tag => true
      },
      'ruby' => { 'class' => WordsPattern },
      'rbc'  => { 'class' => WordsPattern },
      'rtc'  => { 'class' => WordsPattern },
      'rb'   => { 'class' => WordsPattern },
      'rp'   => { 'class' => WordsPattern },
      'rt' => {
        'rbspan'   => NumberPattern,
        'class'    => WordsPattern,
      },
    }

    # Highest code point accepted in a numeric character reference.
    MaxCodePoint = 0x10ffff

    # Tokenizer used by sanitize.  Capture groups, in order: comment, close
    # tag, open tag, whole character reference, reference name, text run, and
    # finally a lone "<" or "&" that starts none of the other tokens (without
    # this last alternative String#scan would skip such characters and they
    # would vanish from the output).
    TokenPattern =
      /(<!--.*?-->)|(<\/.+?>)|(<.+?>)|(&([a-zA-Z0-9]+|\#[0-9]+|\#x[0-9a-fA-F]+);)|([^<&]+)|([<&])/m

    # Sanitizes an HTML fragment against a rule table.
    #
    # * Comments are removed.
    # * A tag present in +rule+ is re-emitted with only those double-quoted
    #   attributes whose decoded value matches the rule's pattern; tags marked
    #   :empty_tag are written as "<tag/>".
    # * A close tag is kept only when it closes the most recently opened tag.
    # * Known named references and in-range numeric references are kept;
    #   out-of-range numeric references are dropped.
    # * Everything else (unknown tags, mismatched close tags, text, stray "<"
    #   and "&") is escaped with CGI.escapeHTML.
    # * Tags still open at the end of the input are closed, innermost first,
    #   so the result is always balanced.
    #
    # html:: the String to sanitize.
    # rule:: Hash of tag name => attribute rules (defaults to Basic).
    #
    # Returns a new String.
    def self.sanitize(html, rule = Basic)
      open_tags = []
      out = ''.dup
      html.scan(TokenPattern) do |comment, close_tag, open_tag, charref, ref_name, text, stray|
        next if comment

        out << if close_tag
                 close_tag_markup(close_tag, open_tags)
               elsif open_tag
                 open_tag_markup(open_tag, rule, open_tags)
               elsif charref
                 charref_markup(charref, ref_name)
               else
                 CGI.escapeHTML(text || stray)
               end
      end
      open_tags.reverse_each { |tag| out << '</' << tag << '>' }
      out
    end

    # Returns the output for a close-tag token: a normalized "</name>" when it
    # closes the innermost open tag (which is popped from +open_tags+),
    # otherwise the escaped token.
    def self.close_tag_markup(token, open_tags)
      md = /\A<\/([a-zA-Z0-9:_]+)/.match(token)
      return CGI.escapeHTML(token) unless md && md[1] == open_tags.last

      open_tags.pop
      "</#{md[1]}>"
    end

    # Returns the output for an open-tag token: the rebuilt tag with its
    # permitted attributes when +rule+ knows the tag, otherwise the escaped
    # token.  Non-empty tags are pushed onto +open_tags+.
    def self.open_tag_markup(token, rule, open_tags)
      # \A keeps a token such as "<<b>" from being read as "<b>" with its
      # leading "<" silently discarded.
      md = /\A<([a-zA-Z0-9:_]+)(.*)>/m.match(token)
      tag_rule = md && rule[md[1]]
      return CGI.escapeHTML(token) unless tag_rule

      tag = md[1]
      attrs = {}
      md[2].scan(/([a-zA-Z0-9:_]+)\s*=\s*"([^"]*)"/m) do |name, raw_value|
        # Validate the decoded value so entity-encoded payloads cannot slip
        # past the patterns; it is re-escaped on output below.
        value = CGI.unescapeHTML(raw_value)
        pattern = tag_rule[name]
        attrs[name] = value if pattern && pattern.match(value)
      end

      markup = "<#{tag}"
      attrs.each do |name, value|
        markup << ' ' << name << '="' << CGI.escapeHTML(value) << '"'
      end
      if tag_rule[:empty_tag]
        markup << '/>'
      else
        open_tags << tag
        markup << '>'
      end
      markup
    end

    # Returns the output for a character reference token.  +name+ is the part
    # between "&" and ";".
    def self.charref_markup(token, name)
      if (md = /\A\#([0-9]+)\z/.match(name))
        md[1].to_i <= MaxCodePoint ? token : ''
      elsif (md = /\A\#x([0-9a-fA-F]+)\z/.match(name))
        md[1].hex <= MaxCodePoint ? token : ''
      elsif CharRefs.include?(name)
        token
      elsif CharRefs.include?(name.downcase)
        # Accept e.g. "&AMP;" by normalizing it to the known lowercase name.
        token.downcase
      else
        CGI.escapeHTML(token)
      end
    end

    private_class_method :close_tag_markup, :open_tag_markup, :charref_markup
  end # Sanitize
end # HTML
