##    extract_sentence.rb  ##
#
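#Extracts the Japanese article text from a Wikipedia XML database dump.
#(Description rewritten in English; the original Japanese comment was damaged by an encoding conversion.)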
#Copyright (C) 2006  hanaoka

#This program is free software; you can redistribute it and/or
#modify it under the terms of the GNU General Public License
#as published by the Free Software Foundation; either version 2
#of the License, or (at your option) any later version.

#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#GNU General Public License for more details.

#You should have received a copy of the GNU General Public License
#along with this program; if not, write to the Free Software
#Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

#
#Usage:
#ruby extract_sentence.rb < [wikipedia_xml]
#

require 'nkf'
require 'rexml/document'
include REXML
# Ruby 1.8 multibyte switch: "e" selects EUC, matching the EUC-JP text that
# the main loop produces with NKF.nkf('-eW', ...).
# NOTE(review): later Ruby versions no longer honour $KCODE -- confirm the target interpreter.
$KCODE = "e"



# Length threshold used by extract: a cleaned line is kept only when its
# stripped String#size is strictly greater than this value.
TEXTMIN = 10

# Returns the content of the <text ...> ... </text> element found in str,
# or "" when str contains no such element.
#
# The opening tag must start a line (optionally preceded by whitespace).
# Both wildcards are greedy and may span newlines, so when several
# <text> elements are present the content of the last one is returned.
def get_text(str)
  found = /^\s*\<text.*\>(.*)\<\/text\>/m.match(str)
  found ? found[1] : ""
end

# Deletes every balanced "{{ ... }}" span, nested spans included, from text.
#
# text is modified in place and is also the return value. Processing always
# works on the first "{{" still present; if that opener never reaches its
# matching "}}", the loop ends and the rest of the string is left untouched.
def remove_template(text)
  opener = /\{\{/
  boundary = /(\{\{)|(\}\})/
  first = text.index(opener)
  cursor = first
  depth = 1
  until first.nil?
    hit = text.index(boundary, cursor + 2)
    break if hit.nil?
    # The boundary at hit is either another "{{" (go deeper) or a "}}" (come back up).
    depth += (text[hit, 2] == "{{") ? 1 : -1
    cursor = hit
    next unless depth == 0
    # Balanced: cut from the outermost "{{" through the two characters of the closing "}}".
    text.slice!(first..(hit + 1))
    first = text.index(opener)
    cursor = first
    depth = 1
  end
  text
end

# Deletes every balanced wiki table ("{|" ... "|}"), nested tables included.
#
# text is modified in place and returned. If the first remaining "{|" has no
# matching "|}", the loop stops and nothing further is removed.
def remove_table(text)
  head = text.index(/\{\|/)
  pos = head
  level = 1
  while head
    found = text.index(/(\{\|)|(\|\})/, pos + 2)
    break unless found
    if text[found, 2] == "{|"
      level += 1 # a nested table opens
    else
      level -= 1 # a table closes
    end
    pos = found
    if level.zero?
      # Remove from the outermost "{|" through the end of its "|}".
      text.slice!(head..(found + 1))
      head = text.index(/\{\|/)
      pos = head
      level = 1
    end
  end
  text
end

# Deletes image links such as "[[Image:foo.png|thumb|caption]]" from text,
# including any links nested inside the caption.
#
# text is modified in place and returned. A link counts as an image link when
# "image" (any case, optionally preceded by ":") directly follows "[[".
# If an image link is never closed, the loop stops and the rest is left as is.
#
# NOTE(review): the opener used to read (()|(image)). The empty alternative
# matched every "[[", so all wiki links were deleted together with their text.
# It looks like a localized namespace name lost in an encoding conversion;
# add that name back as an extra alternative once it is known.
def remove_image(text)
  image_open = /\[\[\:?image/i
  bracket = /(\[\[)|(\]\])/
  first = text.index(image_open)
  cursor = first
  depth = 1
  while first
    hit = text.index(bracket, cursor + 2)
    break unless hit
    # Any "[[" inside the image link (e.g. a link in the caption) nests one level deeper.
    depth += (text[hit, 2] == "[[") ? 1 : -1
    cursor = hit
    next unless depth.zero?
    # Cut from the image opener through the "]]" that balances it.
    text.slice!(first..(hit + 1))
    first = text.index(image_open)
    cursor = first
    depth = 1
  end
  text
end

# Deletes every <gallery> ... </gallery> block (tag names in any case) from text.
#
# text is modified in place and returned. The opening tag may carry
# attributes (e.g. <gallery widths="120px">); previously only a bare
# "<gallery>" was recognised, so galleries with attributes were left in the
# text. If a gallery is never closed, the loop stops and the rest is kept.
def remove_gallery(text)
  open_tag = /<gallery(?:\s[^>]*)?>/i
  any_tag = /<gallery(?:\s[^>]*)?>|<\/gallery>/i
  first = text.index(open_tag)
  cursor = first
  depth = 1
  while first
    hit = text.index(any_tag, cursor + 1)
    break unless hit
    # A "/" right after "<" means the tag found at hit is the closing tag.
    depth += (text[hit + 1, 1] == "/") ? -1 : 1
    cursor = hit
    next unless depth.zero?
    # "</gallery>" is 10 characters long, so hit + 9 is its final ">".
    text.slice!(first..(hit + 9))
    first = text.index(open_tag)
    cursor = first
    depth = 1
  end
  text
end

# Deletes every <math> ... </math> block (tag names in any case) from text.
#
# text is modified in place and returned. Nested <math> tags are balanced
# before cutting; an unclosed <math> ends the loop and leaves the rest alone.
def remove_math(text)
  open_tag = /<math>/i
  any_tag = /<math>|<\/math>/i
  head = text.index(open_tag)
  pos = head
  depth = 1
  while head
    hit = text.index(any_tag, pos + 1)
    break unless hit
    # The tag at hit is the closing one exactly when "<" is followed by "/".
    closing = text[hit + 1, 1] == "/"
    depth += closing ? -1 : 1
    pos = hit
    if depth == 0
      # "</math>" is 7 characters long, so hit + 6 is its final ">".
      text.slice!(head..(hit + 6))
      head = text.index(open_tag)
      pos = head
      depth = 1
    end
  end
  text
end

# Deletes every <code> ... </code> block (tag names in any case) from text.
#
# text is modified in place and returned. Nested <code> tags are balanced
# before cutting; an unclosed <code> ends the loop and leaves the rest alone.
def remove_code(text)
  opening = /<code>/i
  either = /<code>|<\/code>/i
  block_start = text.index(opening)
  last_tag = block_start
  pending = 1
  until block_start.nil?
    tag_pos = text.index(either, last_tag + 1)
    break if tag_pos.nil?
    # "<c..." opens another level, "</..." closes one.
    pending += (text[tag_pos + 1, 1] == "/") ? -1 : 1
    last_tag = tag_pos
    next unless pending.zero?
    # "</code>" is 7 characters long, so tag_pos + 6 is its final ">".
    text.slice!(block_start..(tag_pos + 6))
    block_start = text.index(opening)
    last_tag = block_start
    pending = 1
  end
  text
end

# Deletes every <nowiki> ... </nowiki> block (tag names in any case) from text.
#
# text is modified in place and returned. Nested <nowiki> tags are balanced
# before cutting; an unclosed <nowiki> ends the loop and leaves the rest alone.
def remove_nowiki(text)
  opening = /<nowiki>/i
  either_tag = /<nowiki>|<\/nowiki>/i
  from = text.index(opening)
  at = from
  open_count = 1
  loop do
    break if from.nil?
    tag_at = text.index(either_tag, at + 1)
    break if tag_at.nil?
    if text[tag_at, 2] == "</"
      open_count -= 1 # closing tag
    else
      open_count += 1 # another opening tag
    end
    at = tag_at
    next if open_count != 0
    # "</nowiki>" is 9 characters long, so tag_at + 8 is its final ">".
    text.slice!(from..(tag_at + 8))
    from = text.index(opening)
    at = from
    open_count = 1
  end
  text
end

# Turns the XML of one <page> element into cleaned plain text.
#
# page_text - String holding the lines read between <page> and </page>.
#
# Returns "" when the page text has a line starting with "#REDIRECT".
# Otherwise returns the text with wiki and HTML markup stripped, one stripped
# line per output line, keeping only lines whose stripped size exceeds TEXTMIN.
#
# Changes from the previous version:
# * String#each (gone since Ruby 1.9) is replaced by String#each_line.
# * The "drop lines with leftover < or >" step used [^$]*, a character class
#   that also matches newlines, so one stray bracket erased the whole text;
#   it is now limited to the offending line.
# * HTML comments containing ">" are now removed as well.
# * The unnamed-external-link pattern was missing its "*" quantifier.
def extract(page_text)
  text = get_text(page_text)
  return "" if text =~ /^#REDIRECT/i

  # Strip leading and trailing whitespace from every line.
  stripped = ""
  text.each_line { |line| stripped << line.strip + "\n" }
  text = stripped

  # Entities are decoded twice: some text is apparently escaped a second
  # time (e.g. "&amp;lt;" instead of "&lt;").
  text = Text::unnormalize(text)
  text = Text::unnormalize(text)

  text = remove_template(text) # {{ ... }} templates
  text = remove_table(text)    # {| ... |} tables
  text = text.gsub(/\[\[\:?Category\:[^\]]*\]\]/i, "") # category links
  # NOTE(review): the first alternative below is empty, most likely a
  # localized namespace name lost in an encoding conversion. As written the
  # pattern also deletes every link that starts with "[[:".
  text = text.gsub(/\[\[(()|(special))\:[^\]]*\]\]/i, "") # special-namespace links
  text = text.gsub(/\[http\:\/\/[^ \]]*\s*([^\]]*)\]/i) { $1 } # labelled external link -> label
  text = text.gsub(/\[http\:\/\/[^\]]*\]/i, "") # unlabelled external link
  text = text.gsub(/http\:\/\/[^\s]*/i, "") # bare URL
  text = remove_image(text) # image links
  text = text.gsub(/\[\[([^\:\#\|\]]+)\]\]/) { $1 } # simple link -> its title
  text = text.gsub(/\[\[\#([^\:\|\]]+)\]\]/) { $1 } # same-page section link -> section name
  text = text.gsub(/\[\[[^\#\:]*\#([^\:\|\]]+)\]\]/) { $1 } # other-page section link -> section name
  text = text.gsub(/\[\[([^\:]*\:)?([^ \]]*)( \([^\)]*\))?\s\|\]\]/) { $2 } # link with empty label -> bare title
  text = text.gsub(/\[\[[^\|]*\|([^\]]*)\]\]/) { $1 } # piped link -> label
  text = text.gsub(/^=+ *([^=]+)=+/) { $1 + "\n" } # headings
  text = text.gsub(/^[ \*\#\:]+\s*/, "") # list and indent markers
  text = text.gsub(/^\s*\;\s*([^:\n]*)\s*\:\s*([^\n]*)\n/) { $1 + "\n" + $2 + "\n" } # "; term : definition"
  text = text.gsub(/^\s*\;\s*/, "") # "; term" without a definition
  text = text.gsub(/''+/, "") # emphasis quotes
  text = text.gsub(/^[ \:]*/, "") # remaining leading spaces and colons
  text = text.gsub(/\~\~+/, "") # signatures
  text = text.gsub(/((ISBN)|(RFC)) [0-9X]*/i, "") # ISBN / RFC references
  # A line consisting only of a namespaced link is dropped entirely.
  text = text.gsub(/^\s*\[\[[^\:]*\:[^\]]*\]\]\s*$/, "")
  text = text.gsub(/\[\[[^\:]*\:([^\]]*)\]\]/) { $1 } # in-text namespaced link -> part after ":"

  # Non-greedy and multi-line, so comments may contain ">" and newlines.
  text = text.gsub(/<\!\-\-.*?\-\->/m, "")
  text = text.gsub(/<\s*br[^>]*\/?>/i, "\n") # <br> becomes a line break
  # Strip simple formatting tags but keep their content. Order is kept from
  # the original; note that "s" and "u" also match longer tag names that
  # start with those letters.
  %w[center font div tt small s u em].each do |tag|
    text = text.gsub(/<\/?\s*#{tag}[^>]*\>/i, "")
  end
  text = remove_gallery(text)
  text = remove_math(text)
  text = remove_code(text)
  text = remove_nowiki(text)
  # NOTE(review): the pattern below is empty, so a newline is inserted at
  # every character boundary and no line can then pass the TEXTMIN filter.
  # The damaged original comment suggests this step split the text into
  # sentences at a delimiter character that was lost in an encoding
  # conversion. Restore the delimiter from a clean copy of the file.
  text = text.gsub(//, "\n")
  # Drop only the lines that still contain a stray "<" or ">".
  text = text.gsub(/^[^\n]*[<>][^\n]*$/, "")
  text = text.gsub(/^\-*/, "") # leading dashes (horizontal rules)

  # Keep only lines that are long enough after stripping.
  kept = ""
  text.each_line("\n") do |line|
    body = line.strip
    kept << body + "\n" if body.size > TEXTMIN
  end
  kept
end

# Main loop: read the dump from standard input one line at a time.
# Each line is converted by NKF with '-eW' (UTF-8 input, EUC-JP output).
# Lines after a <page ...> line are accumulated in data; when </page> is
# seen, the accumulated text is passed to extract, the result is printed,
# and the buffer is reset. The <page> and </page> lines themselves are not
# stored.
data = ""
flag = false
$stdin.each_line do |line|
  line = NKF.nkf('-eW', line)
  case line
  when /\<\/page\>/
    print extract(data)
    flag = false
    data = ""
  when /\<page.*\>/
    flag = true
  else
    data << line if flag
  end
end
