#!/usr/bin/perl

use strict;

use Text::ChaSen;

sub main {

  open(INDEX, "< index.txt");
  my @index = <INDEX>;
  close(INDEX);
  open(OUT, "> morphologic.txt");

  my $res = Text::ChaSen::getopt_argv($0, '-j', '-F', '%m\t%M\t%y\t%P\t\n');

  foreach my $index (@index){
    my $source = &extract_text($index);
    my $sentence = ParseWiki::parse($source);
    foreach (@$sentence){
      my $str = Text::ChaSen::sparse_tostr($_);
      print OUT $str;
    }
  }

}

sub extract_text {

  my $index = shift;

  local $/;
  open(IN, "< wikipedia/$index");

  my $page = <IN>;

  close(IN); 
  my $text;
  if($page =~ m/<text xml:space="preserve">([^<>]*)<\/text>/gis){
    $text = $1;
  }

  return $text;
}

&main();


package ParseWiki;

use strict;

our $eucpre = qr{(?<!\x8F)};
our $eucpost = qr{
  (?=
   (?:[\xA1-\xFE][\xA1-\xFE])* # JIS X 0208  0ʸʾ³
   (?:[\x00-\x7F\x8E\x8F]|\z)  # ASCII, SS2, SS3 ޤϽü
   )
  }x;

our $ascii = '[\x00-\x7F]'; # 1Х EUC-JPʸ
our $twoBytes = '(?:[\x8E\xA1-\xFE][\xA1-\xFE])'; # 2Х EUC-JPʸ
our $threeBytes = '(?:\x8F[\xA1-\xFE][\xA1-\xFE])'; # 3Х EUC-JPʸ
our $character = "(?:$twoBytes|$threeBytes)"; # EUC-JPʸ

sub parse {

  my $source = shift;

  my @lines = split /\n/, $source;
  undef $source;

  my @sentences;

  while(1){
    my $line = shift @lines;

    if($line =~ /^\{\|/){
      my @sentence;
      &extract_table(\@sentence, \@lines);
      push @sentences, @sentence;
      undef @sentence;
    }else{
      $line = &eliminate_tag($line);
      my $sentence = &extract_sentence($line);
      push @sentences, @$sentence;
      undef $sentence;
    }

    last if($#lines == 0);
  }


  return \@sentences;
}


sub eliminate_tag {

  my $line = shift;

  #Asciiʸ̵
  $line =~ s/'{2,5}([^']*)'{2,5}/$1/g; #'
  $line =~ s/={2,4}([^=]*)={2,4}//g;
  $line =~ s/{{[^\}]*}}//g;
  $line =~ s/&lt; \s* small \s* &gt;(.*)?&lt; \s* \/small \s* &gt;/$1/gx;
  $line =~ s/&lt; \s* tt \s* &gt;(.*)?&lt; \s* \/tt \s* &gt;/$1/gx;
  $line =~ s/&lt; \s* nowiki \s* &gt;(.*)?&lt; \s* \/nowiki \s* &gt;/$1/gx;
  $line =~ s/&lt; \s* math \s* &gt;(.*)?&lt; \s* \/math \s* &gt;/$1/gx;
  $line =~ s/&lt; \s* center \s* &gt;(.*)?&lt; \s* \/center \s* &gt;/$1/gx;
  $line =~ s/&lt; \s* sub \s* &gt;(.*)?&lt; \s* \/sub \s* &gt;/$1/gx;
  $line =~ s/&lt; \s* sup \s* &gt;(.*)?&lt; \s* \/sup \s* &gt;/$1/gx;
  $line =~ s/&lt; .* &gt;(.*)?&lt; .* &gt;/$1/gx;
  $line =~ s/&lt;!-- .* &gt;//gx;
  $line =~ s/\[\[wikipedia:[^\]\|]*\|([^\]]*)\]\]/$1/gi;
  $line =~ s/\[\[wikipedia:([^\]]*)\]\]/$1/gi;
  $line =~ s/\[\[Media:[^\]]*\]\]//gi;
  $line =~ s/\[\[[^\]]*\|([^\]]*)\]\]/$1/g;
  $line =~ s/\[\[([^\]]*)\]\]/$1/g;
  $line =~ s/^\**(.*)$/$1/g;
  $line =~ s/^\#*(.*)$/$1/g;
  $line =~ s/^\:*(.*)$/$1/g;
  $line =~ s/$ascii+//g;
  $line =~ s/\s+//g;

  return $line;

}

sub extract_sentence {

  my $sentence = shift;

  my @sentences;
  while($sentence =~ /($character*?$eucpre\Q\E$eucpost)/gs){
      push @sentences, $1;
  }

  return \@sentences;
}


sub extract_table {

  my ($sentence, $lines) = @_;


  my @multiple;
  while(1){
    my $line = shift @$lines;

    if($line =~ /^\{\|/){
      &extract_table($sentence, $lines);
    }elsif($line =~ /^\|-/){
      next;
    }elsif($line =~ /^\|[^\|]*\|(.*)/){
      $line = $1;
    }elsif($line =~ /^\|(.*)/){
      $line = $1;
    }elsif($line =~ /^\![^\|]*\|(.*)/){
      $line = $1;
    }elsif($line =~ /^\!(.*)/){
      $line = $1;
    }
    $line = &eliminate_tag($line);
    my $extract = &extract_sentence($line);
    push @$sentence, @$extract;

    undef @multiple;
    @multiple = ();
    push @multiple, $line;

    last if($line =~ /^\|\}/);
    last if($#{$lines} == 0);
  }

}


1;
