#!/usr/bin/perl

use strict;
use warnings;
use Encode;

# Split a Japanese Wikipedia XML dump into one file per <page> element.
#
# Input : jawiki-20060606-pages-articles.xml in the current directory,
#         read as raw bytes (UTF-8).
# Output: wikipedia/<id>  - the <page>...</page> text, converted to EUC-JP
#         index.txt       - the page ids, one per line, in dump order
#
# Any I/O failure is fatal: a silently missing page file or a truncated
# index would be much harder to notice later than an immediate error.

my $dump_file  = "jawiki-20060606-pages-articles.xml";
my $out_dir    = "wikipedia";
my $index_file = "index.txt";

open(my $dump_fh, "<", $dump_file)
  or die "Cannot open $dump_file for reading: $!";

# The output directory may be left over from a previous run; only a real
# failure to create it is an error.
unless (-d $out_dir) {
  mkdir($out_dir) or die "Cannot create directory $out_dir: $!";
}

my @page;            # lines of the <page> element currently being collected
my @index;           # ids of all pages written so far, in dump order
my $is_in_page = 0;  # true between a <page> line and its </page> line

while (my $line = <$dump_fh>) {

  if ($line =~ /<page>/) {
    $is_in_page = 1;
  }
  push @page, $line if ($is_in_page);

  if ($line =~ /<\/page>/) {
    $is_in_page = 0;

    my $page = join("", @page);
    @page = ();

    # Convert the raw bytes in place from UTF-8 to EUC-JP. Characters that
    # EUC-JP cannot represent become XML character references (&#xHHHH;)
    # instead of being dropped.
    Encode::from_to($page, "utf8", "euc-jp", Encode::FB_XMLCREF);

    # The first <id> element inside the page is used as its id. When none
    # is found the page is filed under id 0, as before, but now loudly.
    my $id = 0;
    if ($page =~ /<id>([^<>]*)<\/id>/i) {
      $id = int $1;
    }
    else {
      warn "No <id> element found in page ending at line $. of $dump_file; using id 0\n";
    }
    push @index, $id;

    my $page_file = "$out_dir/$id";
    open(my $out_fh, ">", $page_file)
      or die "Cannot open $page_file for writing: $!";
    print {$out_fh} $page
      or die "Cannot write to $page_file: $!";
    # Buffered write errors (e.g. disk full) only surface at close.
    close($out_fh)
      or die "Cannot close $page_file: $!";
  }
}

close($dump_fh) or die "Cannot close $dump_file: $!";

# Write the ids newline-separated, without a trailing newline after the
# last one (same format as before).
open(my $index_fh, ">", $index_file)
  or die "Cannot open $index_file for writing: $!";
print {$index_fh} join("\n", @index)
  or die "Cannot write to $index_file: $!";
close($index_fh)
  or die "Cannot close $index_file: $!";
     
