#!/usr/bin/env perl
#
# Copyright (c) 2007 Tatyana.  All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

use strict;
use DirHandle;
use FileHandle;
use NKF;
use Text::Kakasi;

#use encoding "utf-8";

# Exactly two arguments are required: the directory holding the source pages
# and the XML file to write.  Anything else is a usage error, reported on
# STDERR with a non-zero exit status so that calling scripts can detect it.
if (scalar(@ARGV) != 2) {
    print STDERR "usage: y2m.pl <source.dir> <output.xml>\n";
    exit 1;
}

# Command line: the directory of source pages, then the XML file to produce.
my ($dn_input, $fn_output) = splice(@ARGV, 0, 2);

# Kana/kanji converter used later to derive a reading for each page title.
# NOTE(review): the options presumably mean "EUC input, kanji and katakana to
# hiragana" -- confirm against the kakasi documentation.
my $kakasi = Text::Kakasi->new(qw(-ieuc -JH -KH));

# Converter state shared by phase1/phase2: non-zero while inside a table,
# and the nesting depth of ">|" ... "|<" pre blocks.
my ($g_tablemode, $g_premode) = (0, 0);

# Everything collected for output, keyed by page title.  Each entry may hold
# 'content' (text), 'category' (array ref of names) and 'yomi' (reading).
my %data;

# One file handle and one directory handle are reused for all I/O.
my $fh = FileHandle->new;
my $dh = DirHandle->new;

# Load the page -> category assignments from ./category.csv.
#
# Every line has the form "pagename,<unused field>,category[,category...]".
# The category names of a page are appended to $data{pagename}{'category'},
# and for every distinct category a "Category:<name>" page with a one-line
# description is registered so that the category page itself gets written.
{
    my %seen_category;

    $fh->open("./category.csv")
        or die "Could not open \"./category.csv\": $!";
    while (my $line = <$fh>) {
        chomp($line);
        # The second column is not used by this converter.
        my ($pagename, undef, @categories) = split(/,/, $line);
        foreach my $category (@categories) {
            # An empty field would produce a nameless "Category:" page and
            # a "[[Category:]]" link, so it is skipped.
            next if $category eq "";
            push(@{$data{$pagename}{'category'}}, $category);
            $seen_category{$category} = 1;
        }
    }
    $fh->close();

    foreach my $category (keys %seen_category) {
        # Page content is written into the XML file verbatim, so the
        # category name has to be escaped before it is stored.
        my $escaped = $category;
        $escaped =~ s/&/&amp;/g;
        $escaped =~ s/</&lt;/g;
        $escaped =~ s/>/&gt;/g;
        $data{"Category:$category"}{'content'} = "${escaped}に関するページの一覧です。";
    }
}

#exit;

# Convert every page file found in the source directory.
#
# Page files are named "<HEX>.txt", where <HEX> is the page title written as
# upper-case hexadecimal digits.  For each page the converted, XML-escaped
# wiki text is appended to $data{title}{'content'} and a reading of the
# title is stored in $data{title}{'yomi'} (empty when it equals the title).

# Voiced and semi-voiced hiragana and the plain kana they are folded to when
# the reading is built.  Small kana are intentionally left untouched.
my %unvoiced_for = (
    'が' => 'か', 'ぎ' => 'き', 'ぐ' => 'く', 'げ' => 'け', 'ご' => 'こ',
    'ざ' => 'さ', 'じ' => 'し', 'ず' => 'す', 'ぜ' => 'せ', 'ぞ' => 'そ',
    'だ' => 'た', 'ぢ' => 'ち', 'づ' => 'つ', 'で' => 'て', 'ど' => 'と',
    'ば' => 'は', 'び' => 'ひ', 'ぶ' => 'ふ', 'べ' => 'へ', 'ぼ' => 'ほ',
    'ぱ' => 'は', 'ぴ' => 'ひ', 'ぷ' => 'ふ', 'ぺ' => 'へ', 'ぽ' => 'ほ',
);
my $voiced_re = do {
    my $alternatives = join('|', keys %unvoiced_for);
    qr/($alternatives)/;
};

$dh->open($dn_input)
    or die "Could not open source directory \"$dn_input\": $!";
foreach my $fn_input ($dh->read()) {
    # The whole file name must be hex digits plus ".txt"; without the
    # anchors a stray file such as "README.txt" would match on its tail
    # and be imported as a garbage page.
    next unless $fn_input =~ /\A([0-9A-F]+)\.txt\z/;
    my $pagename = pack("H*", $1);

    my @buff;
    $fh->open("$dn_input/$fn_input", "r")
        or die "Could not open source file \"$dn_input/$fn_input\": $!";
    while (my $line = <$fh>) {
        chomp($line);
        push(@buff, &phase1(\$line));
    }
    $fh->close();

    # The converter state is global.  A page that ends inside a pre block
    # or a table never sees the line that would close it, so close it here
    # and reset the state; otherwise the category links appended after the
    # text end up inside the open block and the next page starts with a
    # stray closing tag.
    if ($g_premode > 0) {
        push(@buff, ("</pre>") x $g_premode);
    }
    $g_premode = 0;
    if ($g_tablemode != 0) {
        push(@buff, "|}");
        $g_tablemode = 0;
    }

    # The text is embedded in an XML element, so escape it.
    foreach my $out (@buff) {
        $out =~ s/&/&amp;/g;
        $out =~ s/</&lt;/g;
        $out =~ s/>/&gt;/g;
        $data{$pagename}{'content'} .= "$out\n";
    }

    # Reading of the title: to EUC for kakasi, back to UTF-8, then fold
    # voiced kana to their plain form.
    my $yomi = nkf("-w", $kakasi->get(nkf("-e", $pagename)));
    $yomi =~ s/$voiced_re/$unvoiced_for{$1}/g;

    # A reading identical to the title adds nothing, so store it as empty.
    $data{$pagename}{'yomi'} = ($pagename ne $yomi) ? $yomi : "";
}
$dh->close();

#exit;

# Write everything collected in %data as a MediaWiki XML import file.
#
# Each page becomes one <page> element containing its content followed by
# one "[[Category:...]]" link per category; when the page has a reading
# ('yomi') it is appended to each link after a "|".

# Escapes the three characters that are significant in XML text.  Titles,
# category names and readings come straight from file names and the CSV,
# so they must be escaped just like the page text.
my $xml_escape = sub {
    my ($text) = @_;
    $text = "" unless defined $text;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
};

# Passing the mode separately keeps characters in the file name from being
# interpreted as part of the open mode.
$fh->open($fn_output, "w")
    or die "Could not open destination file \"$fn_output\": $!";
$fh->print("<mediawiki>\n");
my $n = 0;
# Sorted so that repeated runs produce the same page order and ids.
foreach my $key (sort keys %data) {
    my $page  = $data{$key};
    my $title = $xml_escape->($key);

    $fh->print("<page>\n<title>$title</title>\n<id>$n</id>\n<revision>\n<timestamp>2007-04-01T00:00:00Z</timestamp>\n<contributor><username>WikiSysop</username></contributor>\n<text xml:space=\"preserve\">\n");
    # Content is already XML-escaped when it is stored; pages known only
    # from category.csv have none.
    $fh->print(defined $page->{'content'} ? $page->{'content'} : "");
    $fh->print("\n");

    my $yomi    = $xml_escape->($page->{'yomi'});
    my $sortkey = ($yomi ne "") ? "|$yomi" : "";
    foreach my $category (@{ $page->{'category'} || [] }) {
        my $name = $xml_escape->($category);
        $fh->print("[[Category:${name}${sortkey}]]\n");
    }

    $fh->print("</text>\n</revision>\n</page>\n");
    $n++;
}
$fh->print("</mediawiki>\n");
# Buffered write errors only surface when the handle is closed.
$fh->close()
    or die "Could not write destination file \"$fn_output\": $!";


# phase1(\$line) -- convert the block-level markup of one source line.
#
# Takes a reference to a chomped line and returns a list of output lines
# (usually one; opening or closing a table adds an extra line).  Inline
# markup is delegated to phase2.
#
# Uses and updates two file-level state variables:
#   $g_tablemode  non-zero while consecutive table rows are being emitted
#   $g_premode    nesting depth of ">|" ... "|<" blocks; block-level markup
#                 is only interpreted while it is 0
#
# NOTE: ">||" ... "||<" blocks change no state here, so the lines between
# them are still converted like ordinary lines.
sub phase1
{
    my $data = shift;
    # Work on a copy so that nothing below depends on what the caller's
    # reference points at while further pattern matches run.
    my $line = ${$data};

    if ($g_premode == 0) {

        # A comma at the start of a line makes a table row.
        if ($line =~ /^,(.*)/) {
            my $row = $1;
            if ($g_tablemode == 0) {
                # First row: open the table, then convert this same line
                # again now that table mode is on.
                $g_tablemode = 1;
                return ("{| border=\"1\" cellpadding=\"10\"", &phase1(\$line));
            }
            my @ret = ("|-");
            # The -1 limit keeps trailing empty cells, which split would
            # otherwise drop and leave the row short.
            foreach my $cell (split(/,/, $row, -1)) {
                push(@ret, "|" . &phase2(\$cell));
            }
            return @ret;
        }

        # The first non-table line after a table closes it.
        if ($g_tablemode != 0) {
            $g_tablemode = 0;
            return ("|}", &phase1(\$line));
        }

        # An empty line separates paragraphs.
        if ($line eq "") {
            return ("");
        }

        # A line starting with whitespace is preformatted text.
        if ($line =~ /^[\s](.*)/) {
            return (" " . $1);
        }

        # Two asterisks at the start of a line make a sub-heading.
        if ($line =~ /^\*\*[\s]*(.*)/) {
            my $heading = $1;
            return ("=== " . &phase2(\$heading) . " ===");
        }
        # One asterisk at the start of a line makes a heading.
        if ($line =~ /^\*[\s]*(.*)/) {
            my $heading = $1;
            return ("== " . &phase2(\$heading) . " ==");
        }

        # Minus signs at the start of a line make a list item; -, --, ---
        # and so on nest up to 16 levels deep.
        if ($line =~ /^([-]{1,16})[\s]*(.*)/) {
            my $bullets = "*" x length($1);
            my $item    = $2;
            return ($bullets . &phase2(\$item));
        }

        # Three or more equal signs at the start of a line make a rule.
        if ($line =~ /^([=]{3,})/) {
            return ("----");
        }

    }

    # Lines between ">||" and "||<" are shown verbatim, with links and
    # decoration ignored (super pre).
    if ($line =~ /^>\|\|/) {
        return ("<pre>");
    }
    if ($line =~ /^\|\|</) {
        return ("</pre>");
    }
    # Lines between ">|" and "|<" are shown as written, but links and
    # decoration stay active (pre).
    if ($line =~ /^>\|/) {
        $g_premode++;
        return ("<pre>");
    }
    if ($line =~ /^\|</) {
        # Never go below zero: a stray "|<" would otherwise leave the depth
        # negative and switch off block-level markup for everything after it.
        $g_premode-- if $g_premode > 0;
        return ("</pre>");
    }

    return (&phase2(\$line));
}

# phase2(\$text) -- convert the inline markup of a piece of text.
#
# Takes a reference to a string and returns the converted string.  The rules
# below are tried in order; the first one that matches splits the text into
# the part before the match, the match itself and the part after it, and the
# surrounding parts are converted recursively.  Text that matches no rule is
# returned unchanged.
#
# While the file-level $g_premode is non-zero (inside a ">|" ... "|<" block)
# the source markup is removed instead of being translated.
sub phase2
{
    my $data = shift;
    # Work on a copy: callers may pass a reference to a capture variable,
    # whose value changes as soon as a pattern below matches.
    my $text   = ${$data};
    my $markup = ($g_premode == 0);

    # Text between triple single quotes is italic.
    if ($text =~ /^(.*)'''([^']*)'''(.*)$/) {
        my ($head, $body, $tail) = ($1, $2, $3);
        my $quote = $markup ? "''" : "";
        return phase2(\$head) . $quote . phase2(\$body) . $quote . phase2(\$tail);
    }

    # Text between double single quotes is bold.
    if ($text =~ /^(.*)''([^']*)''(.*)$/) {
        my ($head, $body, $tail) = ($1, $2, $3);
        my $quote = $markup ? "'''" : "";
        return phase2(\$head) . $quote . phase2(\$body) . $quote . phase2(\$tail);
    }

    # [[#rcomment]] places a comment form (newest first); it is kept only
    # as an XML comment.
    if ($text =~ /^(.*)(\[\[#rcomment\]\])(.*)$/) {
        my ($head, $tag, $tail) = ($1, $2, $3);
        return phase2(\$head) . "<!-- $tag -->" . phase2(\$tail);
    }
    # [[#comment]] places a comment form; it is kept only as an XML comment.
    if ($text =~ /^(.*)(\[\[#comment\]\])(.*)$/) {
        my ($head, $tag, $tail) = ($1, $2, $3);
        return phase2(\$head) . "<!-- $tag -->" . phase2(\$tail);
    }

    # &del(deleted string)
    if ($text =~ /^(.*)&del\(([^\)]*)\)(.*)$/) {
        my ($head, $body, $tail) = ($1, $2, $3);
        my ($open, $close) = $markup ? ("<del>", "</del>") : ("", "");
        return phase2(\$head) . $open . phase2(\$body) . $close . phase2(\$tail);
    }

    # &mark(marking string)
    if ($text =~ /^(.*)&mark\(([^\)]*)\)(.*)$/) {
        my ($head, $body, $tail) = ($1, $2, $3);
        my ($open, $close) = $markup ? ("<ins>", "</ins>") : ("", "");
        return phase2(\$head) . $open . phase2(\$body) . $close . phase2(\$tail);
    }

    # URLs become links automatically, so they are passed through untouched.
    # https is accepted as well; otherwise such a URL would be cut up by the
    # word-boundary rule at the end and a WikiName inside it would be linked.
    if ($text =~ /^(.*)(https?:\/\/[^\s]+)(.*)$/) {
        my ($head, $url, $tail) = ($1, $2, $3);
        return phase2(\$head) . $url . phase2(\$tail);
    }

    # Text in double square brackets is a page name.  It must not contain
    # spaces; Japanese is allowed.
    if ($text =~ /^(.*)\[\[([^\]]*)\]\](.*)$/) {
        my ($head, $page, $tail) = ($1, $2, $3);
        my $link = $markup ? "[[" . $page . "]]" : $page;
        return phase2(\$head) . $link . phase2(\$tail);
    }

    # "Google:MacWiki" searches Google for "MacWiki"; the same form exists
    # for the other prefixes listed in the pattern.  The term may be quoted.
    if ($text =~ /^(.*)((?:[Gg]oogle|[Ff]ind|ISBN|ASIN|asin|[Ww]ikipedia)):"([^"]+)"(.*)$/) {
        my ($head, $site, $term, $tail) = ($1, $2, $3, $4);
        # Spaces in a Google query are written as "+".
        if ($site =~ /[Gg]oogle/) {
            $term =~ s/ /+/g;
        }
        my $link = $markup ? "[[$site:$term]]" : "$site:$term";
        return phase2(\$head) . $link . phase2(\$tail);
    }
    if ($text =~ /^(.*)((?:[Gg]oogle|[Ff]ind|ISBN|ASIN|asin|[Ww]ikipedia)):([^\s]+)(.*)$/) {
        my ($head, $site, $term, $tail) = ($1, $2, $3, $4);
        my $link = $markup ? "[[$site:$term]]" : "$site:$term";
        return phase2(\$head) . $link . phase2(\$tail);
    }

    # A mixed-case word such as YukiWiki is a page name.  The pattern only
    # matches a whole string; the next rule cuts text into single words.
    if ($text =~ /^([A-Z][a-z]+[A-Z][a-z]+[A-Za-z]*)$/) {
        my $word = $1;
        return $markup ? "[[" . $word . "]]" : $word;
    }

    # Split at the last word boundary and convert both halves, so that each
    # word is eventually checked on its own by the rule above.
    if ($text =~ /^(.+)\b(.+)$/) {
        my ($left, $right) = ($1, $2);
        return phase2(\$left) . phase2(\$right);
    }

    return $text;
}

# Execution falls through the sub definitions above and ends here with a
# success status.
exit 0;
