#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# This file is part of G-language Genome Analysis Environment package
#
#     Copyright (C) 2001-2013 Keio University
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# 
#   $Id: GPAC.pm,v 1.1 2002/07/30 17:44:27 gaou Exp $
#
# G-language GAE is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
# 
# G-language GAE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with G-language GAE -- see the file COPYING.
# If not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# 
#END_HEADER
#

package G::Tools::GPAC;

use SubOpt;
use G::Messenger;

use strict;
use base qw(Exporter);
use autouse 'Cwd'=>qw(getcwd);
use SelfLoader;

our @EXPORT = qw(
		 gpac
		 set_gpac
		 test_gpac
);


__DATA__

#::::::::::::::::::::::::::::::
#        Methods Start
#::::::::::::::::::::::::::::::


# modified by Daisuke Kyuma (t00311dk@sfc.keio.ac.jp)

sub _acc2ftp_bacteria{
    my @args = opt_get(@_);
    my $gb = shift @args;

    my $cwd = getcwd();
    my $specie = '';
    my $time = time;
    my @hoge = ();

    mkdir("/tmp/$time"); 
    chdir("/tmp/$time");

    if($gb->{LOCUS}->{id} =~/NC\_/){
        @hoge = `wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/accessions`;       
    }else{
	@hoge = `wget ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Bacteria/accessions`;
    }

    open(FILE, "accessions") || die($!);
    while(<FILE>){
	chomp;
	my ($acc, $nym) = split(/\s/, $_, 2);

	if ($acc eq $gb->{LOCUS}->{id}){
	    $specie = $nym;
	    $specie =~ s/ /_/g;
	    substr($specie, 0, 1) = '' if (substr($specie, 0, 1) eq '_');
	    last;
	}
    }

    chdir($cwd);
    if (length $specie){
	if($gb->{LOCUS}->{id} =~/NC\_/){
	    return "ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/$specie/" 
		. $gb->{LOCUS}->{id};
	}else{
	    return "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Bacteria/$specie/" 
		. $gb->{LOCUS}->{id};
	}
    }else{
        return;
    }
}


sub set_gpac{
    opt_default("ptt"=>'');
    my @args = opt_get(@_);
    my $gb = shift @args;
    my $ptt = opt_val("ptt");
    my ($cds) = (0,0,0,0,0,0,0);
    my $specie;
    my $cwd = getcwd();
    my $time = time;
    my $openfile = '';

    if (length $ptt){
	$openfile = $ptt;
    }else{
	my $file = _acc2ftp_bacteria($gb) . '.ptt';
	mkdir("/tmp/$time"); 
	chdir("/tmp/$time");
	my @hoge = `wget $file`;
	my @name = split(/\//, $file);
	chdir($cwd);
	$openfile = "/tmp/$time/" . pop @name;
    }

    open(FILE, $openfile) || die($!);
    while(<FILE>){
	chomp;

	my ($hypo, $func, $put, $pcog) = (0,0,0,0);

	if (/^\s*[0-9]+\.\.[0-9]+\s+/){
	    s/^\s+//;
	    s/\s+$//;
	    my ($location, $strand, $length, $pid, $gene, 
		$synonym, $code, $cog, $product) = split (/\s+/, $_, 9);

	    $cds ++;
	    $pcog ++ if ($cog =~ /^COG/);
	    my $id = $gb->{"CDS$cds"}->{feature};

	    if (lc($product) =~ /hypothetical/ || lc($product) =~ /^unknown/ 
		|| lc($product) =~ /^orf/ || lc($synonym) eq lc($product)){
		$hypo ++;
	    }elsif (lc($product) =~ /putative/ || lc($product) =~ /similar/ 
		    || lc($product) =~ /probable/){
		$put ++;
	    }else{
		$func ++;
	    }

	    $gb->{"FEATURE$id"}->{code} = $code;
	    $gb->{"FEATURE$id"}->{cog} = $cog;
	    $gb->{"FEATURE$id"}->{synonym} = $synonym;
	    $gb->{"FEATURE$id"}->{gpac} .= '0';
	    $gb->{"FEATURE$id"}->{gpac} .= '1' if ($hypo < 1);
	    $gb->{"FEATURE$id"}->{gpac} .= '2' if ($put < 1 && $hypo < 1);
	    $gb->{"FEATURE$id"}->{gpac} .= '3' if ($pcog);
	    $gb->{"FEATURE$id"}->{gpac} .= '4' if ($pcog && $put < 1 && $hypo < 1);
	    $gb->{"FEATURE$id"}->{gpac} .= '5' if ($pcog || $func || $put);
	}
    }
    close(FILE);

    return $openfile;
}


sub gpac{
    opt_default("ptt"=>'', "elim"=>0);
    my @args = opt_get(@_);
    my $gb = shift @args;
    my $gpac = shift @args;
    my $ptt = opt_val("ptt");
    my $elim = opt_val("elim");
    my $n = 0;

    my $id = $gb->{CDS1}->{feature};
    unless (length($gb->{"FEATURE$id"}->{gpac})){
	set_gpac($gb, -ptt=>$ptt);
    }

    if ($elim){
	foreach my $cds ($gb->cds('all')){
	    if ($gb->{$cds}->{gpac} =~ /$gpac/){
		$gb->{$cds}->{on} = 0;
	    }else{
		$gb->{$cds}->{on} = 1;
		$n ++;
	    }
	}
    }else{
	foreach my $cds ($gb->cds('all')){
	    if ($gb->{$cds}->{gpac} =~ /$gpac/){
		$gb->{$cds}->{on} = 1;
		$n ++;
	    }else{
		$gb->{$cds}->{on} = 0;
	    }
	}
    }

    return $n;
}

sub test_gpac{
    require GD::Graph::bars;
    require Rcmd;

    opt_default("stat"=>"t.test", "verbose"=>5, "output"=>"show",
		"filename"=>"gpac.html", "ptt"=>'');
    my @args = opt_get(@_);
    my $gb = shift @args;
    my $sub = shift @args;
    my $rcmd = new Rcmd;
    my $stat = opt_val("stat");
    my $verbose = opt_val("verbose");
    my $filename = opt_val("filename");
    my $output = opt_val("output");
    my $ptt = opt_val("ptt");
    my $time = time;
    my $numcds = scalar($gb->cds());

    $rcmd->exec('require(ctest)');

    my ($i, $j, @r0, @r1, @r2, @r3, @r4, @r5, 
	@r1rev, @r2rev, @r3rev, @r4rev, @r5rev, 
	$type, @nrev, @ans, @control, @controlrev);
    my @ansrev = (0);
    my $label = 'results';

    no strict;

    for ($i = 0; $i <= 5; $i ++){
	msg_error("Running GPAC Level $i...\n\n");
        $n[$i] = gpac($gb, $i, -ptt=>$ptt);

	@{"r$i"} = &{$sub};
	my @tmp = @{"r$i"};
	if (length $tmp[1]){
	    $type = "ARRAY";
	}else{
	    my $ref = ref $tmp[0];
	    if ($ref){
		$type = $ref;

		if ($ref eq "SCALAR"){
		    @{"r$i"} = ($$tmp[0]);
		    $ans[$i] = $$tmp[0];
		}elsif ($ref eq "ARRAY"){
		    @{"r$i"} = (@$tmp[0]);
		}elsif ($ref eq "HASH"){
		    my @values;
		    foreach my $val (sort keys %{$tmp[0]}){
			push (@values, sprintf("%.4f", ${$tmp[0]}{$val}));
		    }
		    @{"r$i"} = @values;
		}else{
		    die("Return value of the subroutine not fitted for GPAC\n\n");
		}
	    }else{
		$type = "SCALAR";
		$ans[$i] = $tmp[0];
	    }
	}

	if ($type eq 'ARRAY' || $type eq 'HASH'){
	    if ($i == 0){
		$rcmd->exec('x_c(',  join(",\n", @{"r0"}), ')');
		$ans[$i] = 0;
	    }else{
		$rcmd->exec('y_c(',  join(",\n", @{"r$i"}), ')');
		$ans[$i] = $rcmd->exec("$stat\(x,y, paired=T\)\$p\.value");
	    }
	    $label = "p value";
	}
    }

    for ($i = 1; $i <= 5; $i ++){
	msg_error("Running GPAC Level ",  $i, " Eliminated...\n\n");
        $nrev[$i] = gpac($gb, $i, -ptt=>$ptt, -elim=>1);

	@{"r$i" . "rev"} = &{$sub};
	my @tmp = @{"r$i" . "rev"};

	if (defined $tmp[1]){
	    $type = "ARRAY";
	}else{
	    my $ref = ref $tmp[0];
	    if ($ref){
		$type = $ref;

		if ($ref eq "SCALAR"){
		    @{"r$i" . "rev"} = ($$tmp[0]);
		    $ansrev[$i] = $$tmp[0];
		}elsif ($ref eq "ARRAY"){
		    @{"r$i" . "rev"} = (@$tmp[0]);
		}elsif ($ref eq "HASH"){
		    my @values;
		    foreach my $val (sort keys %{$tmp[0]}){
			push (@values, sprintf("%.4f", ${$tmp[0]}{$val}));
		    }
		    @{"r$i" . "rev"} = @values;
		}else{
		    die("Return value of the subroutine not fitted for GPAC\n\n");
		}
	    }else{
		$type = "SCALAR";
		$ansrev[$i] = $tmp[0];
	    }
	}

	if ($type eq 'ARRAY' || $type eq 'HASH'){
	    $rcmd->exec('y_c(',  join(",\n", @{"r$i" . "rev"}), ')');
	    $ansrev[$i] = $rcmd->exec("$stat\(x,y, paired=T\)\$p\.value");
	    $label = "p value";
	}
    }

    
    for ($j = 0; $j < $verbose; $j ++){
	my @tmparray = (0);
	msg_error("Running GPAC Control ", $j + 1, '/' , $verbose, "...\n\n");

        for ($i = 1; $i <= 5; $i ++){
	    _random_elimination($gb, $n[$i]);
	    
	    my @tmp = &{$sub};
	    my @tmp2 = @tmp;

	    unless (defined $tmp[1]){
		my $ref = ref $tmp[0];
		if ($ref){
		    if ($ref eq "SCALAR"){
			$tmparray[$i] = $$tmp[0];
		    }elsif ($ref eq "ARRAY"){
			@tmp2 = (@$tmp[0]);
		    }elsif ($ref eq "HASH"){
			my @values;
			foreach my $val (sort keys %{$tmp[0]}){
			    push (@values, sprintf("%.4f", ${$tmp[0]}{$val}));
			}
			@tmp2 = @values;
		    }else{
			die("Return value of the subroutine not fitted for GPAC\n\n");
		    }
		}else{
		    $tmparray[$i] = $tmp[0];
		}
	    }
	    
	    if ($type eq 'ARRAY' || $type eq 'HASH'){
		$rcmd->exec('y_c(',  join(",\n", @tmp2), ')');
		$tmparray[$i] = $rcmd->exec("$stat\(x,y,paired=T\)\$p\.value");
	    }
	}
	push (@control, [@tmparray]);

        @tmparray = (0);
        for ($i = 1; $i <= 5; $i ++){
	    _random_elimination($gb, $nrev[$i]);
	    
	    my @tmp = &{$sub};
	    my @tmp2 = @tmp;

	    unless (defined $tmp[1]){
		my $ref = ref $tmp[0];
		if ($ref){
		    if ($ref eq "SCALAR"){
			$tmparray[$i] = $$tmp[0];
		    }elsif ($ref eq "ARRAY"){
			@tmp2 = (@$tmp[0]);
		    }elsif ($ref eq "HASH"){
			my @values;
			foreach my $val (sort keys %{$tmp[0]}){
			    push (@values, sprintf("%.4f", ${$tmp[0]}{$val}));
			}
			@tmp2 = @values;
		    }else{
			die("Return value of the subroutine not fitted for GPAC\n\n");
		    }
		}else{
		    $tmparray[$i] = $tmp[0];
		}
	    }
	    
	    if ($type eq 'ARRAY' || $type eq 'HASH'){
		$rcmd->exec('y_c(',  join(",\n", @tmp2), ')');
		$tmparray[$i] = $rcmd->exec("$stat\(x,y,paired=T\)\$p\.value");
	    }
	}
	push (@controlrev, [@tmparray]);
    }

    my (@data, @datarev, @legend, @temp, @temprev);
    if ($verbose > 0){
	@data = ([0,1,2,3,4,5], \@ans, @control);
	@datarev = ([0,1,2,3,4,5], \@ansrev, @controlrev);
	@legend = ("Result");
	for ($j = 1; $j <= $verbose; $j ++){
	    push(@legend, "Control $j");
	}
    }else{
	@data = ([0,1,2,3,4,5], \@ans);
	@datarev = ([0,1,2,3,4,5], \@ansrev);
    }

    @temp = @ans;
    @temprev = @ansrev;
    foreach my $ctrl (@control){
	@temp = (@temp, @$ctrl);
    }
    foreach my $ctrl (@controlrev){
	@temprev = (@temprev, @$ctrl);
    }
    my @temp3 = (@temp, @temprev); 
    my @temp2 = sort {$b <=> $a} @temp3;
    my $max = shift(@temp2) * 1.1;
    my @expo = split(/e/, sprintf("%e", $max, 2));
    $max = sprintf("%.2f", $expo[0]) * (10 ** $expo[1]);

    my @data2 = ([1,2,3,4,5], [$n[1]/$n[0], $n[2]/$n[0], 
			       $n[3]/$n[0], $n[4]/$n[0], $n[5]/$n[0]],
                              [$nrev[1]/$n[0], $nrev[2]/$n[0], $nrev[3]/$n[0],
                               $nrev[4]/$n[0], $nrev[5]/$n[0]]);

    my $graph = GD::Graph::bars->new(400, 300);
    my $graph2 = GD::Graph::bars->new(250, 270);
    my $graph3 = GD::Graph::bars->new(400, 300);

    $graph->set( 
		 x_label           => 'GPAC',
		 y_label           => $label,
		 title             => 'GPAC Test',
		 y_max_value       => $max,
		 bar_width         => 2,
		 bar_spacing       => 1
		 );

    $graph2->set( 
		  x_label           => 'GPAC',
		  y_label           => 'percentage of CDS',
		  y_max_value       => 1,
		  y_min_value       => 0,
		  title             => 'GPAC Degree of Freedom',
		  bar_width         => 5,
		  bar_spacing       => 10
		 );

    $graph3->set( 
		 x_label           => 'GPAC',
		 y_label           => $label,
		 title             => 'GPAC Eliminated Test',
		 y_max_value       => $max,
		 bar_width         => 2,
		 bar_spacing       => 1
		 );

    $graph->set_legend(@legend) if ($verbose > 0);
    $graph2->set_legend("GPAC CDS", "Eliminated CDS");
    $graph3->set_legend(@legend) if ($verbose > 0);

    my $gd = $graph->plot(\@data);
    my $gd2 = $graph2->plot(\@data2);
    my $gd3 = $graph3->plot(\@datarev);

    mkdir ("graph", 0777);
    open(IMG, '>graph/' . $time . '-gpac.png') or die $!;
    binmode IMG;
    print IMG $gd->png;
    close(IMG);

    open(IMG, '>graph/' . $time . '-df.png') or die $!;
    binmode IMG;
    print IMG $gd2->png;
    close(IMG);

    open(IMG, '>graph/' . $time . '-elim.png') or die $!;
    binmode IMG;
    print IMG $gd3->png;
    close(IMG);

    open(HTML, '>graph/' . $filename) or die $!;
    print HTML qq(
		  <HTML><HEAD><TITLE>GPAC Test</TITLE></HEAD>
		  <BODY bgcolor="white">
		  <table><tr><td valign="top">
		  <img src="$time-gpac.png">
		  </td><td valign="top">
		  <img src="$time-elim.png">
		  </td></tr>
		  <tr><td style="font-size:10pt">
		  <UL>
		  <LI>GPAC Level 0: all CDS<BR>
		  <LI>GPAC Level 1: without hypothetical CDS<BR>
		  <LI>GPAC Level 2: without hypothetical & putative CDS<BR>
		  <LI>GPAC Level 3: only conserved CDS (COGs)<BR>
		  <LI>GPAC Level 4: only conserved CDS, but without hypothetical & putative<BR>
		  <LI>GPAC Level 5: conserved hypothetical & putative and functional proteins<BR>
		  </UL>
		  </td><td align="right">
		  <img src="$time-df.png"> <BR>
		  N = $numcds
		  </td></tr></table>
		  <BR><BR>
		  </BODY></HTML>
		  );
    system("galeon graph/$filename &") if ($output eq 'show');
}





sub _random_elimination {
    my $gb = shift;
    my $n = shift;
    my @cds = $gb->cds('all');

    my $j = scalar(@cds);
    
    while ($j > $n){
	splice(@cds, int(rand($j)), 1);
	$j --;
    }

    foreach my $cds ($gb->cds('all')){
	$gb->{$cds}->{on} = 0;
    }

    foreach my $cds (@cds){
	$gb->{$cds}->{on} = 1;
    }
}


1;
