#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# This file is part of G-language Genome Analysis Environment package
#
#     Copyright (C) 2001-2013 Keio University
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# 
#   $Id: Consensus.pm,v 1.2 2002/07/30 17:40:56 gaou Exp $
#
# G-language GAE is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
# 
# G-language GAE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with G-language GAE -- see the file COPYING.
# If not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# 
#END_HEADER
#

package G::Seq::Consensus;

use SubOpt;
use G::Tools::Graph;
use G::Messenger;
use G::Seq::Util;
use G::Seq::Primitive;

use strict;
use base qw(Exporter);
use SelfLoader;

our @EXPORT = qw(
	     base_counter
	     base_entropy
	     base_information_content
	     base_relative_entropy
	     base_z_value
	     base_individual_information_matrix
	     consensus_z
);

__DATA__

#::::::::::::::::::::::::::::::
#     Perldoc
#::::::::::::::::::::::::::::::

=head1 NAME

G::Seq::Consensus - Analysis methods related to sequence consensus

=head1 DESCRIPTION

    This class is a part of G-language Genome Analysis Environment, 
    collecting sequence analysis methods related to sequence consensus.

=cut

#::::::::::::::::::::::::::::::
#     Let the code begin...
#::::::::::::::::::::::::::::::

=head2 base_counter

  Name: base_counter   -   creates a position weight matrix of oligomers around start codon

  Description:
    This function creates a position weight matrix (PWM) of 
    oligomers of specified length around the start codon of all 
    genes in the given genome.

  Usage:
    $structure = base_counter($genome);

    Here the $structure is a reference to the array of hashes, intended for 
    internal usage.

  Options:
    -output     "stdout" for standard output (default: "stdout")
                "f" for file output in directory "data"
    -filename   output filename (default: "base_counter.csv")
    -position   either "start" (around start codon) or "end" (around stop codon)
                to create the PWM (default: "start");
    -PatLen     length of oligomer to count (default: 3)
    -upstream   length upstream of specified position to create PWM (default: 30)
    -downstream length downstream of specified position to create PWM (default: 30)

  Author: 
    Koya Mori(mory@g-language.org)

  History:
   20010623-01 initial posting (ported from ATG7 program, "bun")

=cut

sub base_counter{
    &opt_default(output=>"stdout",filename=>"base_counter.csv",PatLength=>3,upstream=>30,downstream=>30,position=>"start");
    my @args=opt_get(@_);

    my $gb=opt_as_gb(shift @args);
    my $position=opt_val("position");
    my $PatLen=opt_val("PatLength");
    my $upstream=opt_val("upstream");
    my $downstream=opt_val("downstream");
    my $output=opt_val("output");
    my $filename=opt_val("filename");
    my @hash;
    my $num=1;
    my $start;
    my $i;
    my $nuc;
    my $cnuc;

    $PatLen--;
    $downstream++;

    foreach my $cds ($gb->cds()){
	if($gb->{$cds}->{direction} eq 'direct'){
	    if($position eq "start"){
		$start = $gb->{$cds}->{start};
	    }
	    elsif($position eq "end"){
		$start = $gb->{$cds}->{end};
	    }
	    else{
		&msg_error('Invalid parameter.Please enter "start" or "end".',"\n");
	    }

	    for($i=$upstream;$i>-$downstream;$i--){
		if($start-1-$i>0){
		    $hash[$upstream-$i]{kazu}++;
		    $nuc=$gb->getseq($start-1-$i,$start-1-$i+$PatLen);
		    next if(length($nuc != $PatLen));
		    $hash[$upstream-$i]{$nuc}++;
		    $hash[0]{pat}{$nuc}++;
		}
	    }
	}
	elsif($gb->{$cds}->{direction} eq 'complement'){
	    if($position eq "start"){
		$start = $gb->{$cds}->{end};
	    }
	    elsif($position eq "end"){
		$start = $gb->{$cds}->{start};
	    }
	    else{
		&msg_error('Invalid parameter.Please enter "start" or "end".',"\n");
	    }

	    for($i=$upstream;$i>-$downstream;$i--){
		if($start-1+$i<length($gb->{SEQ})){
		    $hash[$upstream-$i]{kazu}++;
		    $nuc=$gb->getseq($start-1+$i-$PatLen,$start-1+$i);
		    $cnuc=complement($nuc);
		    $hash[$upstream-$i]{$cnuc}++;
		    $hash[0]{pat}{$cnuc}++;
		}
	    }

	}
    }


    $hash[0]{upstream}=$upstream;
    $hash[0]{downstream}=$downstream;
    $hash[0]{PatLen}=$PatLen;
#    $hash[0]{gbk}=$gb;
    $hash[0]{pos}=$position;

    if($output eq "f"){
	_base_printer(\@hash,-output=>"f",-filename=>$filename);
    }
    if($output eq "stdout"){
	_base_printer(\@hash,-output=>"stdout");
    }
    return \@hash;
}


#base_printer ver.20010623-01
#scripting by Koya Mori(mory@g-language.org)
#This program prints result of base_counter.
#need &bun.
#&base_print(pointer Genome,  string position,  int pattern_length, int upstream_limit,  int downstream_limit,  boolean debug);
sub _base_printer{
    &opt_default(output=>"stdout",filename=>"base_counter.csv");
    my @args=opt_get(@_);

    my $hash=shift @args;
    my $printer=opt_val("output");
    my $filename=opt_val("filename");
    my $PatLen=$$hash[0]{PatLen};
    my $upstream=$$hash[0]{upstream};
    my $downstream=$$hash[0]{downstream};
#    my $gb=$$hash[0]{gbk};
    my $position=$$hash[0]{pos};
    my $i;
    my $j;


    if($printer eq "f"){
        mkdir ("data", 0777);
        open(FILE,">>data/$filename");
	print FILE "Pattern";
	for(my $i=$upstream;$i>-$downstream;$i--){
	    print FILE ",$i";
	}
	print FILE "\n";
	foreach(sort keys(%{$$hash[0]{pat}})){
	    if($_ ne "PatLen" && $_ ne "upstream" && $_ ne "downstream" && $_ ne "gbk" && $_ ne "kazu" && $_ ne "pos"){
		print FILE $_;
		for(my $i=$upstream;$i>-$downstream;$i--){
		    printf FILE ",%d",$$hash[$i]{$_};
		}
		print FILE "\n";
	    }
	}
	print FILE "\n\n";

	print FILE "Pattern";
	for(my $i=$upstream;$i>-$downstream;$i--){
	    print FILE ",$i";
	}
	print FILE "\n";
	foreach(sort keys(%{$$hash[0]{pat}})){
	    if($_ ne "PatLen" && $_ ne "upstream" && $_ ne "downstream" && $_ ne "gbk" && $_ ne "kazu" && $_ ne "pos"){
		print FILE $_;
		for(my $i=$upstream;$i>-$downstream;$i--){
		    printf FILE ",%03.3f",$$hash[$i]{$_}*100/$$hash[$i]{kazu};
		}
		print FILE "\n";
	    }
	}
	
	close(FILE);
    }

    if($printer eq "stdout"){
	&msg_send("CDS number:",$$hash[$upstream-1]{kazu},"\n");
	for($i=$upstream;$i>-$downstream;$i=$i-12){
	    for(my $c=0;$c<102;$c++){
		&msg_send("-");
	    }
	        
	    &msg_send("\npos\t");
	    for(my $c=0;$c<12 && $i-$c > -$downstream;$c++){
		&msg_send(sprintf("%6d\t",-($i-$c)));
	    }
	    &msg_send("\n");
	        
	    foreach(sort keys(%{$$hash[0]{pat}})){
		if($_ ne "PatLen" && $_ ne "upstream" && $_ ne "downstream" && $_ ne "gbk" && $_ ne "kazu" && $_ ne "pos"){
		    &msg_send("\n $_|\t");
		    for($j=0;$j<12 && $i-$j>-$downstream;$j++){ 
			&msg_send(sprintf("%6d\t",$$hash[$upstream-$i+$j]{$_}));
		    }
		}
	    }
	    &msg_send("\n");
	        
	    foreach(sort keys(%{$$hash[0]{pat}})){
		if($_ ne "PatLen" && $_ ne "upstream" && $_ ne "downstream" && $_ ne "gbk" && $_ ne "kazu" && $_ ne "pos"){
		    &msg_send(sprintf("\n $_|\t"));
		    for($j=0;$j<12 && $i-$j>-$downstream;$j++){        
			&msg_send(sprintf("%03.3f\t",$$hash[$upstream-$i+$j]{$_}*100/$$hash[$upstream-$i+$j]{kazu}));
		    }
		}
	    }
	    &msg_send("\n");
	}
    }
}


sub logmod{
    return $_[0] == 0 ? 0 : log($_[0]);
}


=head2 base_entropy

  Name: base_entropy   -   calculates and graphs the sequence conservation using Shanon uncertainty (entropy)

  Description:
    This function calculates and graphs the sequence conservation in regions 
    around the start/stop codons using Shanon uncertainty (entropy).

  Usage:
    ($position, "entropy", $upstream-length, $downstream-length, @entropyvalues) = base_entropy($genome);

 Options:
    -output     "stdout" for standard output
                "f" for file output in directory "data",
                "g" for graph output in directory "graph",
                "show" for graph output and display (default: "show")
    -filename   output filename (default: "base_entropy.png" for -output=>"g",
                                          "base_entropy.csv" for -output=>"f")
    -position   either "start" (around start codon) or "end" (around stop codon)
                to create the PWM (default: "start");
    -PatLen     length of oligomer to count (default: 3)
    -upstream   length upstream of specified position to create PWM (default: 30)
    -downstream length downstream of specified position to create PWM (default: 30)

  Author: 
    Koya Mori(mory@g-language.org)

  History:
    20010623-01 initial posting (ported from ATG7 program, "buns::ent")

=cut

sub base_entropy{
    &opt_default(output=>"show",filename=>"base_entropy.png",PatLength=>3,upstream=>30,downstream=>30,position=>"start");
    my @args=opt_get(@_);

    my $gb=opt_as_gb(shift @args);
    my $position=opt_val("position");
    my $PatLen=opt_val("PatLength");
    my $upstream=opt_val("upstream");
    my $downstream=opt_val("downstream");
    my $printer=opt_val("output");
    my $filename=opt_val("filename");
    my $hash;
    my %ratio;
    my @loc_ratio;
    my @loc_total;
    my @entropy;
    my $offset;
    my %seqnum;
    my $tmp;
    my @y;

    $hash=base_counter($gb,-output=>"n",-position=>$position,-PatLength=>$PatLen,-upstream=>$upstream,-downstream=>$downstream);
    $downstream++;
    
    if($printer=~/f/){
	$filename =~ s/\.png$/\.csv/;
	mkdir ("data", 0777);
	open(FILE,">>data/$filename");
    }

    for(my $i=$upstream;$i>-$downstream;$i--){
	foreach(keys(%{$$hash[$upstream-$i]})){
	    if($_ ne "PatLen" && $_ ne "upstream" && $_ ne "downstream" && $_ ne "kazu" && $_ ne "pat" && $_ ne "gbk" && $_ ne "pos"){
		unless(defined($seqnum{$_})){
		    $offset=$tmp=0;
		    while($tmp!=-1){
			$tmp=index($gb->{SEQ},$_,$offset);
			$offset=$tmp+1;
			$seqnum{$_}++ if($tmp!=-1);
		    }
		}
		$ratio{$_}=$seqnum{$_}/(length($gb->{SEQ})-length($_)+1);
		if($ratio{$_} != 0){
		    $loc_ratio[$upstream-$i]{$_}=$$hash[$upstream-$i]{$_}/$$hash[$upstream-$i]{kazu};
		    $loc_ratio[$upstream-$i]{$_}=$loc_ratio[$upstream-$i]{$_}/$ratio{$_};
		    $loc_total[$upstream-$i]+=$loc_ratio[$upstream-$i]{$_};
		}
		else{
		    $entropy[$upstream-$i]=0;
		}
	    }
	}
	
	foreach(keys(%{$$hash[$upstream-$i]})){
	    if($_ ne "PatLen" && $_ ne "upstream" && $_ ne "downstream" && $_ ne "kazu" && $_ ne "pat" && $_ ne "gbk" && $_ ne "pos"){
		$entropy[$upstream-$i]-=$loc_ratio[$upstream-$i]{$_}/$loc_total[$upstream-$i]
		    *&logmod($loc_ratio[$upstream-$i]{$_}/$loc_total[$upstream-$i])/&logmod(2.0)/length($_);
	    }
	}
	if($printer=~/f/){
	    printf FILE "%d,%.5f\n",-$i ,$entropy[$upstream-$i];
	}
	if($printer eq 'stdout'){
	    &msg_send(sprintf("%d\t%.5f\n",-$i ,$entropy[$upstream-$i]));
	}
	push(@y,-$i);
    }
    close(FILE);
    
    if($printer=~/g/ || $printer=~/show/){
	&G::Tools::Graph::_UniMultiGrapher(\@y,\@entropy,-filename=>$filename,-x=>"position",-y=>"entropy",-title=>"entropy");
    }

    msg_gimv("graph/$filename") if($printer=~/show/);

    $tmp="entropy";
    @entropy=($position,$tmp,$upstream,$downstream-1,@entropy);
    return @entropy;
}


=head2 base_information_content

  Name: base_information_content   -   calculates and graphs the sequence conservation using information content

  Description:
    This function calculates and graphs the sequence conservation in regions 
    around the start/stop codons using information content.

  Usage:
    ($position, "IC", $upstream-length, $downstream-length, @entropyvalues) = base_information_content($genome);

 Options:
    -output     "stdout" for standard output
                "f" for file output in directory "data",
                "g" for graph output in directory "graph",
                "show" for graph output and display (default: "show")
    -filename   output filename (default: "base_information_content.png" for -output=>"g",
                                          "base_information_content.csv" for -output=>"f")
    -position   either "start" (around start codon) or "end" (around stop codon)
                to create the PWM (default: "start");
    -PatLen     length of oligomer to count (default: 3)
    -upstream   length upstream of specified position to create PWM (default: 30)
    -downstream length downstream of specified position to create PWM (default: 30)

  Author: 
    Koya Mori(mory@g-language.org)

  History:
    20010707-01 initial posting (ported from ATG7 program, "buns::rseq")

=cut

sub base_information_content{
    &opt_default(output=>"show",filename=>"base_information_content.png",PatLength=>3,upstream=>30,downstream=>30,position=>"start");
    my @args=opt_get(@_);

    my $gb=opt_as_gb(shift @args);
    my $position=opt_val("position");
    my $PatLen=opt_val("PatLength");
    my $upstream=opt_val("upstream");
    my $downstream=opt_val("downstream");
    my $printer=opt_val("output");
    my $filename=opt_val("filename");
    my $hash;
    my @IC;
    my $hgenome;
    my %ratio;
    my @loc_ratio;
    my $offset;
    my %seqnum;
    my $tmp;
    my @y;
    
    $hash=base_counter($gb,-output=>"n",-position=>$position,-PatLength=>$PatLen,-upstream=>$upstream,-downstream=>$downstream);
    $downstream++;

    if($printer=~/f/){
        $filename =~ s/\.png$/\.csv/;
        mkdir ("data", 0777);
        open(FILE,">>data/$filename");
    }

    for(my $i=$upstream;$i>-$downstream;$i--){
	$hgenome=0;
	foreach(keys(%{$$hash[$upstream-$i]})){
	    if($_ ne "PatLen" && $_ ne "upstream" && $_ ne "downstream" && $_ ne "kazu" && $_ ne "pat" && $_ ne "gbk" && $_ ne "pos"){
		unless(defined($seqnum{$_})){
		    $offset=$tmp=0;
		    while($tmp!=-1){
			$tmp=index($gb->{SEQ},$_,$offset);
			$offset=$tmp+1;
			$seqnum{$_}++ if($tmp!=-1);
		    }
		}
		$ratio{$_}=$seqnum{$_}/(length($gb->{SEQ})-length($_)+1);
		$hgenome-=$ratio{$_} * logmod($ratio{$_})/logmod(2.0);
		$loc_ratio[$upstream-$i]{$_}=$$hash[$upstream-$i]{$_}/$$hash[$upstream-$i]{kazu};
	    }
	}
	
	$IC[$upstream-$i]=$hgenome - (4-1)/(2.0 * logmod(2.0) * $$hash[$upstream-$i]{kazu});

	foreach(keys(%{$$hash[$upstream-$i]})){
	    if($_ ne "PatLen" && $_ ne "upstream" && $_ ne "downstream" && $_ ne "kazu" && $_ ne "pat" && $_ ne "gbk" && $_ ne "pos"){
		$IC[$upstream-$i]+=logmod($loc_ratio[$upstream-$i]{$_})/logmod(2.0)*$loc_ratio[$upstream-$i]{$_};
	    }
	}
	if($printer=~/f/){
	    printf FILE "%d,%.5f\n",-$i ,$IC[$upstream-$i];
	}
	if($printer eq 'stdout'){
	    &msg_send(sprintf("%d\t%.5f\n",-$i ,$IC[$upstream-$i]));
	}
	push(@y,-$i);
    }

    close(FILE);

    if($printer=~/g/ || $printer=~/show/){
	&G::Tools::Graph::_UniMultiGrapher(\@y,\@IC,-filename=>$filename,-x=>"position",-y=>"information_content",-title=>"information_content");
    }

    msg_gimv("graph/$filename") if($printer=~/show/);

    $tmp="IC";
    @IC=($position,$tmp,$upstream,$downstream-1,@IC);
    return \@IC;
}


=head2 base_relative_entropy

  Name: base_relative_entropy   -   calculates and graphs the sequence conservation using Kullback-Leibler divergence (relative entropy)

  Description:
    This function calculates and graphs the sequence conservation in regions 
    around the start/stop codons using Kullback-Leibler divergence (relative entropy).

  Usage:
    ($position, "RE", $upstream-length, $downstream-length, @entropyvalues) = base_relative_entropy($genome);

  Options:
    -output     "stdout" for standard output
                "f" for file output in directory "data",
                "g" for graph output in directory "graph",
                "show" for graph output and display (default: "show")
    -filename   output filename (default: "base_relative_entropy.png" for -output=>"g",
                                          "base_relative_entropy.csv" for -output=>"f")
    -position   either "start" (around start codon) or "end" (around stop codon)
                to create the PWM (default: "start");
    -PatLen     length of oligomer to count (default: 3)
    -upstream   length upstream of specified position to create PWM (default: 30)
    -downstream length downstream of specified position to create PWM (default: 30)

  Author: 
    Koya Mori(mory@g-language.org)

  History:
    20010714-01 initial posting (ported from ATG7 program, "buns::ic")

=cut

sub base_relative_entropy{
    &opt_default(output=>"show",filename=>"base_relative_entropy.png",PatLength=>3,upstream=>30,downstream=>30,position=>"end");
    my @args=opt_get(@_);

    my $gb=opt_as_gb(shift @args);
    my $position=opt_val("position");
    my $PatLen=opt_val("PatLength");
    my $upstream=opt_val("upstream");
    my $downstream=opt_val("downstream");
    my $printer=opt_val("output");
    my $filename=opt_val("filename");
    my $hash;
    my @RE;
    my %ratio;
    my @loc_ratio;
    my $offset;
    my %seqnum;
    my $tmp;
    my @y;
    
    $hash=base_counter($gb,-output=>"n",-position=>$position,-PatLength=>$PatLen,-upstream=>$upstream,-downstream=>$downstream);
    $downstream++;

    if($printer=~/f/){
        $filename =~ s/\.png$/\.csv/;
        mkdir ("data", 0777);
        open(FILE,">>data/$filename");
    }

    for(my $i=$upstream;$i>-$downstream;$i--){
	foreach(keys(%{$$hash[$upstream-$i]})){
	    if($_ ne "PatLen" && $_ ne "upstream" && $_ ne "downstream" && $_ ne "kazu" && $_ ne "pat" && $_ ne "gbk" && $_ ne "pos"){
		unless(defined($seqnum{$_})){
		    $offset=$tmp=0;
		    while($tmp!=-1){
			$tmp=index($gb->{SEQ},$_,$offset);
			$offset=$tmp+1;
			$seqnum{$_}++ if($tmp!=-1);
		    }
		}
		$ratio{$_}=$seqnum{$_}/(length($gb->{SEQ})-length($_)+1);
		$loc_ratio[$upstream-$i]{$_}=$$hash[$upstream-$i]{$_}/$$hash[$upstream-$i]{kazu};
		$RE[$upstream-$i]+=logmod($loc_ratio[$upstream-$i]{$_}/$ratio{$_})/logmod(2.0)*$loc_ratio[$upstream-$i]{$_};
	    }
	}
	if($printer=~/f/){
	    printf FILE "%d,%.5f\n",-$i ,$RE[$upstream-$i];
	}
	if($printer eq 'stdout'){
	    &msg_send(sprintf("%d\t%.5f\n",-$i ,$RE[$upstream-$i]));
	}
	push(@y,-$i);
    }
    close(FILE);

     if($printer=~/g/ || $printer=~/show/){
	&G::Tools::Graph::_UniMultiGrapher(\@y,\@RE,-filename=>$filename,-x=>"position",-y=>"relative_entropy",-title=>"relative_entropy");
    }

    msg_gimv("graph/$filename") if($printer=~/show/);
   
    $tmp="RE";
    @RE=($position,$tmp,$upstream,$downstream-1,@RE);
    return \@RE;
}


=head2 base_z_value

  Name: base_z_value   -   extracts conserved oligomers per position using Z-score

  Description:
    This function calculates and extracts conserved oligomers per position using Z-score,
    in regions around the start/stop codons using Z-score

  Usage:
    $structure = base_z_value($genome);

    Here the $structure is a reference to the array of hashes, intended for 
    internal usage.


  Options:
    -output     "stdout" for standard output (default: "stdout")
                "f" for file output in directory "data"
    -filename   output filename (default: "base_z_value.csv")
    -limit      rank threshold for showing the conserved oligomer (default: 5)
    -position   either "start" (around start codon) or "end" (around stop codon)
                to create the PWM (default: "start");
    -PatLen     length of oligomer to count (default: 3)
    -upstream   length upstream of specified position to create PWM (default: 30)
    -downstream length downstream of specified position to create PWM (default: 30)

  Author: 
    Koya Mori(mory@g-language.org)

  History:
    20010714-01 initial posting (ported from ATG7 program, "buns::ic")

=cut

sub base_z_value{
    &opt_default(limit=>5,output=>"stdout",filename=>"base_z_value.csv",PatLength=>3,upstream=>30,downstream=>30,position=>"start");
    my @args=opt_get(@_);

    my $gb=opt_as_gb(shift @args);
    my $position=opt_val("position");
    my $PatLen=opt_val("PatLength");
    my $upstream=opt_val("upstream");
    my $downstream=opt_val("downstream");
    my $printer=opt_val("output");
    my $filename=opt_val("filename");
    my $limit=opt_val("limit");
    my $hash;
    my @Z;
    my @Z_value;
    my @Z_tmp;
    my %ratio;
    my $offset;
    my %seqnum;
    my $tmp;
    my @tmp;
    my @key;
    my $c;
    
    $hash=base_counter($gb,-output=>"n",-position=>$position,-PatLength=>$PatLen,-upstream=>$upstream,-downstream=>$downstream);
    $downstream++;

    if($printer=~/f/){
        mkdir ("data", 0777);
        open(FILE,">>data/$filename");
    }

    for(my $i=$upstream;$i>-$downstream;$i--){
	foreach(keys(%{$$hash[$upstream-$i]})){
	    if($_ ne "PatLen" && $_ ne "upstream" && $_ ne "downstream" && $_ ne "kazu" && $_ ne "pat" && $_ ne "gbk" && $_ ne "pos"){
		unless(defined($seqnum{$_})){
		    $offset=$tmp=0;
		    while($tmp!=-1){
			$tmp=index($gb->{SEQ},$_,$offset);
			$offset=$tmp+1;
			$seqnum{$_}++ if($tmp!=-1);
		    }
		}
		$ratio{$_}=$seqnum{$_}/(length($gb->{SEQ})-length($_)+1);
		$Z_tmp[$upstream-$i]{$_}=($$hash[$upstream-$i]{$_}-$$hash[$upstream-$i]{kazu}*$ratio{$_})
		    /sqrt($$hash[$upstream-$i]{kazu}*$ratio{$_}*(1-$ratio{$_}));
	    }
	}

	foreach(keys(%{$Z_tmp[$upstream-$i]})){
	    if($_ ne "PatLen" && $_ ne "upstream" && $_ ne "downstream" && $_ ne "kazu" && $_ ne "pat" && $_ ne "gbk" && $_ ne "pos"){
		push(@Z_value,sprintf("%.5f",$Z_tmp[$upstream-$i]{$_})." ".$_);
	    }
	}

	$c=1;
	foreach(sort{$b <=> $a}@Z_value){
	    @tmp=split(/ +/,$_);
	    $Z[$upstream-$i]{$c}{$tmp[1]}=$tmp[0];
	    $c++;
	    last if($c>$limit);
	}

	if($i==0){
	    if($printer eq "f"){
		print FILE "position:0,";
	    }
	    if($printer eq "stdout"){
		&msg_send("position:0\n");
	    }
	}
	else{
	    if($printer eq "f"){
		print FILE "position:",-1*$i,",";
	    }
	    if($printer eq "stdout"){
		&msg_send("position:",-1*$i,"\n");
	    }
	}
	foreach(sort{$a <=> $b} keys(%{$Z[$upstream-$i]})){
	    if($printer eq "f"){
		print FILE $_,",",(keys(%{$Z[$upstream-$i]{$_}}))[0],",",
		$Z[$upstream-$i]{$_}{(keys(%{$Z[$upstream-$i]{$_}}))[0]},"\n";
	    }
	    if($printer eq "stdout"){
		&msg_send($_,"\t",(keys(%{$Z[$upstream-$i]{$_}}))[0],"\t",
		$Z[$upstream-$i]{$_}{(keys(%{$Z[$upstream-$i]{$_}}))[0]},"\n");
	    }
	}
	if($printer eq "f"){
	    print FILE "";
	}
	if($printer eq "stdout"){
	    &msg_send("\n");
	}
	@Z_value=();
    }
    return \@Z;
    close(FILE);
}


#base_individual_information_matrix ver.20010714-01
#scripting by Koya Mori(mory@g-language.org)
#This program calculates individual information matrix of base_counter.
#need &base_counter.
#(array result)=&base_individual_information_matrix(hash base_counter_result,  boolean debug);
sub base_individual_information_matrix{
    &opt_default(output=>"stdout",filename=>"consensus.csv",PatLength=>3,upstream=>30,downstream=>30,position=>"end");
    my @args=opt_get(@_);

    my $gb=opt_as_gb(shift @args);
    my $position=opt_val("position");
    my $PatLen=opt_val("PatLength");
    my $upstream=opt_val("upstream");
    my $downstream=opt_val("downstream");
    my $printer=opt_val("output");
    my $filename=opt_val("filename");
    my $hash;
    my $hgenome;
    my $tmp;
    my $offset;
    my %seqnum;
    my %ratio;
    my @loc_ratio;
    my @IIM;
    my %q;

    $hash=base_counter($gb,-output=>"n",-position=>$position,-PatLength=>$PatLen,-upstream=>$upstream,-downstream=>$downstream);
    $downstream++;

    for(my $i=$upstream;$i>-$downstream;$i--){
	$hgenome=0;
	foreach(keys(%{$$hash[$upstream-$i]})){
	    if($_ ne "PatLen" && $_ ne "upstream" && $_ ne "downstream" && $_ ne "kazu" && $_ ne "pat" && $_ ne "gbk" && $_ ne "pos"){
		unless(defined($seqnum{$_})){
		    $offset=$tmp=0;
		    while($tmp!=-1){
			$tmp=index($gb->{SEQ},$_,$offset);
			$offset=$tmp+1;
			$seqnum{$_}++ if($tmp!=-1);
		    }
		}
		$ratio{$_}=$seqnum{$_}/(length($gb->{SEQ})-length($_)+1);
		$hgenome-=$ratio{$_} * logmod($ratio{$_})/logmod(2.0);
		$loc_ratio[$upstream-$i]{$_}=$$hash[$upstream-$i]{$_}/$$hash[$upstream-$i]{kazu};
		$loc_ratio[$upstream-$i]{$_}=1/($$hash[$upstream-$i]{kazu}+2) if($loc_ratio[$upstream-$i]{$_}==0);
	    }
	}
	
	foreach(keys(%{$$hash[$upstream-$i]})){
	    if($_ ne "PatLen" && $_ ne "upstream" && $_ ne "downstream" && $_ ne "kazu" && $_ ne "pat" && $_ ne "gbk" && $_ ne "pos"){
		$IIM[$upstream-$i]{$_}=$hgenome-(4-1)/(2.0*logmod(2.0)*$$hash[$upstream-$i]{kazu})+
		    logmod($loc_ratio[$upstream-$i]{$_})/logmod(2.0);
	    }
	} 
    }

    if($printer eq "f"){
	open(FILE,">>$filename");
	print FILE "Pattern";
	for(my $i=$upstream;$i>-$downstream;$i--){
	    print FILE ",$i";
	}
	print FILE "\n";
	foreach(sort keys(%{$$hash[0]{pat}})){
	    print FILE $_;
	    for(my $i=$upstream;$i>-$downstream;$i--){
		print FILE sprintf(",%.3f",$IIM[$i]{$_});
	    }
	    print FILE "\n";
	}
	close(FILE);
    }
    if($printer eq "stdout"){
	for(my $i=$upstream;$i>-$downstream;$i += 0){
	    for(my $c=0;$c<102;$c++){
		&msg_send("-");
	    }
	    &msg_send("\npos");
	    for(my $c=0;$c<10;$c++){
		&msg_send("\t",-$i+$q{$_}*10);
		$i--;
		last if($i<=-$downstream);    
	    }
	    &msg_send("\n\n");

	    foreach(sort keys %{$$hash[0]{pat}}){
		&msg_send("$_\|");
		for(my $c=0;$c<10;$c++){
		    &msg_send(sprintf("\t%.3f",$IIM[$c+$q{$_}*10]{$_}));
		    last if($c+$q{$_}*10>=$downstream+$upstream-1);
		}
		&msg_send("\n");
		$q{$_}++;
	    }
	}
    }
    return \@IIM;
}


=head2 consensus_z

  Name: consensus_z   -   calculate consensus in given array of sequences

  Description:
    Calculates the consensus of given list of sequences, using Z-score.
    Consensus with Z value greater than int high_z (default is 1) is 
    capitalized, and consensus with Z value less than int low_z 
    (default is 0.2) is shown as '-'.

  Usage:
     $consensus_sequence = &consensus_z(\@array_of_sequences);

      @array_of_sequences is a reference to an array of sequences.
      The sequences must be of same lengths.
       eg. consensus_z(["atgc","ctaa","tttt","cttg"]);
      returns 'cTt-'

  Options:
     -high      z value greater than which is significant (default: 1)
     -low       z value less than which is insignificant (default: 0.2)
     -filename  outfile name (default: consensus_z.png for -output=>"g",
                                       consensus_z.csv for -output=>"f")
     -output    "g" for graph, "f" for file, "show" for display

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
   20010906-01 update with options
   20010713-01 initial posting

=cut

sub consensus_z{
    require Statistics::Descriptive;

    &opt_default(high=>1, low=>0.2, output=>"show", 
		  filename=>"consensus_z.png");
    my @args = opt_get(@_);
    my $ref = shift @args;
    my $high_z = opt_val("high");
    my $low_z = opt_val("low");
    my $filename = opt_val("filename");
    $filename =~ s/\.png$/\.csv/ if (opt_val("output") eq "f");
    my $output = opt_val("output");
    my @inseq = @$ref;
    my ($i,$tmp,$outseq,@seq,@array,%nuc_table, @out, @pos);
    my $length = length($inseq[0]);
    my $rows = @inseq;
    my $nuc_max = 0;

    foreach $tmp (@inseq){
	for ($i = 0; $i < $length; $i++){
	    $seq[$i]{substr($tmp, $i,1)} += 1/$rows;
	    $nuc_table{substr($tmp, $i,1)} ++;
	    push (@array, $seq[$i]{substr($tmp,$i,1)});
	}
    }

    foreach $tmp (keys %nuc_table){$nuc_max ++;}
    
    for ($i = 0; $i < $length; $i++){
	my $max = 0.0;
	my $max_index;
	my $nuc = '';

	foreach $nuc (keys %{$seq[$i]}){
	    if ($seq[$i]{$nuc} > $max){
		$max_index = lc($nuc);
		$max = $seq[$i]{$nuc};
	    }
	}

	my $stat = Statistics::Descriptive::Full->new();
	$stat->add_data(@array);

	my $z = ($max - $stat->mean())/$stat->standard_deviation();
	$max_index = '-' if ($z <= $low_z);
	$max_index = uc($max_index) if ($z >= $high_z);
	$outseq = $outseq . $max_index;
	push (@out, $z);
	push (@pos, $i);
    }

    if ($output eq 'g' || $output eq 'show'){
	&G::Tools::Graph::_UniMultiGrapher(
					 \@pos, \@out,-x=>"position", 
					 -y=>"Z value",
					 -title=>"Consensus by Z value", 
					 -filename=>$filename
					 );

	msg_gimv("graph/$filename") if ($output eq "show");
    }elsif ($output eq 'f'){
	$, = ',';
	mkdir ('data', 0777);
	open(OUT, '>data/' . $filename);
	print OUT @out, "\n";
	close(OUT);
	$, = '';
    }
    return $outseq;
}



1;
