#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# This file is part of G-language Genome Analysis Environment package
#
#     Copyright (C) 2001-2007 Keio University
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# 
#   $Id: GCskew.pm,v 1.3 2002/08/16 15:03:36 gaou Exp $
#
# G-language GAE is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
# 
# G-language GAE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with G-language GAE -- see the file COPYING.
# If not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# 
#END_HEADER
#

package G::Seq::GCskew;

use SubOpt;
use G::Messenger;
use G::Tools::Graph;

use strict;
use Statistics::Descriptive;
use SelfLoader;

# Exporter provides the import() method that copies the names listed in
# @EXPORT into the namespace of any package that "use"s this module.
require Exporter;

# Inherit import() from Exporter.
our @ISA = qw(Exporter);

# Functions exported by default. Their bodies are located after the __DATA__
# marker of this file (this module loads SelfLoader above).
# NOTE: do not place comments inside the qw() list; they would be treated
# as list elements.
our @EXPORT = qw(
	     find_ori_ter
	     gcskew
	     cum_gcskew
	     genomicskew
	     gcwin
	     leading_strand
	     query_strand
	     query_arm
	     set_strand
	     set_gc3
	     genes_from_ori
	     rep_ori_ter
	     view_cds
	     dist_in_cc
);


__DATA__

#:::::::::::::::::::::::::::::::::
#       Perldoc
#:::::::::::::::::::::::::::::::::


=head1 NAME

    G::Seq::GCskew - Analysis methods related to GC skew and genomic strand bias

=head1 DESCRIPTION

    This class is a part of G-language Genome Analysis Environment, 
    collecting sequence analysis methods related to GC skew.

=cut



#:::::::::::::::::::::::::::::::::
#       Let the code begin...
#:::::::::::::::::::::::::::::::::


=head2 view_cds

  Name: view_cds   -   displays a graph of nucleotide contents around start and stop codons

  Description:
    This method creates a graph showing the average A,T,G,C contents
    around start/stop codons. This is useful to view consensus around
    start/stop codons and to find characteristic pattern in CDS. 
    
  Usage : 
    view_cds(G instance);

  Options:
    -length    length in bases to show around start/stop codons
               (default: 100)
    -gap       gap shown in graph in between start/stop codon neighbors
               (default: 3)
    -filename  outfile name   (default: view_cds.png for graph, 
               view_cds.csv for file)
    -output    "f" for file, "g" for graph, "show" to display graph. 
               (default: "show")

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20010906-01 initial posting

=cut



sub view_cds{
    # Average A/T/G/C percentage at each position around the start and stop
    # codons of every CDS returned by $gb->cds().
    #
    # Column layout of the profile (0-based):
    #   [0 .. $span-1]                    start-codon neighbourhood
    #   [$span .. $span+$gap-1]           empty gap separating the two halves
    #   [$span+$gap .. 2*$span+$gap-1]    stop-codon neighbourhood
    # where $span = 2 * -length + 3 (upstream + codon + downstream).
    #
    # Options: -length, -gap, -filename, -output ("f", "g" or "show").
    # Returns 1 on success, nothing if the output file cannot be opened.
    opt_default(length=>100, filename=>"view_cds.png",
                gap=>3, output=>"show", application=>"gimv");
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);

    my $length   = opt_val("length");
    my $filename = opt_val("filename");
    my $output   = opt_val("output");
    my $gap      = opt_val("gap");

    # Switch the default graph name to a CSV name for file output.
    $filename = "view_cds.csv" if ($output eq "f" && $filename eq "view_cds.png");

    my $span  = $length * 2 + 3;
    my $total = $span * 2 + $gap;

    my %profile;
    foreach my $base (qw(a t g c)){
        $profile{$base} = [ (0) x $total ];
    }

    # Use the very list that is iterated below as the denominator, so that
    # the four percentages at a fully covered position add up to 100 and an
    # empty CDS list cannot cause a division by zero.
    my @cds_list = $gb->cds();
    my $numcds = scalar(@cds_list);
    my $share = $numcds ? 100 / $numcds : 0;

    foreach my $cds (@cds_list){
        my $start_region = $gb->before_startcodon($cds, $length);
        $start_region   .= $gb->startcodon($cds);
        $start_region   .= $gb->after_startcodon($cds, $length);

        my $stop_region = $gb->before_stopcodon($cds, $length);
        $stop_region   .= $gb->stopcodon($cds);
        $stop_region   .= $gb->after_stopcodon($cds, $length);

        # The stop half always begins at the fixed column $span + $gap; it
        # must not depend on the length of the string actually returned.
        foreach my $part ([$start_region, 0], [$stop_region, $span + $gap]){
            my ($seq, $offset) = @{$part};

            for my $i (0 .. length($seq) - 1){
                # Characters other than a/t/g/c are not counted.
                my $column = $profile{ substr($seq, $i, 1) };
                next unless ($column);
                $column->[$i + $offset] += $share;
            }
        }
    }

    if ($output eq "g" || $output eq "show"){
        my @pos = (1 .. $total);

        mkdir ("graph", 0777);
        _UniMultiGrapher(
                         \@pos, -x => "position", -y => "percentage",
                         $profile{a}, -x1=>"A", $profile{t}, -x2=>"T",
                         $profile{g}, -x3=>"G", $profile{c}, -x4=>"C",
                         -filename => $filename,
                         -title => "Base Contents Around Start/Stop Codons"
                         );
        msg_gimv("graph/$filename") if ($output eq "show");
    }elsif ($output eq "f"){
        mkdir ("data", 0777);

        my $path = 'data/' . $filename;
        my $out;
        unless (open($out, '>', $path)){
            warn("view_cds: cannot open $path for writing: $!\n");
            return;
        }

        print $out "position,A,T,G,C\n";
        for my $i (0 .. $total - 1){
            printf $out "%d,%3.2f,%3.2f,%3.2f,%3.2f\n", $i + 1,
                $profile{a}->[$i], $profile{t}->[$i],
                $profile{g}->[$i], $profile{c}->[$i];
        }
        close($out) or warn("view_cds: error while closing $path: $!\n");
    }

    return 1;
}


=head2 find_ori_ter

 Name: find_ori_ter   -   predict the replication origin and terminus in bacterial genomes

 Description:
    Predicts the replicational origin and terminus in circular bacterial genomes.

 Usage:
    (int origin, int terminus) = find_ori_ter(G instance);

 Options:
   -output    output toggle option (default: stdout)
   -purine    use purine skew for calculation (default: 0)
   -keto      use keto skew for calculation (default: 0)

 Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

 History:
   20060711-01 added purine and keto options
   20060707-01 calculation is now based on single bp resolution rather than with windows
              but it is now much slower...
   20060221-01 speed up using Statistics::Descriptive
   20010905-01 options update
   20010326-01 initial posting

=cut


sub find_ori_ter {
    # Predict replication origin and terminus from the cumulative skew
    # computed at single-base resolution.
    #
    # Each base is recoded to a digit d and contributes (d - 1) to the
    # running sum:
    #   default : c => +1, g => -1, others 0
    #   -purine : t,c => +1, a,g => -1
    #   -keto   : a,c => +1, t,g => -1
    # Returns (index of the maximum, index of the minimum) of the running
    # sum, as 0-based positions, reported as (origin, terminus).
    opt_default(output=>"stdout", purine=>0, keto=>0);
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);
    my $seq = $gb->{SEQ};            # work on a copy; tr/// below mutates it
    my $output = opt_val("output");
    my $purine = opt_val("purine");
    my $keto = opt_val("keto");
    my $verbose = ($output eq 'stdout');

    msg_send("\nfind_ori_ter:\n") if ($verbose);

    if ($purine){
        $seq =~ tr/atgcn/02021/;
    }elsif ($keto){
        $seq =~ tr/atgcn/20021/;
    }else{
        $seq =~ tr/atgcn/11021/;
    }

    my @data;
    my $val = 0;    # start from 0 so leading ambiguous bases yield 0, not undef

    # Stop at the last base: iterating up to length($seq) inclusive would
    # append a phantom data point located beyond the end of the sequence.
    for my $i (0 .. length($seq) - 1){
        my $code = substr($seq, $i, 1);
        # Characters left untouched by tr/// (not a/t/g/c/n) are skipped.
        $val += $code - 1 if ($code =~ /^\d$/);
        push(@data, $val);
    }

    my $stat = Statistics::Descriptive::Full->new();
    $stat->add_data(@data);
    my $maxi = $stat->maxdex();
    my $mini = $stat->mindex();

    if ($verbose){
        msg_send("   Predicted Origin:   " , $maxi, "\n");
        msg_send("   Predicted Terminus: " , $mini, "\n\n");
    }

    return ($maxi, $mini);
}






=head2 rep_ori_ter

 Name: rep_ori_ter   -   get the positions of replication origin and terminus

 Description:
    This method returns the replicational origin and terminus.
    If not known, the origin and terminus are predicted using
    find_ori_ter().

 Usage: 
    (int ori, int ter) = rep_ori_ter(G instance);

 Options:
   none

  Author: Kazuharu Gaou Arakawa

  History:
    20011030-01 initial posting

=cut


sub rep_ori_ter {
    # Return (origin, terminus) for the given G instance.
    # Lookup order: values cached in FEATURE0, then the curated table below
    # (keyed by LOCUS id), then a prediction by find_ori_ter().
    # Whatever is found is stored in FEATURE0 for subsequent calls.
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);
    my $id = $gb->{LOCUS}->{id};

    # A stored terminus marks the pair as already resolved.
    if (length $gb->{FEATURE0}->{terminus}){
        return ($gb->{FEATURE0}->{origin}, $gb->{FEATURE0}->{terminus});
    }

    # Published positions, converted from 1-based to 0-based coordinates.
    my %published = (
        # Escherichia coli K12 (Freeman et al 1998)
        'U00096'    => [3923500 - 1, 1588800 - 1],
        'NC_000913' => [3923500 - 1, 1588800 - 1],
        # Bacillus subtilis (Freeman et al 1998)
        'AL009126'  => [1 - 1, 2017000 - 1],
        'NC_000964' => [1 - 1, 2017000 - 1],
        # Haemophilus influenzae (Freeman et al 1998)
        'L42023'    => [603000 - 1, 1518000 - 1],
        'NC_000907' => [603000 - 1, 1518000 - 1],
        # Salmonella typhi (Parkhill et al 2001)
        'AL513382'  => [3765000 - 1, 1437000 - 1],
        'NC_003198' => [3765000 - 1, 1437000 - 1],
    );

    my ($ori, $ter);
    if (defined($id) && exists $published{$id}){
        ($ori, $ter) = @{ $published{$id} };
    }else{
        # Any -output value other than "stdout" keeps the prediction quiet.
        ($ori, $ter) = G::Seq::GCskew::find_ori_ter($gb, -output=>"/dev/null");
    }

    $gb->{FEATURE0}->{origin} = $ori;
    $gb->{FEATURE0}->{terminus} = $ter;

    return ($ori, $ter);
}


=head2 leading_strand 

 Name: leading_strand   -   get the sequences of leading strands

 Description:
   This method returns the leading strands from origin and terminus 
   of replication calculated with rep_ori_ter()

 Usage: 
    (string seq1, string seq2) = &leading_strand(G instance);

 Options:
    none

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20011030-01 initial posting

=cut



sub leading_strand {
    # Return the two leading-strand sequences of the chromosome:
    #   first  : the stretch read from origin to terminus
    #   second : the remaining stretch (terminus to origin), passed through
    #            G::Seq::Util::_complement
    # Origin and terminus come from rep_ori_ter().
    my ($gb) = opt_get(@_);
    my ($ori, $ter) = rep_ori_ter($gb);

    if ($ori <= $ter){
        # Origin precedes terminus: the first arm is contiguous and the
        # other one wraps around the end of the sequence.
        my $arm = substr($gb->{SEQ}, $ori, $ter - $ori);
        my $rest = substr($gb->{SEQ}, $ter) . substr($gb->{SEQ}, 0, $ori);
        my $other = G::Seq::Util::_complement($rest);
        return ($arm, $other);
    }

    # Origin lies after terminus: the first arm is the one that wraps.
    my $arm = substr($gb->{SEQ}, $ori);
    $arm .= substr($gb->{SEQ}, 0, $ter);
    my $other = G::Seq::Util::_complement(substr($gb->{SEQ}, $ter, $ori - $ter));

    return ($arm, $other);
}


=head2 gcskew

 Name: gcskew   -   calculate the GC skew of the given genome

 Description:
   This program calculates and graphs the GC skew. 

 Usage: 
    array @gcskew = gcskew(G instance);

 Options:
   -window      window size to observe (default: 10000)
   -at          1 when observing AT skew instead of GC skew (default: 0)
   -purine      1 when observing purine (AG/TC) skew (default: 0)
   -keto        1 when observing keto (TG/AC) skew (default: 0)
   -output      f for file output in directory "data", 
                g for graph output in directory "graph",
                show for graph output and display (default: "show")
   -filename    output filename (default: "gcskew.png" for -output=>"g",
                                          "gcskew.csv" for -output=>"f")

 Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

 History:
   20060711-01 added purine and keto skew
   20010905-01 update with options
   20010727-01 initial posting

=cut



sub gcskew {
    # Skew of consecutive, non-overlapping windows along the sequence.
    #
    # For every complete window the value is (minus - plus)/(minus + plus),
    # formatted with six decimals (0 when both counts are zero), where
    #   default : plus = g,   minus = c
    #   -at     : plus = a,   minus = t
    #   -purine : plus = a+g, minus = t+c
    #   -keto   : plus = t+g, minus = a+c
    # A trailing partial window is ignored.
    #
    # Output is a graph ("g"/"show") or data/<filename> ("f").
    # Returns the list of skew values; an empty list when -window < 1.
    opt_default(window=>10000, at=>0, purine=>0, keto=>0, output=>"show",
                application=>"gimv", filename=>"gcskew.png");
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);
    my $ref = \$gb->{SEQ};
    my $window = opt_val("window");
    my $output = opt_val("output");
    my $filename = opt_val("filename");
    $filename =~ s/\.png$/\.csv/ if ($output eq 'f');
    my $at = opt_val("at");
    my $purine = opt_val("purine");
    my $keto = opt_val("keto");

    # A window smaller than one base would make the loop below run forever.
    if ($window < 1){
        warn("gcskew: -window must be 1 or larger (got '$window')\n");
        return;
    }

    my (@gcskew, @location);
    for (my $i = 0; length($$ref) - ($window * $i) >= $window; $i++){
        # Extract the window once and count on the copy.
        my $chunk = substr($$ref, $window * $i, $window);
        my ($g, $c);

        if ($at){
            $g = ($chunk =~ tr/a//);
            $c = ($chunk =~ tr/t//);
        }elsif ($purine){
            $g = ($chunk =~ tr/ag//);
            $c = ($chunk =~ tr/tc//);
        }elsif ($keto){
            $g = ($chunk =~ tr/tg//);
            $c = ($chunk =~ tr/ac//);
        }else{
            $g = ($chunk =~ tr/g//);
            $c = ($chunk =~ tr/c//);
        }

        $gcskew[$i] = ($c + $g <= 0) ? 0 : sprintf("%.6f", ($c - $g) / ($c + $g));
        $location[$i] = $i * $window;
    }

    my $title = "GC skew";
    if ($at){
        $title = "AT skew";
    }elsif ($purine){
        $title = "Purine skew";
    }elsif ($keto){
        $title = "Keto skew";
    }

    if ($output eq 'g' || $output eq 'show'){
        mkdir ("graph", 0777);

        _UniMultiGrapher(
                         \@location,\@gcskew,
                         -x=>"bp", -y=>$title,
                         -filename=>$filename,
                         -title=>$title,
                         -style=>"lines", -type=>"columns",
                         );
        msg_gimv("graph/" . $filename) if ($output eq 'show');
    }elsif ($output eq 'f'){
        mkdir ("data", 0777);

        my $path = "data/" . $filename;
        if (open(my $out, '>', $path)){
            print $out "location,$title\n";
            for my $j (0 .. $#gcskew){
                print $out $location[$j], ",", $gcskew[$j], "\n";
            }
            close($out) or warn("gcskew: error while closing $path: $!\n");
        }else{
            # Report the failure but still hand the values back to the caller.
            warn("gcskew: cannot open $path for writing: $!\n");
        }
    }

    return @gcskew;
}


=head2 cum_gcskew

 Name: cum_gcskew   -   calculate the cumulative GC skew of the given genome

 Description:
   This program calculates and graphs the cumulative GC skew. 

  Usage: array @cum_gcskew = cum_gcskew(G instance);

 Options:
   -window      window size to observe (default: 10000)
   -at          1 when observing AT skew instead of GC skew (default: 0)
   -purine      1 when observing purine (AG/TC) skew (default: 0)
   -keto        1 when observing keto (TG/AC) skew (default: 0)
   -output      f for file output in directory "data", 
                g for graph output in directory "graph",
                show for graph output and display (default: "show")
   -filename    output filename (default: "cum_gcskew.png" for -output=>"g",
                                          "cum_gcskew.csv" for -output=>"f")
   -application application to open png image (default: "gimv")

 Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

 History:
   20070429-01 minor bug fix related to output=>"f" option
   20060711-01 added purine and keto skew
   20010905-01 update with options
   20010727-01 initial posting

=cut



sub cum_gcskew {
    # Cumulative skew over consecutive, non-overlapping windows.
    #
    # The per-window value is (minus - plus)/(minus + plus), rounded to six
    # decimals (0 when both counts are zero), where
    #   default : plus = g,   minus = c
    #   -at     : plus = a,   minus = t
    #   -purine : plus = a+g, minus = t+c
    #   -keto   : plus = t+g, minus = a+c
    # Element i of the result is the sum of the values of windows 0..i.
    # A trailing partial window is ignored.
    #
    # Output is a graph ("g"/"show") or data/<filename> ("f").
    # Returns the cumulative list; an empty list when -window < 1.
    opt_default(window=>10000, at=>0, purine=>0, keto=>0, output=>"show",
                application=>"gimv", filename=>"cum_gcskew.png");
    my @args = opt_get(@_);

    my $gb = opt_as_gb(shift @args);
    my $ref = \$gb->{SEQ};
    my $window = opt_val("window");
    my $output = opt_val("output");
    my $filename = opt_val("filename");
    $filename =~ s/\.png$/\.csv/ if ($output eq 'f');
    my $at = opt_val("at");
    my $purine = opt_val("purine");
    my $keto = opt_val("keto");

    # A window smaller than one base would make the loop below run forever.
    if ($window < 1){
        warn("cum_gcskew: -window must be 1 or larger (got '$window')\n");
        return;
    }

    my (@gcskew, @location);
    my $running = 0;
    for (my $i = 0; length($$ref) - ($window * $i) >= $window; $i++){
        # Extract the window once and count on the copy.
        my $chunk = substr($$ref, $window * $i, $window);
        my ($g, $c);

        if ($at){
            $g = ($chunk =~ tr/a//);
            $c = ($chunk =~ tr/t//);
        }elsif ($purine){
            $g = ($chunk =~ tr/ag//);
            $c = ($chunk =~ tr/tc//);
        }elsif ($keto){
            $g = ($chunk =~ tr/tg//);
            $c = ($chunk =~ tr/ac//);
        }else{
            $g = ($chunk =~ tr/g//);
            $c = ($chunk =~ tr/c//);
        }

        # The rounded per-window value is what gets accumulated.
        $running += sprintf("%.6f", ($c - $g) / ($c + $g)) if ($c + $g > 0);

        $gcskew[$i] = $running;
        $location[$i] = $i * $window;
    }

    my $title = "Cumulative GC skew";
    if ($at){
        $title = "Cumulative AT skew";
    }elsif ($purine){
        $title = "Cumulative Purine skew";
    }elsif ($keto){
        $title = "Cumulative Keto skew";
    }

    if ($output eq 'g' || $output eq 'show'){
        mkdir ("graph", 0777);

        _UniMultiGrapher(
                         \@location,\@gcskew,
                         -x=>"bp", -y=>$title,
                         -filename=>$filename,
                         -title=>$title,
                         -style=>"lines", -type=>"columns",
                         );

        msg_gimv("graph/" . $filename) if ($output eq 'show');
    }elsif ($output eq 'f'){
        mkdir ("data", 0777);

        my $path = "data/" . $filename;
        if (open(my $out, '>', $path)){
            print $out "location,$title\n";
            for my $j (0 .. $#gcskew){
                print $out $location[$j], ",", $gcskew[$j], "\n";
            }
            close($out) or warn("cum_gcskew: error while closing $path: $!\n");
        }else{
            # Report the failure but still hand the values back to the caller.
            warn("cum_gcskew: cannot open $path for writing: $!\n");
        }
    }

    return @gcskew;
}



=head2 genomicskew

 Name: genomicskew   -   calculate the GC skew in different regions of the given genome

 Description:
   This program graphs the GC skew for the whole genome, coding regions,
   intergenic regions, and the third codon.

 Usage: 
    1 = genomicskew(G instance);

 Options:
   -divide      window number to divide into (default: 250)
   -at          1 when observing AT skew instead of GC skew (default: 0)
   -output      f for file output in directory "data", 
                g for graph output in directory "graph",
                show for graph output and display (default: "show")
   -filename    output filename (default: "genomicskew.png" for -output=>"g",
                                          "genomicskew.csv" for -output=>"f")
   -application application to open png image (default: "gimv")

 Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

 History:
   20040610-01 updated to handle introns and exons
   20040601-01 bug fix for output=>"f" option
   20010905-01 updated options
   20010727-01 initial posting

=cut


sub genomicskew {
    # Compare the skew of four sequence sets, each cut into -divide windows
    # (plus one trailing window holding the remainder):
    #   whole genome, concatenated CDS, concatenated intergenic regions
    #   (plus introns with -intron), and concatenated third codon positions.
    # The skew is (c - g)/(c + g), or (t - a)/(t + a) with -at.
    #
    # Output is a graph ("g"/"show") or data/<filename> ("f").
    # Returns 1; returns nothing when -divide < 1.
    opt_default(divide=>250, at=>0, output=>"show", application=>"gimv",
                filename=>"genomicskew.png", intron=>0);
    my @args = opt_get(@_);

    my $filename = opt_val("filename");
    $filename =~ s/\.png$/\.csv/ if (opt_val("output") eq 'f');

    my $gb = opt_as_gb(shift @args);
    my $divide = opt_val("divide");
    my $opt = opt_val("output");
    my $at = opt_val("at");
    my $intron = opt_val("intron");

    # The window size is length/divide, so -divide must be positive.
    if ($divide < 1){
        warn("genomicskew: -divide must be 1 or larger (got '$divide')\n");
        return;
    }

    my @location = (0..$divide);
    my ($CDS, $BET, $THIRD) = ('', '', '');
    my $before = 0;    # end coordinate of the previously visited CDS

    foreach my $cds ($gb->cds()){
        my $seq = '';

        if (length $gb->{$cds}->{join}){
            foreach my $line (split(/,/, $gb->{$cds}->{join})){
                $line =~ tr/c//d;
                my ($start, $end) = split(/\.\./, $line, 2);
                $seq .= $gb->get_gbkseq($start, $end);
            }
        }else{
            $seq = $gb->get_gbkseq($gb->{$cds}->{start}, $gb->{$cds}->{end});
        }

        $CDS .= $seq;

        # $seq is taken as forward-strand text beginning at the feature's
        # start coordinate (assumed from get_gbkseq(start, end) -- confirm).
        # Third codon positions are then offsets 2,5,8,... for a direct gene
        # and 0,3,6,... for a complement gene, whose codons are read from
        # the far end of $seq.
        my $first_third = ($gb->{$cds}->{direction} eq 'complement') ? 0 : 2;
        for (my $j = $first_third; $j < length($seq); $j += 3){
            $THIRD .= substr($seq, $j, 1);
        }

        $BET .= substr($gb->{SEQ}, $before, $gb->{$cds}->{start} - $before)
            unless ($gb->{$cds}->{start} - $before < 1);

        $BET .= join('', $gb->get_intron($cds)) if ($intron);

        $before = $gb->{$cds}->{end};
    }

    my @gcskew    = _genomicskew_windows(\$gb->{SEQ}, $divide, $at);
    my @geneskew  = _genomicskew_windows(\$CDS,       $divide, $at);
    my @betskew   = _genomicskew_windows(\$BET,       $divide, $at);
    my @thirdskew = _genomicskew_windows(\$THIRD,     $divide, $at);

    my $title = "GC skew";
    $title = "AT skew" if ($at);

    if ($opt eq "show" || $opt eq "g"){
        mkdir ("graph", 0777);
        _UniMultiGrapher(
                         \@location,
                         -x=>"bp", -y=>$title,
                         \@gcskew, -x1=>"whole genome",
                         \@geneskew, -x2=>"coding region",
                         \@betskew, -x3=>"intergenic region",
                         \@thirdskew, -x4=>"codon third position",
                         -style=>"lines", -type=>"columns",
                         -filename=>$filename,
                         -title=>$title
                         );

        msg_gimv("graph/" . $filename) if ($opt eq 'show');
    }elsif ($opt eq 'f'){
        mkdir ("data", 0777);

        my $path = "data/" . $filename;
        my $out;
        unless (open($out, '>', $path)){
            warn("genomicskew: cannot open $path for writing: $!\n");
            return;
        }

        print $out "location,$title,coding,intergenic,third codon\n";
        for my $j (0 .. $divide){
            # Five fields per row, matching the header (no trailing comma).
            print $out join(",", $location[$j], $gcskew[$j], $geneskew[$j],
                            $betskew[$j], $thirdskew[$j]), "\n";
        }
        close($out) or warn("genomicskew: error while closing $path: $!\n");
    }

    return 1;
}

sub _genomicskew_windows {
    # Private helper of genomicskew(): cut the string behind $seqref into
    # $divide windows of int(length/$divide) bases plus one trailing window
    # with the remainder, and return the skew of each window
    # ((c - g)/(c + g), or (t - a)/(t + a) when $at is true; 0 when the
    # window holds none of the counted bases).
    my ($seqref, $divide, $at) = @_;
    my $window = int(length($$seqref) / $divide);
    my @skew;

    for my $i (0 .. $divide){
        my $chunk = substr($$seqref, $window * $i, $window);
        my $g = $at ? ($chunk =~ tr/a//) : ($chunk =~ tr/g//);
        my $c = $at ? ($chunk =~ tr/t//) : ($chunk =~ tr/c//);

        push(@skew, ($c + $g < 1) ? 0 : sprintf("%.6f", ($c - $g) / ($c + $g)));
    }

    return @skew;
}


=head2 gcwin

 Name: gcwin   -   calculate the GC content along the given genome

 Description:
   This program calculates and graphs the GC content.

 Usage: 
    array @gcwin = gcwin(G instance);

 Options:
   -window      window size to observe (default: 10000)
   -at          1 when observing AT content instead of GC content (default: 0)
   -purine      1 when observing purine (AG) content (default: 0)
   -keto        1 when observing keto (TG) content (default: 0)
   -output      f for file output in directory "data", 
                g for graph output in directory "graph",
                show for graph output and display (default: "show")
   -filename    output filename (default: "gcwin.png" for -output=>"g",
                                          "gcwin.csv" for -output=>"f")

 Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

 History:
   20070429-01 minor bug fix related to output=>"f" option
   20060711-01 added purine and keto options
   20010905-01 updated options
   20010729-01 initial posting

=cut




sub gcwin  {
    # Base content of consecutive, non-overlapping windows.
    #
    # For every complete window the value is count/window, formatted with
    # six decimals, where count is the number of
    #   default : g + c
    #   -at     : a + t
    #   -purine : a + g
    #   -keto   : t + g
    # A trailing partial window is ignored.
    #
    # Output is a graph ("g"/"show") or data/<filename> ("f").
    # Returns the list of content values; an empty list when -window < 1.
    opt_default(window=>10000, at=>0, purine=>0, keto=>0, output=>"show",
                application=>"gimv", filename=>"gcwin.png");
    my @args = opt_get(@_);

    my $gb = opt_as_gb(shift @args);
    my $ref = \$gb->{SEQ};
    my $window = opt_val("window");
    my $at = opt_val("at");
    my $purine = opt_val("purine");
    my $keto = opt_val("keto");
    my $opt = opt_val("output");
    my $filename = opt_val("filename");
    $filename =~ s/\.png$/\.csv/ if ($opt eq 'f');

    # A window smaller than one base would make the loop below run forever.
    if ($window < 1){
        warn("gcwin: -window must be 1 or larger (got '$window')\n");
        return;
    }

    my (@gcwin, @location);
    for (my $i = 0; length($$ref) - ($window * $i) >= $window; $i++){
        # Extract the window once and count on the copy.
        my $chunk = substr($$ref, $window * $i, $window);
        my $count;

        if ($at){
            $count = ($chunk =~ tr/at//);
        }elsif ($purine){
            $count = ($chunk =~ tr/ag//);
        }elsif ($keto){
            $count = ($chunk =~ tr/tg//);
        }else{
            $count = ($chunk =~ tr/gc//);
        }

        $gcwin[$i] = sprintf("%.6f", $count / $window);
        $location[$i] = $i * $window;
    }

    my $title = "GC content";
    if ($at){
        $title = "AT content";
    }elsif ($purine){
        $title = "Purine content";
    }elsif ($keto){
        $title = "Keto content";
    }

    if ($opt eq 'g' || $opt eq 'show'){
        mkdir ("graph", 0777);
        _UniMultiGrapher(
                         \@location, \@gcwin,
                         -x=>"bp", -y=>$title,
                         -filename=>$filename,
                         -title=>$title, -style=>"lines", -type=>"columns"
                         );
        msg_gimv("graph/" . $filename) if ($opt eq 'show');
    }elsif ($opt eq 'f'){
        mkdir ("data", 0777);

        my $path = "data/" . $filename;
        if (open(my $out, '>', $path)){
            print $out "location,$title\n";
            for my $j (0 .. $#gcwin){
                print $out $location[$j], ",", $gcwin[$j], "\n";
            }
            close($out) or warn("gcwin: error while closing $path: $!\n");
        }else{
            # Report the failure but still hand the values back to the caller.
            warn("gcwin: cannot open $path for writing: $!\n");
        }
    }

    return @gcwin;
}



=head2 query_strand 

  Name: query_strand   -   get the strand name (leading or lagging) from the given position

  Description:
    Given a position and strand information (direct or complement), 
    returns whether the specified position is in the leading or lagging strand.

  Usage: 
    string strand = query_strand(G instance, int position);

    or

    string strand = query_strand(G instance, CDS/FEATURE id);

 Options: 
    -direction    strand of the querying position, either 'direct' or 'complement'
                  (default: direct)

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20020218-01 initial posting
    20070106-01 added CDS option

=cut




sub query_strand {
    # Tell whether a position lies on the "leading" or "lagging" strand.
    #
    # The second argument is either a numeric position (combined with the
    # -direction option, default 'direct') or a FEATURE/CDS id, in which
    # case both start position and direction are read from that feature.
    # Origin and terminus come from rep_ori_ter().
    opt_default(direction=>'direct');
    my @args = opt_get(@_);
    my $gb = shift @args;
    my $pos = shift @args;
    my $direction = opt_val("direction");

    # Both alternatives must be tested against $pos. Written as
    # "$pos =~ /^FEATURE/ || /^CDS/", the second pattern would be matched
    # against $_ instead.
    if ($pos =~ /^(?:FEATURE|CDS)/){
        $direction = $gb->{$pos}->{direction};
        $pos = $gb->{$pos}->{start};
    }

    my ($ori, $ter) = rep_ori_ter($gb);

    # True when $pos lies strictly inside the stretch that runs from the
    # origin to the terminus (wrapping around the sequence end if ori > ter).
    my $ori_to_ter = ($ori > $ter)
        ? ($pos < $ter || $pos > $ori)
        : ($pos > $ori && $pos < $ter);
    my $complement = ($direction eq 'complement');

    # Direct strand inside that stretch, or complement strand outside of
    # it, is reported as leading; the two other combinations as lagging.
    my $leading = ($ori_to_ter xor $complement);

    return $leading ? "leading" : "lagging";
}






=head2 query_arm

  Name: query_arm   -   get the replication arm name (left or right) from the given position

  Description:
    Given a position, returns whether the specified position is in the 
    left or right arm of the circular chromosome.

  Usage: 
    string arm = query_arm(G instance, int position);

  Options: 
    None.

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20070112-01 initial posting

=cut



sub query_arm{
    # Tell whether a position lies on the 'left' or 'right' replication arm.
    #
    # The second argument is a numeric position or a FEATURE/CDS id (its
    # start position is used). The stretch from the origin to the terminus
    # is reported as 'right', the rest as 'left'.
    # Origin and terminus come from rep_ori_ter().
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);
    my $pos = shift @args;

    # Both alternatives must be tested against $pos. Written as
    # "$pos =~ /^FEATURE/ || /^CDS/", the second pattern would be matched
    # against $_ instead.
    if ($pos =~ /^(?:FEATURE|CDS)/){
        $pos = $gb->{$pos}->{start};
    }

    my ($ori, $ter) = rep_ori_ter($gb);

    if ($ori < $ter){
        # Right arm is the contiguous range (ori, ter]; ori itself is 'left'.
        return ($pos > $ori && $pos <= $ter) ? 'right' : 'left';
    }

    # Origin at or after terminus: the left arm is the contiguous range
    # [ter, ori] and the right arm wraps around the sequence end.
    return ($pos >= $ter && $pos <= $ori) ? 'left' : 'right';
}


=head2 set_strand 

  Name: set_strand   -   set replication strand and arm information to given G instance

  Description:
    Sets $gb->{$cds}->{strand} and $gb->{$cds}->{arm} using
    query_strand() and query_arm(), indicating in which strand
    or replication arm the gene resides.

  Usage: 
    1 = set_strand($gb)

 Options: 
    None.

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20070112-01 initial posting

=cut

sub set_strand{
    # Annotate every CDS of the instance with the replication strand
    # (query_strand) and replication arm (query_arm) it lies on.
    # Always returns 1.
    my ($instance) = opt_get(@_);
    my $gb = opt_as_gb($instance);

    my @cds_ids = $gb->cds();
    for my $cds (@cds_ids){
        my $strand = query_strand($gb, $cds);
        my $arm = query_arm($gb, $cds);

        $gb->{$cds}->{strand} = $strand;
        $gb->{$cds}->{arm} = $arm;
    }

    return 1;
}




=head2 set_gc3

  Name: set_gc3   -   set GC content in 3rd codon position of all genes

  Description:
    Sets $gb->{$cds}->{gc3}, GC content in 3rd codon position.
    Value is in decimal (eg. 0.56345).

  Usage: 
    1 = set_gc3($gb)

 Options: 
    None.

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20070116-01 initial posting

=cut



sub set_gc3{
    # Store in $gb->{$cds}->{gc3} the fraction (0..1) of 'g' or 'c' among
    # the third codon positions of each gene, as returned by get_geneseq().
    # Genes too short to contain a third position get 0.
    # Always returns 1.
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);

    foreach my $cds ($gb->cds()){
        my $geneseq = $gb->get_geneseq($cds);
        my ($gc, $tot) = (0, 0);

        # Offsets 2, 5, 8, ... are the third bases of successive codons.
        for (my $i = 2; $i < length($geneseq); $i += 3){
            my $base = substr($geneseq, $i, 1);
            $gc ++ if ($base eq 'g' || $base eq 'c');
            $tot ++;
        }

        # Avoid dividing by zero for sequences shorter than one codon.
        $gb->{$cds}->{gc3} = $tot ? $gc / $tot : 0;
    }

    return 1;
}





=head2 genes_from_ori

 Name: genes_from_ori   -   get a list of CDS IDs ordered in the distance from origin of replication

 Description:
   This program lists genes in order relative to the position of 
   replication origin in either right or left half of the bacterial
   chromosomes.

  Usage: array @genes = genes_from_ori(G instance, "right");

 Options:
   Second argument should be either "right" or "left" to indicate
   the interested half of the bacterial chromosome. If omitted, 
   returns list of genes on both arms in the order of distance
   from replication origin.

 Note:
   Origin and terminus of replication are obtained from rep_ori_ter()

 Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

 History:
   20070106-01 initial posting
   20070112-01 modified the handling of second argument
=cut



sub genes_from_ori{
    # List CDS ids ordered by their distance from the replication origin.
    #
    # Second argument (case-insensitive):
    #   contains "l"            -> genes of the left arm only
    #   otherwise contains "r"  -> genes of the right arm only
    #   anything else / omitted -> all genes, sorted by dist_in_cc() of
    #                              their start position
    # Origin and terminus come from rep_ori_ter().
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);
    my $wing = shift @args;

    my ($ori, $ter) = rep_ori_ter($gb);
    my $side = lc($wing);

    if ($side =~ /l/ || $side =~ /r/){
        # Split the genes, in the order given by cds(), into the stretch
        # before the first landmark (A), between the landmarks (B) and
        # after the second landmark (C).
        my (@sectionA, @sectionB, @sectionC, @left, @right);

        if ($ori < $ter){
            foreach my $cds ($gb->cds()){
                my $start = $gb->{$cds}->{start};

                if ($start <= $ori){
                    push(@sectionA, $cds);
                }elsif ($start <= $ter){
                    push(@sectionB, $cds);
                }else{
                    push(@sectionC, $cds);
                }
            }

            # Left arm: walk backwards from the origin, wrapping around.
            @left = (reverse(@sectionA), reverse(@sectionC));
            @right = @sectionB;
        }else{
            foreach my $cds ($gb->cds()){
                my $start = $gb->{$cds}->{start};

                if ($start < $ter){
                    push(@sectionA, $cds);
                }elsif ($start <= $ori){
                    push(@sectionB, $cds);
                }else{
                    push(@sectionC, $cds);
                }
            }

            @left = reverse(@sectionB);
            # Right arm: walk forwards from the origin, wrapping around.
            @right = (@sectionC, @sectionA);
        }

        return ($side =~ /l/) ? @left : @right;
    }

    # Both arms: sort by circular distance from the origin. Ties are
    # broken by the order in which cds() lists the genes, so the result
    # does not depend on hash iteration order.
    my (%dist, %rank);
    my $n = 0;
    foreach my $cds ($gb->cds()){
        $rank{$cds} = $n++ unless (exists $rank{$cds});
        $dist{$cds} = dist_in_cc($gb, $gb->{$cds}->{start});
    }

    my @all = sort { $dist{$a} <=> $dist{$b} || $rank{$a} <=> $rank{$b} }
              keys %dist;
    return @all;
}




=head2 dist_in_cc

 Name: dist_in_cc   -   calculates the distance between two loci in circular chromosomes

 Description:
   This program calculates the distance between two loci in 
   circular chromosomes, mostly useful to calculate the
   distance from the replication origin.

  Usage: int distance = dist_in_cc(G instance, int position1, int position2);

 Options:
   If the second position is not given, position of replication origin is used.

 Note:
   Origin and terminus of replication are obtained from rep_ori_ter()

 Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

 History:
   20070112-01 initial posting
=cut



sub dist_in_cc{
    # Shortest distance between two loci on a circular chromosome.
    #
    # Each locus is a numeric position or a FEATURE/CDS id (its start
    # position is used). When the second locus is missing or empty, the
    # replication origin from rep_ori_ter() is used instead.
    # The chromosome length is length($gb->{SEQ}).
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);
    my $first = shift @args;
    my $second = shift @args;

    # Both alternatives must be tested against the argument itself. Written
    # as "$x =~ /^FEATURE/ || /^CDS/", the second pattern would be matched
    # against $_ instead.
    $first = $gb->{$first}->{start}
        if ($first =~ /^(?:FEATURE|CDS)/);
    $second = $gb->{$second}->{start}
        if (defined($second) && $second =~ /^(?:FEATURE|CDS)/);

    unless (defined($second) && length($second)){
        ($second) = rep_ori_ter($gb);
    }

    # Candidates: the direct distance and the two distances obtained by
    # shifting one locus by a full chromosome length in either direction.
    my $genome = length($gb->{SEQ});
    my $delta = $first - $second;
    my $shortest = abs($delta);

    foreach my $candidate (abs($delta + $genome), abs($delta - $genome)){
        $shortest = $candidate if ($candidate < $shortest);
    }

    return $shortest;
}




1;
