#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# This file is part of G-language Genome Analysis Environment package
#
#     Copyright (C) 2001-2013 Keio University
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# 
#   $Id: GCskew.pm,v 1.3 2002/08/16 15:03:36 gaou Exp $
#
# G-language GAE is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
# 
# G-language GAE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with G-language GAE -- see the file COPYING.
# If not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# 
#END_HEADER
#

package G::Seq::GCskew;

use SubOpt;
use Rcmd;
use G::Messenger;
use G::Tools::Graph;
use G::Tools::Statistics;
use G::Seq::Primitive;
use G::Seq::Codon;
use G::Seq::Util;
use autouse 'Algorithm::Numerical::Shuffle'=>qw(shuffle);
use File::ShareDir ':ALL';
use File::Temp;

use strict;
use base qw(Exporter);
use SelfLoader;

our @EXPORT = qw(
	     find_ori_ter
	     gcskew
	     gcsi
	     cum_gcskew
	     genomicskew
	     geneskew
	     gcwin
	     coding_density
	     leading_strand
	     query_strand
	     query_arm
	     set_strand
	     set_gc3
	     genes_from_ori
	     rep_ori_ter
	     dist_in_cc
	     lda_bias
	     B1
	     B2
	     delta_gcskew
);


__DATA__

#:::::::::::::::::::::::::::::::::
#       Perldoc
#:::::::::::::::::::::::::::::::::


=head1 NAME

    G::Seq::GCskew - Analysis methods related to GC skew and genomic strand bias

=head1 DESCRIPTION

    This class is a part of G-language Genome Analysis Environment, 
    collecting sequence analysis methods related to GC skew.

=cut



#:::::::::::::::::::::::::::::::::
#       Let the code begin...
#:::::::::::::::::::::::::::::::::




=head2 find_ori_ter

 Name: find_ori_ter   -   predict the replication origin and terminus in bacterial genomes

 Description:
    Predicts the replicational origin and terminus in circular bacterial genomes,
    by taking the vertices of cumulative skew graphs (GC, keto, or purine). See
    Reference 1 for the basic idea behind thid algorithm (but also note that this
    algorithm is different from that of Oriloc, which uses GC3 of genes). 

    Terminus of replication can be more accurate by using noise-reduction 
    filtering using Fourier spectrum of the GC skew. This low-pass filtering
    can be applied using -filter option. See Reference 2 for details.

 Usage:
    ($origin, $terminus) = find_ori_ter($genome);

 Options:
   -output    output toggle option (default: stdout)
   -purine    use purine skew for calculation (default: 0)
   -keto      use keto skew for calculation (default: 0)
   -filter    lowpass filter strength in percent. typically 95 or 99 works best. (default: NULL)
   -window    number of windows to use for Fast Fourier Transform. only active 
              when -filter option is specified. value must be the power of two. (default: 4096)

 References:
   1. Frank AC, Lobry JR (2000) "Oriloc: prediction of replication boundaries in unannotated 
      bacterial chromosomes", Bioinformatics, 16:566-567.
   2. Arakawa K, Saito R, Tomita M (2007) "Noise-reduction filtering for accurate detection 
      of replication termini in bacterial genomes", FEBS Letters, 581(2):253-258.

 Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

 History:
   20071023-01 speed up by combining rough/detailed searches
   20070707-01 added -filter option
   20060711-01 added purine and keto options
   20060707-01 calculation is now based on single bp resolution rather than with windows
               but now it is a lot more slower...
   20060221-01 speed up using Statistics::Descriptive
   20010905-01 options update
   20010326-01 initial posting

=cut


sub find_ori_ter {
    require Statistics::Descriptive;
    &opt_default(output=>"stdout", purine=>0, keto=>0, window=>4096);
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);
    my $len = length($gb->{SEQ});
    my $output = opt_val("output");
    my $purine = opt_val("purine");
    my $keto = opt_val("keto");
    my $lowpass = opt_val("filter");
    my $power = opt_val("window");

    &msg_send("\nfind_ori_ter:\n") if ($output eq 'stdout');

    my ($max, $min);
    if(length $lowpass){
	require Math::FFT;
	my $window = int($len / $power);
	my @gcskew = gcskew($gb, -output=>"/dev/null", -window=>$window, -purine=>$purine, -keto=>$keto);

	while(scalar @gcskew > $power){
	    pop @gcskew;
	}

	my $fft = new Math::FFT([@gcskew]);
	my $coeff = $fft->rdft();
	my $coeff2 = [@{$coeff}];
	my $spctrm = $fft->spctrm();
	my $j;
	my $power2 = int($power/100);
	$power2 ++ if ($power2 %2 == 1);

	for($j = $power2 * (100 - $lowpass) - 1; $j < $power; $j ++){
	    $coeff->[$j] = 0;
	}
    
	my $orig = $fft->invrdft($coeff);

	my (@cum, $tmp);
	foreach my $value (@{$orig}){
	    $tmp += $value;
	    push(@cum, $tmp);
	}

	my $stat = Statistics::Descriptive::Full->new();
	$stat->add_data(@cum);
	my $maxi = $stat->maxdex();
	my $mini = $stat->mindex();

	$max = ($maxi + 1) * $window - int($window/2);
	$min = ($mini + 1) * $window - int($window/2);
    }else{
	local *peak_search = sub{
	    my $seq = shift;

	    if($purine){
		$seq =~ tr/atgcn/02021/;
	    }elsif($keto){
		$seq =~ tr/atgcn/20021/;
	    }else{
		$seq =~ tr/atgcn/11021/;
	    }
	    
	    my (@data, $val, $i);
	    for($i = 0; $i <= length($seq); $i ++){
		if(substr($seq, $i, 1) =~ /^\d$/){
		    $val += substr($seq, $i, 1) - 1;
		}
		push(@data, $val);
	    }

	    my $stat = Statistics::Descriptive::Full->new();
	    $stat->add_data(@data);
	    return ($stat->maxdex(), $min = $stat->mindex());
	};

	if($len > 100000){
	    my @cumgcskew = gcskew($gb, -output=>"/dev/null", -window=>10000, 
				   -purine=>$purine, -keto=>$keto, -cumulative=>1);

	    my $stat = Statistics::Descriptive::Full->new();
	    $stat->add_data(@cumgcskew);
	    my $maxi = $stat->maxdex();
	    my $mini = $stat->mindex();

	    ($max, undef) = peak_search(substr($gb->{SEQ}, $maxi * 10000, 20000));
	    (undef, $min) = peak_search(substr($gb->{SEQ}, $mini * 10000, 20000));
	    $max += $maxi * 10000;
	    $min += $mini * 10000;
	    $max = 0 if ($max < $len / 1000 || abs($max - $len) < $len / 1000);
	}else{
	    ($max, $min) = peak_search($gb->{SEQ});
	}
    }

    &msg_send("   Predicted Origin:   " , $max, "\n")  if ($output eq 'stdout');
    &msg_send("   Predicted Terminus: " , $min, "\n\n") if ($output eq 'stdout');

    return ($max, $min);
}






=head2 rep_ori_ter

 Name: rep_ori_ter   -   get the positions of replication origin and terminus

 Description:
    This method returns the positions of replication origin and terminus
    in bacterial genomes by several means. 

    1. Use of databases
      By default, rep_ori_ter() tries to retrieve the position of replication
      origin in DoriC (Reference 1) database, and the position of replication
      terminus from the supplemental data provided in Reference 2.

      If the position of origin cannot be found in the database, but "rep_origin" 
      feature is available, center position within this feature is used for origin. 

    2. Oriloc
      Using -oriloc=>1 option, you can predict the replication origin and 
      terminus using the popular Oriloc program developed by Lobry et al. 
      available as part of the SeqinR package (Reference 3).

      Installation of SeqinR package is required to use this feature.
      Type: install.packages('seqinr', dep=T)
      in R statistics package to install SeqinR.

    3. Setting custom origin and terminus
      Use -ori=>$position1, -ter=>$position2 to set your own positions.

    4. Use GC skew shift-point
       If the positions of origin or terminus cannot be found in the databases,
       rep_ori_ter() automatically calls find_ori_ter() method to predict the 
       positions using GC skew shift-points at one-base-pair resolution.

    After calling this function, positions of origin and terminus are stored 
    as follows:
      $genome->{FEATURE0}->{origin}
      $genome->{FEATURE0}->{terminus}

    You can always use -clear=>1 option to disable the use of above cache.

 Usage: 
    ($ori, $ter) = rep_ori_ter($genome);

 References:
   1. Gao F and Zhang CT (2007) "DoriC: a database of oriC regions in bacterial
      genomes", Bioinformatics, 23(14):1866-1867
   2. Kono N et al. (2011) "Comprehensive prediction of chromosome dimer resolution 
      sites in bacterial genomes", BMC Genomics, 12(1):19
   3. Frank AC and Lobry JR (2000) "Oriloc: prediction of replication boundaries 
      in unannotated bacterial chromosomes", Bioinformatics, 16(6):560-561

 Options:
   -oriloc            set to 1 to use Oriloc for prediction (default: 0)
   -gcskew            set to 1 to use GC skew shift-point for prediction (default: 0)
   -dif-threshold     distance between the GC skew shift point and predicted dif site
                      expressed as the percentage of genome size, used as a threshold
                      to retrieve dif sequence from the database (default: 5)
   -dbonly            set to 1 to only use values available in databases and 
                      to suppress prediction
   -clear             set to 1 to disable the use of cache

 Author: Kazuharu Arakawa

 History:
    20110223-01 major revision. added support for DoriC and dif databases,
                Oriloc (SeqinR), and an option -gcskew
    20080428-01 added support for "rep_origin" feature
    20011030-01 initial posting

=cut


sub rep_ori_ter {
    opt_default('dif-threshold'=>5, 'gcskew'=>0, 'clear'=>0, 'oriloc'=>0, 'dbonly'=>0);
    my @args = opt_get(@_);
    my $gb   = opt_as_gb(shift @args);
    my %opt  = opt_val();
    my $id   = $gb->{LOCUS}->{id};
    my ($ori, $ter);

    if($opt{'clear'}){
	delete($gb->{FEATURE0}->{origin});
	delete($gb->{FEATURE0}->{terminus});
    }

    if(length($opt{'ori'}) || length($opt{'ter'})){
	$ori = $opt{'ori'} if (length($opt{'ori'}));
	$ter = $opt{'ter'} if (length($opt{'ter'}));
	msg_error("Using ori: $ori and ter: $ter\n");

	$gb->{FEATURE0}->{origin} = $ori;
	$gb->{FEATURE0}->{terminus} = $ter;
    }elsif(length $gb->{FEATURE0}->{terminus}){
	$ori = $gb->{FEATURE0}->{origin};
	$ter = $gb->{FEATURE0}->{terminus};
    }elsif($opt{'oriloc'}){
	msg_error("Using Oriloc in SeqinR package to predict...\n");
	my $gbk = '/tmp/' . time() . rand() . '.gbk';
	my $rcmd = new Rcmd();
	$rcmd->set_mode('silent');
	$gb->output($gbk);

	($ori, $ter) = $rcmd->exec(qq|
				   library(seqinr);
				   x = oriloc(gbk='$gbk')
				   c(x[which(x[,5]==max(x[,5])),2] * 1000, x[which(x[,5]==min(x[,5])),2] * 1000)
				   |);

	unlink($gbk);

	msg_error("\nPredicted ori: $ori and ter: $ter\n");

	$gb->{FEATURE0}->{origin} = $ori;
	$gb->{FEATURE0}->{terminus} = $ter;
    }else{
	unless($opt{'gcskew'}){
	    my @oris;
	    for my $line (readFile("http://tubic.tju.edu.cn/doric/query.php?selfield=oa&term=$id")){
		if($line =~ /(information.php\?ac\=ORI\d+)/){
		    my $flag = 0;
		    for my $ll (readFile("http://tubic.tju.edu.cn/doric/$1")){
			$flag = 1 if ($ll =~ /The location of oriC region/);

			if($flag && $ll =~ /(\d+)\.\.(\d+) nt/){
			    push(@oris, $1, $2);
			    last;
			}
		    }
		}
	    }

	    if(scalar(@oris)){
		$ori = $oris[int(scalar(@oris)/2)];
		msg_error("oriC found in DoriC. Using $ori as origin.\n");
	    }

	    my $filename = dist_file('g-language', 'data/dif.tab');

	    my $data = {};
	    open(FILE, $filename);
	    while(<FILE>){
		chomp;
		my @line = split(/\t/, $_);
		next unless($line[0] =~ /^NC_/);
		
		$data->{$line[0]}->{'delta'} = $line[4];
		$data->{$line[0]}->{'dif'}   = $line[5];
	    }
	    close(FILE);
	    
	    if(length($data->{$id}) && $data->{$id}->{'delta'} <= $opt{'dif-threshold'} * 3.6){
		$ter = $data->{$id}->{'dif'};
		msg_error("dif sequence found in database. Using $ter as terminus.\n");
	    }
	    
	    unless($ori){
		my $interface = msg_ask_interface();
		msg_interface('NULL');
		my $cds = ($gb->find(-type=>'rep_origin'))[0];
		if($cds){
		    $ori = int(($gb->{$cds}->{end} + $gb->{$cds}->{start})/2) - 1;
		    msg_error("rep_origin feature found. Using $ori as origin.\n");
		}
		msg_interface($interface);
	    }
	}

	unless($opt{'dbonly'}){
	    if(length($ori) < 1 || length($ter) < 1){
		msg_error('Using GC skew shift-point [find_ori_ter()] to predict...' . "\n");
		my ($ori2, $ter2) = &G::Seq::GCskew::find_ori_ter($gb, -output=>"/dev/null");
		$ori = $ori2 unless(length($ori));
		$ter = $ter2 unless(length($ter));
	    }
	    
	    $gb->{FEATURE0}->{origin} = $ori;
	    $gb->{FEATURE0}->{terminus} = $ter;
	}
    }

    return ($ori, $ter);
}




=head2 leading_strand

 Name: leading_strand   -   get the sequences of leading strands

 Description:
   This method returns the leading strands from origin and terminus 
   of replication calculated with rep_ori_ter().

   When called in array context, this method returns the sequences
   of the two replication arms. In scalar context, the sequences of
   the two arms are concatenated and returned as one sequence.

 Usage: 
    #in array context
    ($arm1, $arm2) = leading_strand($genome);

    #in scalar context
    string $leadingStrand = leading_strand($genome);

 Options:
    none

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20071022-01 added array/scalar contexts
    20011030-01 initial posting

=cut



sub leading_strand {
    my @args = opt_get(@_);
    my $gb = shift @args;
    my ($ori, $ter) = rep_ori_ter($gb);
    my ($seq1, $seq2);

    if ($ori > $ter){
	$seq1  = substr($gb->{SEQ}, $ori);
	$seq1 .= substr($gb->{SEQ}, 0, $ter);
	$seq2  = complement(substr($gb->{SEQ}, $ter, $ori - $ter));
    }else{
	$seq1 = substr($gb->{SEQ}, $ori, $ter - $ori);
	$seq2 = complement( substr($gb->{SEQ}, $ter) . substr($gb->{SEQ}, 0, $ori) );
    }
    
    if(wantarray()){
	return ($seq1, $seq2);
    }else{
	return $seq1 . $seq2;
    }
}





=head2 gcsi

  Name: gcsi   -   GC Skew Index: an index for strand-specific mutational bias

  Description:
    This program calculates the GC Skew Index (GCSI) of the given circular
    bacterial genome. GCSI quantifies the degree of GC Skew. In other words,
    this index represents the degree of strand-specific mutational bias in 
    bacterial genomes, caused by replicational selection. 

    GCSI is calculated by the following formula:
        GCSI = sqrt((SA/6000) * (dist/600))
    where SA is the spectral amplitude of Fourier power spectrum at 1Hz,
    and dist is the normalized Euclidean distance between the vertices of 
    cumulative GC skew.

    GCSI ranges from 0 (no observable skew) to 1 (strong skew), and Archaeal genomes
    that have multiple replication origins and therefore have no observable skew
    mostly have GCSI below 0.05. Escherichia coli genome has values around 0.10.

    Version 1 of GCSI required fixed number of windows (4096), but the new GCSI
    version 2 (also known as generalized GCSI: gGCSI) is invariant of the number
    of windows. GCSI version 1 is calculated as an arithmetic mean (as opposed to 
    the geometric mean of gGCSI) of SR (spectral ratio, the signal-to-noise ratio
    of 1Hz power spectrum) and dist. 

  Usage: 
    $gcsi = gcsi($genome);               # scalar context
       or
    ($gcsi, $sa, $dist) = gcsi($genome); # array context

  Options:
   -version     version of GCSI. generalized GCSI is selected by default. (default: 2)
   -window      number of windows. must be a power of 2. (default: 4096)
   -purine      use purine skew for calculation (default: 0)
   -keto        use keto skew for calculation (default: 0)
   -at          use AT skew for calculation (default: 0)
   -p           calculate p-value when GCSI version 2 is selected (default: 0)
                returned values are ($gcsi, $sa, $dist, $z, $p)

  References:
    1. Arakawa K, Tomita M (2007) "The GC skew index: a measure of genomic compositional 
       asymmetry and the degree of replicational selection", Evolutionary Bioinformatics, 3:145-154.
    2. Arakawa K, et al. (2009) "Quantitative analysis of replication-related mutation
       and selection pressures in bacterial chromosomes and plasmids using generalised GC skew index",
       BMC Genomics, 10:640.

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
   20090311-01 updated to version 2, with -p and -version options
   20080421-01 added -at, -purine and -keto options
   20070313-01 added error message for when genome size is too small
   20070707-01 initial posting

=cut


sub gcsi{
    require Math::FFT;

    opt_default(window=>4096, purine=>0, keto=>0, at=>0, version=>2, p=>0);
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);
    my %opt = opt_val();

    my $window = int(length($gb->{SEQ}) / $opt{'window'});
    if($window < 10){
	die("Error in gcsi: number of windows is too large, or the genome is too small.\n" . 
	    "GCSI would not be accurate for window size less than 100 bp.\n");
    }
    my @gcskew = gcskew($gb, -output=>'/dev/null', -window=>$window, -purine=>$opt{'purine'}, -keto=>$opt{'keto'}, -at=>$opt{'at'});

    while(scalar @gcskew > $opt{'window'}){
	pop @gcskew;
    }

    local *gcsicore = sub{
	my @gcskew = @_;
	my @cumgcskew = cumulative(@gcskew);

	my $fft = new Math::FFT([@gcskew]);
	my @spectrum = @{$fft->spctrm()};
	shift @spectrum;
	my $first = shift @spectrum;

	my $dist = abs(max(@cumgcskew)) + abs(min(@cumgcskew));
	my ($gcsi, $sr);

	if($opt{'version'} == 1){
	    $sr = $first/mean(@spectrum);
	    $gcsi = ($sr/6000 + $dist/600)/2;
	}else{
	    $dist *= 4096/$opt{'window'};
	    $sr = 40 * (($first * 6000 * 100) ** 0.4);
	    $gcsi = sqrt($sr/6000 * $dist/600);
	}

	return ($gcsi, $sr, $dist);
    };

    my ($gcsi, $sr, $dist) = gcsicore(@gcskew);

    if($opt{'p'}){
	require Statistics::Distributions;

	my @random;
	for (1..100){
	    my ($rgcsi, undef, undef) = gcsicore(shuffle(@gcskew));
	    push(@random, $rgcsi);
	}

	my $z = abs(($gcsi - mean(@random))/standard_deviation(@random));
	my $p = Statistics::Distributions::uprob($z) * 2;
#	my $rcmd = new Rcmd();
#	say $rcmd->normtest(@random);
	return ($gcsi, $sr, $dist, $z, $p);
    }

    if(wantarray()){
	return ($gcsi, $sr, $dist);
    }else{
	return $gcsi;
    }
		
}







=head2 gcskew

 Name: gcskew   -   calculate the GC skew of the given genome

 Description:
   This program calculates and graphs the GC skew. 

 Usage: 
    array @gcskew = gcskew(G instance);

 Options:
   -window      window size to observe (default: 10000)
   -slide       window slide size (default: same as window size)
   -cumulative  1 to calculate cumulative skew (default: 0)
   -at          1 when observing AT skew instead of GC skew (default: 0)
   -purine      1 when observing purine (AG/TC) skew (default: 0)
   -keto        1 when observing keto (TG/AC) skew (default: 0)
   -output      f for file output in directory "data", 
                g for graph output in directory "graph",
                show for graph output and display (default: "show")
   -filename    output filename (default: "gcskew.png" for -output=>"g",
                                          "gcskew.csv" for -output=>"f")

 Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

 History:
   20090601-01 added -slide option
   20070707-01 added -cumulative option
   20060711-01 added purine and keto skew
   20010905-01 update with options
   20010727-01 initial posting

=cut



sub gcskew {
    &opt_default(window=>10000, at=>0, purine=>0, keto=>0, output=>"show", 
		 filename=>"gcskew.png", cumulative=>0, slide=>undef);
    my @args       = opt_get(@_);
    my $gb         = opt_as_gb(shift @args);
    my $ref        = \$gb->{SEQ};
    my $len        = length($$ref);
    my $window     = opt_val("window");
    my $slide      = opt_val("slide") || $window;
    die('Error at gcskew(): window size too small') if ($window < 10);
    my $output     = opt_val("output");
    my $cumulative = '';
    $cumulative    = "Cumulative " if (opt_val("cumulative"));
    my $filename   = opt_val("filename");
    $filename      =~ s/\.png$/\.csv/ if (opt_val("output") eq 'f');
    my $at         = opt_val("at");
    my $purine     = opt_val("purine");
    my $keto       = opt_val("keto");
    my @gcskew     = ();
    my @location   = ();
    my ($tmp, $pos, $j, $i) = (0,0,0,0);

    while($len - $pos >= $window){
	my ($g, $c);

	if($at){
	    $g =  substr($$ref, $pos, $window) =~ tr/a/a/;
	    $c =  substr($$ref, $pos, $window) =~ tr/t/t/;
	}elsif($purine){
	    $g =  substr($$ref, $pos, $window) =~ tr/a/a/;
	    $g += substr($$ref, $pos, $window) =~ tr/g/g/;
	    $c =  substr($$ref, $pos, $window) =~ tr/t/t/;
	    $c += substr($$ref, $pos, $window) =~ tr/c/c/;
	}elsif($keto){
	    $g =  substr($$ref, $pos, $window) =~ tr/t/t/;
	    $g += substr($$ref, $pos, $window) =~ tr/g/g/;
	    $c =  substr($$ref, $pos, $window) =~ tr/a/a/;
	    $c += substr($$ref, $pos, $window) =~ tr/c/c/;
	}else{
	    $g =  substr($$ref, $pos, $window) =~ tr/g/g/;
	    $c =  substr($$ref, $pos, $window) =~ tr/c/c/;
	}

	if(length($cumulative)){
	    if ($c+$g <= 0){
		$tmp += 0;
	    }else{
		$tmp += sprintf("%.6f",($c-$g)/($c+$g));
	    }
	    $gcskew[$i] = $tmp;
	}else{
	    if ($c+$g <= 0){
		$gcskew[$i] = 0;
	    }else{
		$gcskew[$i] = sprintf("%.6f",($c-$g)/($c+$g));
	    }
	}

	$location[$i] = $pos;
	$pos += $slide;
	$i ++;
    }
    $i --;

    my $title = $cumulative . "GC skew";
    if ($at){
	$title = $cumulative . "AT skew";
    }elsif($purine){
	$title = $cumulative . "Purine skew";
    }elsif($keto){
	$title = $cumulative . "Keto skew";
    }

    if ($output eq 'g' || $output eq 'show'){
	mkdir ("graph", 0777);

	_UniMultiGrapher(
			 \@location,\@gcskew,
			 -x=>"bp", -y=>$title,
			 -filename=>$filename,
			 -title=>$title,
			 -style=>"lines", -type=>"columns",
			 );

	msg_gimv("graph/" . $filename) if ($output eq 'show');
    }elsif ($output eq 'f'){
	my $j = 0;
	mkdir ("data", 0777);

	open(OUT, ">data/" . $filename);
	print OUT "location,$title\n";
	for ($j = 0; $j <= $i; $j++){
	    print OUT $location[$j], ",", $gcskew[$j], "\n";
	}
	close(OUT);
    }
    
    return @gcskew;
}


sub cum_gcskew {
    msg_error("WARNING: cum_gcskew is deprecated since v.1.6.13.\n" .
	      "         This method will be removed in future releases.\n" .
	      "         use gcskew(\$gb, -cumulative=>1) instead!\n\n");

    return gcskew(@_, -cumulative=>1);
}



=head2 geneskew

 Name: geneskew   -   calculate the gene strand bias of the given genome

 Description:
   This program calculates and graphs the strand bias of genes (or the GC skew within them).
   By default, this program visualizes the gene strand preference (1 for direct, -1 for complement strand),
   but by specifying -base option  option, GC/AT/Purine/Keto skews of the coding regions
   or more specifically in the GC3 (third codon position) with -gc3 option can be calculated.

 Usage: 
    array @geneskew = geneskew(G instance);

 Options:
   -window      window size to observe (default: 10000)
   -slide       window slide size (default: same as window size)
   -cumulative  1 to calculate cumulative skew (default: 0)
   -base        'gc', 'at', 'purine', or 'keto' for observing GC/AT/Purine/Keto skews (default: none)
   -gc3         use only the third codon positions.
   -output      f for file output in directory "data", 
                g for graph output in directory "graph",
                show for graph output and display (default: "show")
   -filename    output filename (default: "gcskew.png" for -output=>"g",
                                          "gcskew.csv" for -output=>"f")

 Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

 History:
   20110216-01 initial posting

=cut



sub geneskew {
    &opt_default(output=>"show", filename=>"geneskew.png", cumulative=>0);
    my @args       = opt_get(@_);
    my $gb         = opt_as_gb(shift @args);
    my %opts       = opt_val();
    my @skew       = ();
    my @location   = ();

    if($opts{'gc3'} && length($opts{'base'}) < 1){
	$opts{'base'} = 'gc';
    }

    for my $cds ($gb->cds()){
	push(@location, $gb->{$cds}->{start});

	if($opts{'base'}){
	    my $geneseq = $gb->get_geneseq($cds);

	    if($opts{'gc3'}){
		my $seq = '';
		my $i = 0;
		for($i = 2; $i < length $geneseq; $i += 3){
		    $seq .= substr($geneseq, $i, 1);
		}
		$geneseq = $seq;
	    }
	    $geneseq = complement($geneseq) if ($gb->{$cds}->{direction} eq 'complement');

	    my $type = '-' . $opts{'base'};
	    my @gcskew = gcskew($geneseq, -window=>length($geneseq), -output=>"null", $type=>1);
	    push(@skew, $gcskew[0]);

	}else{
	    push(@skew, $gb->{$cds}->{direction} eq 'direct' ? 1:-1);
	}
    }
    @skew = cumulative(@skew) if ($opts{'cumulative'});

    my $cumulative = $opts{'cumulative'} == 1 ? 'Cumulative ' : '';
    my $type = '';
    if(length($opts{'base'})){
	$type = length($opts{'base'}) == 2 ? uc($opts{'base'}) : ucfirst($opts{'base'});
	$type .= ' ';
    }
    my $title = $cumulative . 'gene ' . $type . 'skew';
    $title .= ' (GC3)' if ($opts{'gc3'});

    if ($opts{'output'} eq 'g' || $opts{'output'} eq 'show'){
	mkdir ("graph", 0777);

	grapher(
		 \@location,\@skew,
		 -x=>"bp", -y=>$title,
		 -filename=>$opts{'filename'},
		 -title=>$title,
		 -style=>"lines", -type=>"columns",
		 -output=>$opts{'output'}
		);

    }elsif ($opts{'output'} eq 'f'){
	$opts{'filename'} =~ s/\.png$/\.csv/;
	my $j = 0;
	mkdir ("data", 0777);

	open(OUT, ">data/" . $opts{'filename'});
	print OUT "location,$title\n";
	for ($j = 0; $j <= scalar(@skew); $j++){
	    print OUT $location[$j], ",", $skew[$j], "\n";
	}
	close(OUT);
    }
    
    return @skew;
}



=head2 genomicskew

 Name: genomicskew   -   calculate the GC skew in different regions of the given genome

 Description:
   This program graphs the GC skew for the whole genome, coding regions,
   intergenic regions, and the third codon.

 Usage: 
   (\@gcskew, \@geneskew, \@betskew, \@thirdskew) = genomicskew($genome);

 Options:
   -divide      window number to divide into (default: 250)
   -at          1 when observing AT skew instead of GC skew (default: 0)
   -output      f for file output in directory "data", 
                g for graph output in directory "graph",
                show for graph output and display (default: "show")
   -filename    output filename (default: "genomicskew.png" for -output=>"g",
                                          "genomicskew.csv" for -output=>"f")
   -application application to open png image (default: "gimv")

 Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

 History:
   20090313-01 updated according to the update of $gb->intergenic() to 
               include stable RNA genes as genetic elements
   20080421-01 now returns references of result arrays
   20040610-01 updated to handle introns and exons
   20040601-01 bug fix for output=>"f" option
   20010905-01 updated options
   20010727-01 initial posting

=cut


sub genomicskew {
    &opt_default(divide=>250, at=>0, output=>"show", application=>"gimv",
		  filename=>"genomicskew.png", intron=>0);
    my @args = opt_get(@_);

    my $filename = opt_val("filename");
    $filename =~ s/\.png$/\.csv/ if (opt_val("output") eq 'f');
	
    my $gb = opt_as_gb(shift @args);
    my $divide = opt_val("divide");
    my $opt = opt_val("output");
    my $application = opt_val("application");
    my $at = opt_val("at");
    my $intron = opt_val("intron");
    my (@gcskew, @betskew, @geneskew, @thirdskew);
    my @location = (0..$divide);
    my ($j, $window, $CDS, $BET, $THIRD);

    foreach my $cds ($gb->cds()){
	next if (length $gb->{$cds}->{join});
	my $seq .= $gb->get_gbkseq($gb->{$cds}->{start}, $gb->{$cds}->{end});
	$CDS .= $seq;
       
        for($j = 2; $j <= length($seq); $j += 3){
            if ($gb->{"$cds"}->{direction} eq 'complement'){
                $THIRD .= substr($seq, $j, 1);
            }else{
                $THIRD .= substr($seq, $j - 2, 1);
            }
        }
    }

    foreach my $cds ($gb->intergenic()){
	$BET .= $gb->get_geneseq($cds);
    }

    my $i = 0;
    
    $window = int(length($gb->{SEQ}) / $divide);
    while($i <= $divide){
	my $g = substr($gb->{SEQ}, $window * $i, $window) =~ tr/g/g/;
	$g = substr($gb->{SEQ}, $window * $i, $window) =~ tr/a/a/ if ($at);
	my $c = substr($gb->{SEQ}, $window * $i, $window) =~ tr/c/c/;
	$c = substr($gb->{SEQ}, $window * $i, $window) =~ tr/t/t/ if ($at);
	$gcskew[$i] = 0;
	$gcskew[$i] = sprintf("%.6f",($c-$g)/($c+$g)) unless ($c+$g<1);
	$i ++;
    }
    $i = 0;
    
    $window = int(length($CDS) / $divide);
    while($i <= $divide){
	my $g = substr($CDS, $window * $i, $window) =~ tr/g/g/;
	$g = substr($CDS, $window * $i, $window) =~ tr/a/a/ if ($at);
	my $c = substr($CDS, $window * $i, $window) =~ tr/c/c/;
	$c = substr($CDS, $window * $i, $window) =~ tr/t/t/ if ($at);
	$geneskew[$i] = 0;
	$geneskew[$i] = sprintf("%.6f",($c-$g)/($c+$g)) unless ($c+$g<1);
	$i ++;
    }
    $i = 0;
    
    $window = int(length($BET) / $divide);
    while($i <= $divide){
	my $g = substr($BET, $window * $i, $window) =~ tr/g/g/;
	$g = substr($BET, $window * $i, $window) =~ tr/a/a/ if ($at);
	my $c = substr($BET, $window * $i, $window) =~ tr/c/c/;
	$c = substr($BET, $window * $i, $window) =~ tr/t/t/ if ($at);
	$betskew[$i] = 0;
	$betskew[$i] = sprintf("%.6f",($c-$g)/($c+$g)) unless ($c+$g<1);
	$i ++;
    }
    $i = 0;
    
    $window = int(length($THIRD) / $divide);
    while($i <= $divide){
	my $g = substr($THIRD, $window * $i, $window) =~ tr/g/g/;
	$g = substr($THIRD, $window * $i, $window) =~ tr/a/a/ if ($at);
	my $c = substr($THIRD, $window * $i, $window) =~ tr/c/c/;
	$c = substr($THIRD, $window * $i, $window) =~ tr/t/t/ if ($at);
	$thirdskew[$i] = 0;
	$thirdskew[$i] = sprintf("%.6f",($c-$g)/($c+$g)) unless ($c+$g<1);
	$i ++;
    }

    my $title = "GC skew";
    $title = "AT skew" if ($at);
    
    if ($opt eq "show" || $opt eq "g"){
	mkdir ("graph", 0777);
	_UniMultiGrapher(
			 \@location, 
			 -x=>"bp", -y=>$title,
			 \@gcskew, -x1=>"whole genome",
			 \@geneskew, -x2=>"coding region",
			 \@betskew, -x3=>"intergenic region",
			 \@thirdskew, -x4=>"codon third position",
			 -style=>"lines", -type=>"columns",
			 -filename=>$filename,
			 -title=>$title
			 );

	msg_gimv("graph/" . $filename) if ($opt eq 'show');
    }elsif ($opt eq 'f'){
	my $j = 0;
	mkdir ("data", 0777);
	open(OUT, ">data/" . $filename);
	print OUT "location,$title,coding,intergenic,third codon\n";
	for ($j = 0; $j <= $divide; $j++){
	    print OUT $location[$j], ",", $gcskew[$j], ",", $geneskew[$j], ",",
	    $betskew[$j], ",", $thirdskew[$j], ",", "\n";
	}
	close(OUT);
    }
    
    return (\@gcskew, \@geneskew, \@betskew, \@thirdskew);
}


=head2 gcwin

 Name: gcwin   -   calculate the GC content along the given genome

 Description:
   This program calculates and graphs the GC content.

 Usage: 
    array @gcwin = gcwin(G instance);

 Options:
   -window      window size to observe (default: 10000)
   -at          1 when observing AT content instead of GC content (default: 0)
   -purine      1 when observing purines (AG) skew (default: 0)
   -keto        1 when observing ketos (TG) skew (default: 0)
   -output      f for file output in directory "data", 
                g for graph output in directory "graph",
                show for graph output and display (default: "show")
   -filename    output filename (default: "gcwin.png" for -output=>"g",
                                          "gcwin.csv" for -output=>"f")

 Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

 History:
   20091007-01 changed to disregard N or X bases
   20070429-01 minor bug fix related to output=>"f" option
   20060711-01 added purine and keto options
   20010905-01 updated options
   20010729-01 initial posting

=cut




sub gcwin  {
    &opt_default(window=>10000, at=>0, purine=>0, keto=>0, output=>"show", 
		 application=>"gimv", filename=>"gcwin.png", skipGap=>0);
    my @args = opt_get(@_);

    my $gb = opt_as_gb(shift @args);
    my $ref = \$gb->{SEQ};
    my $window = opt_val("window");
    my $at = opt_val("at");
    my $purine = opt_val("purine");
    my $keto = opt_val("keto");
    my $skip = opt_val("skipGap");
    my $application = opt_val("application");
    my $filename = opt_val("filename");
    $filename =~ s/\.png$/\.csv/ if (opt_val("output") eq 'f');
    my $opt = opt_val("output");
    my (@gcwin, @location);
    my $j;
    
    my $i = 0;
    my ($g, $c, $rest);
    while(length($$ref) - ($window * $i) >= $window){
        if($at){
            $g = substr($$ref, $window * $i, $window) =~ tr/a/a/;
            $c = substr($$ref, $window * $i, $window) =~ tr/t/t/;
	    $rest =  substr($$ref, $window * $i, $window) =~ tr/g/g/;
	    $rest += substr($$ref, $window * $i, $window) =~ tr/c/c/;
        }elsif($purine){
            $g = substr($$ref, $window * $i, $window) =~ tr/g/g/;
            $c = substr($$ref, $window * $i, $window) =~ tr/a/a/;
	    $rest =  substr($$ref, $window * $i, $window) =~ tr/t/t/;
	    $rest += substr($$ref, $window * $i, $window) =~ tr/c/c/;
        }elsif($keto){
            $g = substr($$ref, $window * $i, $window) =~ tr/g/g/;
            $c = substr($$ref, $window * $i, $window) =~ tr/t/t/;
	    $rest =  substr($$ref, $window * $i, $window) =~ tr/a/a/;
	    $rest += substr($$ref, $window * $i, $window) =~ tr/c/c/;
	}else{
            $g = substr($$ref, $window * $i, $window) =~ tr/g/g/;
            $c = substr($$ref, $window * $i, $window) =~ tr/c/c/;
	    $rest =  substr($$ref, $window * $i, $window) =~ tr/a/a/;
	    $rest += substr($$ref, $window * $i, $window) =~ tr/t/t/;
        }
	$rest += $g + $c;

	if($rest == 0 && $skip){
	    $gcwin[$i] = $gcwin[$i - 1];
	}else{
	    $gcwin[$i] = $rest >= 1 ? sprintf("%.6f",($g+$c)/$rest) : 0;
	}
	$location[$i] = $i * $window;

	$i ++;
    }
    $i --;

    my $title = "GC content";
    if ($at){
        $title = "AT content";
    }elsif($purine){
        $title = "Purine content";
    }elsif($keto){
        $title = "Keto content";
    }
        
    if ($opt eq 'g' || $opt eq 'show'){
	mkdir ("graph", 0777);
	_UniMultiGrapher(
			 \@location, \@gcwin,
			 -x=>"bp", -y=>$title, 
			 -filename=>$filename,
			 -title=>$title, -style=>"lines", -type=>"columns"
			 );
	msg_gimv("graph/" . $filename)
	    if ($opt eq 'show');;
    }elsif ($opt eq 'f'){
	my $j = 0;
	mkdir ("data", 0777);
	open(OUT, ">data/" . $filename);
	print OUT "location,$title\n";
	for ($j = 0; $j <= $i; $j++){
	    print OUT $location[$j], ",", $gcwin[$j], "\n";
	}
	close(OUT);
    }
    
    return @gcwin;
}


=head2 coding_density

 Name: coding_density   -   calculate and plot the coding density along the given genome

 Description:
   This program calculates and graphs the coding density along the given genome.
   To obtain the overall genomic coding density (percentage of nucleotides in ORFs), 
   run the function in scalar context. 

 Usage: 
    array  @coding_density = coding_density(G instance); #list of windowed values
    scalar $coding_density = coding_density(G instance); #percentage in a genome

 Options:
   -window      window size to observe (default: 10000)
   -gene        plot gene counts instead of coding density
   -cumulative  plot cumulative graph
   -output      f for file output in directory "data", 
                g for graph output in directory "graph",
                show for graph output and display (default: "show")
   -filename    output filename (default: "coding_density.png" for -output=>"g",
                                          "coding_density.csv" for -output=>"f")

 Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

 History:
   20120619-01 initial posting

=cut




sub coding_density  {
    &opt_default(window=>10000, output=>"show", filename=>"coding_density.png", gene=>0, cumulative=>0);
    my @args = opt_get(@_);

    my $gb         = opt_as_gb(shift @args);
    my $window     = opt_val("window");
    my $filename   = opt_val("filename");
    my $gene       = opt_val("gene");
    my $cumulative = opt_val("cumulative");
    $filename      =~ s/\.png$/\.csv/ if (opt_val("output") eq 'f');
    my $opt        = opt_val("output");

    my (@pos, @num);
    my $j;

    for my $cds ($gb->cds()){
	if(length($gb->{$cds}->{join})){
	    for my $segment (split(/,/, $gb->{$cds}->{join})){
		$segment =~ s/c//g;
		my ($start, $end) = split(/\.\./, $segment, 2);
		substr($gb->{SEQ}, $start - 1, $end - $start + 1) = "P" x ($end - $start + 1);
	    }
	}else{
	    substr($gb->{SEQ}, $gb->{$cds}->{start} - 1, $gb->{$cds}->{end} - $gb->{$cds}->{start} + 1) = 
		"P" x ($gb->{$cds}->{end} - $gb->{$cds}->{start} + 1);
	}
    }

    if($gene){
	for (my $i = 0; $i + $window <= length($gb->{SEQ}); $i += $window){
	    push(@pos, $i);
	    
	    my $j = 0;
	    for my $cds ($gb->cds()){
		$j ++ if ($i < $gb->{$cds}->{start} && $gb->{$cds}->{start} <= $i + $window);
	    }
	    push(@num, $j);
	}
    }else{
	for (my $i = 0; $i + $window <= length($gb->{SEQ}); $i += $window){
	    my $seq = substr($gb->{SEQ}, $i, $window);
	    my $P = $seq =~ tr/P/P/;
	    push(@pos, $i);
	    push(@num, $P/$window);
	}
    }

    @num = cumulative(@num, -mean=>1) if($cumulative);

    my $title = "coding density";
    $title = 'cumulative ' . $title;
    $title .= ' (gene count)' if ($gene);

    if ($opt eq 'g' || $opt eq 'show'){
	mkdir ("graph", 0777);
	grapher(
		\@pos, \@num,
		-x=>"bp", -y=>$title, 
		-filename=>$filename,
		-title=>$title, -style=>"lines", -type=>"columns",
		-output=>$opt
		);
    }elsif ($opt eq 'f'){
	my $j = 0;
	mkdir ("data", 0777);
	open(OUT, ">data/" . $filename);
	print OUT "location,$title\n";
	for ($j = 0; $j <= $#pos; $j++){
	    print OUT $pos[$j], ",", $num[$j], "\n";
	}
	close(OUT);
    }
    
    if(wantarray()){
	return @num;
    }else{
	my $n = $gb->{SEQ} =~ tr/P/P/;
	return ($n / length($gb->{SEQ}));
    }
}



=head2 query_strand

  Name: query_strand   -   get the strand name (leading or lagging) from the given position

  Description:
    Given a position and strand information (direct or complement), 
    returns whether the specified position is in the leading or lagging strand.

  Usage: 
    string strand = query_strand(G instance, int position);

    or

    string strand = query_strand(G instance, CDS/FEATURE id);

 Options: 
    -direction    strand of the querying position, either 'direct' or 'complement'
                  (default: direct)

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20070106-01 added CDS option
    20020218-01 initial posting

=cut




sub query_strand {
    opt_default(direction=>'direct');
    my @args = opt_get(@_);
    my $gb = shift @args;
    my $pos = shift @args;
    my $direction = opt_val("direction");

    if($pos =~ /^FEATURE/ || /^CDS/){
	$direction = $gb->{$pos}->{direction};
	$pos = $gb->{$pos}->{start};
    }

    my ($ori, $ter) = rep_ori_ter($gb);

    if ($ori > $ter){
	if ($pos < $ter || $pos > $ori){
	    if ($direction eq 'complement'){
		return ("lagging");
	    }else{
		return ("leading");
	    }
	}else{
	    if ($direction eq 'complement'){
		return ("leading");
	    }else{
		return ("lagging");
	    }
	}
    }else{
	if ($pos > $ori && $pos < $ter){
	    if ($direction eq 'complement'){
		return ("lagging");
	    }else{
		return ("leading");
	    }
	}else{
	    if ($direction eq 'complement'){
		return ("leading");
	    }else{
		return ("lagging");
	    }
	}
    }
}






=head2 query_arm

  Name: query_arm   -   get the replication arm name (left or right) from the given position

  Description:
    Given a position, returns whether the specified position is in the 
    left or right arm of the circular chromosome.

  Usage: 
    string arm = query_arm(G instance, int position);

  Options: 
    None.

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20070112-01 initial posting

=cut



sub query_arm{
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);
    my $pos = shift @args;

    if($pos =~ /^FEATURE/ || /^CDS/){
	$pos = $gb->{$pos}->{start};
    }
    
    my ($ori, $ter) = rep_ori_ter($gb);

    if($ori < $ter){
	if($pos <= $ori){
	    return 'left';
	}elsif($pos >= $ori && $pos <= $ter){
	    return 'right';
	}elsif($pos >= $ter){
	    return 'left';
	}
    }else{
	if($pos < $ter){
	    return 'right';
	}elsif($pos >= $ter && $pos <= $ori){
	    return 'left';
	}elsif($pos >= $ori){
	    return 'right';
	}
    }
}


=head2 set_strand 

  Name: set_strand   -   set replication strand and arm information to given G instance

  Description:
    Sets $gb->{$cds}->{strand} and $gb->{$cds}->{arm} using
    query_strand() and query_arm(), indicating in which strand
    or replication arm the gene resides.

  Usage: 
    1 = set_strand($gb)

 Options: 
    None.

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20070112-01 initial posting

=cut

sub set_strand{
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);

    foreach my $cds ($gb->cds()){
	$gb->{$cds}->{strand} = query_strand($gb, $cds);
	$gb->{$cds}->{arm} = query_arm($gb, $cds);
    }

    return 1;
}




=head2 set_gc3

  Name: set_gc3   -   set GC content in 3rd codon position of all genes

  Description:
    Sets $gb->{$cds}->{gc3}, GC content in 3rd codon position.
    Value is in decimal (eg. 0.56345).

  Usage: 
    1 = set_gc3($gb)

 Options: 
    None.

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20070116-01 initial posting

=cut



sub set_gc3{
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);

    foreach my $cds ($gb->cds()){
	my $geneseq = $gb->get_geneseq($cds);

	my (%gc3, $tot);

	my $i = 0;
	for($i = 2; $i < length $geneseq; $i += 3){
	    $gc3{substr($geneseq, $i, 1)}++;
	    $tot ++;
	}

	$gb->{$cds}->{gc3} = ($gc3{g} + $gc3{c})/$tot;
    }

    return 1;
}





=head2 genes_from_ori

 Name: genes_from_ori   -   get a list of CDS IDs ordered in the distance from origin of replication

 Description:
   This program lists genes in order relative to the position of 
   replication origin in either right or left half of the bacterial
   chromosomes.

  Usage: array @genes = genes_from_ori(G instance, "right");

 Options:
   Second argument should be eighter "right" or "left" to indicate
   the interested half of the bacterial chromosome. If omitted, 
   returns list of genes on both arms in the order of distance
   from replication origin.

 Note:
   Origin and terminus of replication is obtained from rep_ori_ter()

 Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

 History:
   20070106-01 initial posting
   20070112-01 modified the handling of second argument
=cut



sub genes_from_ori{
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);
    my $wing = shift @args;

    my ($ori, $ter) = rep_ori_ter($gb);

    my (@sectionA, @sectionB, @sectionC, @left, @right);
    
    if($ori < $ter){
	foreach my $cds ($gb->cds()){

	    if($gb->{$cds}->{start} <= $ori){
		push(@sectionA, $cds);
	    }elsif($gb->{$cds}->{start} >= $ori && $gb->{$cds}->{start} <= $ter){
		push(@sectionB, $cds);
	    }elsif($gb->{$cds}->{start} >= $ter){
		push(@sectionC, $cds);
	    }else{
		warn("Something is wrong at G::Seq::GCskew::genes_from_ori()");
	    }
	}

	@left = (reverse(@sectionA), reverse(@sectionC));
	@right = @sectionB;

    }else{
	foreach my $cds ($gb->cds()){

	    if($gb->{$cds}->{start} < $ter){
		push(@sectionA, $cds);
	    }elsif($gb->{$cds}->{start} >= $ter && $gb->{$cds}->{start} <= $ori){
		push(@sectionB, $cds);
	    }elsif($gb->{$cds}->{start} >= $ori){
		push(@sectionC, $cds);
	    }else{
		warn("Something is wrong at G::Seq::GCskew::genes_from_ori()");
	    }
	}
	
	@left = (reverse(@sectionB));
	@right = (@sectionC, @sectionA);
	
    }

    if(lc($wing) =~ /l/){
	return @left;
    }elsif(lc($wing) =~ /r/){
	return @right;
    }else{
	my %hash;
	foreach my $cds ($gb->cds()){
	    $hash{$cds} = dist_in_cc($gb, $gb->{$cds}->{start});
	}

	my @all = sort{ $hash{$a} <=> $hash{$b}} keys %hash;
	return @all;
    }
}




=head2 dist_in_cc

 Name: dist_in_cc   -   calculates the distance between two loci in circular chromosomes

 Description:
   This program calculates the distance between two loci in 
   circular chromosomes, mostly useful to calculate the
   distance from the replication origin.

  Usage: int distance = dist_in_cc(G instance, int position1, int position2);

 Options:
   If the second position is not given, position of replication origin is used.

 Note:
   Origin and terminus of replication is obtained from rep_ori_ter()

 Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

 History:
   20070112-01 initial posting
=cut



sub dist_in_cc{
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);
    my $first = shift @args;
    my $second = shift @args;

    $first = $gb->{$first}->{start} if($first =~ /^FEATURE/ || /^CDS/);
    $second = $gb->{$second}->{start} if($second =~ /^FEATURE/ || /^CDS/);

    unless(length($second)){
	my ($ori, $ter) = rep_ori_ter($gb);
	$second = $ori;
    }

    my @dist;

    $dist[0] = abs($first - $second);
    $dist[1] = abs($first + length($gb->{SEQ}) - $second);
    $dist[2] = abs($first - (length($gb->{SEQ}) + $second));
    
    my @new = sort {$a <=> $b} @dist;

    return shift @new;
}




=head2 lda_bias

  Name: lda_bias   -   calculate strand bias of bacterial genome using linear discriminant analysis (LDA)

  Description:
    This method calculate strand bias of bacterial genome using linear discriminant 
    analysis (LDA), as proposed in Reference 1. The basic idea is to use composition
    data of genes to train and predict the strand of genes residing either on the 
    leading or the lagging strand. For computational efficiency, this method trans
    and predicts the strands at putative replication origin as reported by the 
    rep_ori_ter() method. This usually results in maximum predictability of LDA
    within bacterial genomes.

    Data to use for LDA can be chosen from "base", "codonbase", "codon", and "amino",
    with -variable option.

    Installation of R statistics package is required.

 Usage: 
    $bias = lda_bias($genome);

 References:
   1. Rocha EPC et al. (1999) "Universal replication biases in bacteria",
      Molecular Microbiology, 32(1):11-16

  Options:
   -variable        data to use for LDA. Either "base", "codonbase", "codon", or "amino".
                    (default: codon)
   -coefficients    show LDA coefficients (default: 0)

  Author: Kazuharu Arakawa

  History:
    20110223-01 initial posting

=cut



sub lda_bias{
    &opt_default('variable'=>'codon', 'coefficients'=>0);
    my @args = opt_get(@_);
    my $gb   = opt_as_gb(shift @args);
    my %data = opt_val();

    my $fh = File::Temp->new();
    my $file = $fh->filename;

    if($data{'variable'} eq 'base'){
	print $fh "strand,a,t,g,c\n";
	for my $cds ($gb->cds()){
	    my $seq = $gb->get_geneseq($cds);
	    print $fh join(',', query_strand($gb, $cds), map{$_/length($seq)} seqinfo($seq, -output=>"null")), "\n";
	}

    }elsif($data{'variable'} eq 'codonbase'){
	print $fh "strand,1a,1t,1g,1c,2a,2t,2g,2c,3a,3t,3g,3c\n";

	for my $cds ($gb->cds()){
	    my @pos;
	    my $seq = $gb->get_geneseq($cds);
	    for (my $i = 0; $i < length($seq); $i ++){
		$pos[$i % 3] .= substr($seq, $i, 1);
	    }

	    print $fh join(',', query_strand($gb, $cds), 
			   (map{$_/length($pos[0])} seqinfo($pos[0], -output=>"null")), 
			   (map{$_/length($pos[1])} seqinfo($pos[1], -output=>"null")), 
			   (map{$_/length($pos[2])} seqinfo($pos[2], -output=>"null"))
			   ), "\n";
	}

    }elsif($data{'variable'} eq 'codon'){
	my $genomic = codon_compiler($gb, -output=>"null", -data=>'R0');
	for (keys %$genomic){
	    delete($$genomic{$_}) if ($$genomic{$_} < scalar($gb->cds())/2);
	}
	print $fh join(',', 'strand', sort keys %$genomic), "\n";

	for my $cds ($gb->cds()){
	    my @res;
	    my $codon = codon_compiler($gb, -output=>"null", -data=>'R1', -id=>$cds);
	    for my $key (sort keys %$genomic){
		push(@res, $$codon{$key} || 0);
	    }
	    print $fh join(',', query_strand($gb, $cds), @res), "\n";
	}

    }elsif($data{'variable'} eq 'amino'){
	my $genomic = codon_compiler($gb, -output=>"null", -data=>'A0');
	for (keys %$genomic){
	    delete($$genomic{$_}) if ($$genomic{$_} < scalar($gb->cds())/2);
	}
	print $fh join(',', 'strand', sort keys %$genomic), "\n";

	for my $cds ($gb->cds()){
	    my @res;
	    my $codon = codon_compiler($gb, -output=>"null", -data=>'A1', -id=>$cds);
	    for my $key (sort keys %$genomic){
		push(@res, $$codon{$key} || 0);
	    }
	    print $fh join(',', query_strand($gb, $cds), @res), "\n";
	}

    }else{
	die("lda_bias: Bad parameter for -variable option. Choose from 'base', 'codonbase', 'codon', and 'amino'.");
    }

#    my $g = $data{'coefficients'} ? 'g' : '';
    my $r = new Rcmd;
    $r->set_mode('silent');

    my $ret =  $r->exec(
			qq|
			X = read.csv('$file', header=TRUE);
			library(MASS);
			g = lda(strand~.,data=X);
			g;
			v = predict(g,X);
			p = sum(v\$class == X[,1])/length(X[,1]);
			p;
			|
			);

    msg_error(readFile($r->{log})) if($data{'coefficients'});
    return $ret;
}






=head2 B1

  Name: B1   -   calculate strand bias of bacterial genome using B1 index

  Description:
    This method calculate strand bias of bacterial genome using B1 index,
    first proposed by Lobry and Sueoka in Reference 1, and further extended by 
    Rocha et al. in Reference 2. Basic idea of B1 index is to calculate the 
    distance between the two strands, when the leading and lagging strands
    are plotted in a coordinate system with axes representing G/(G+C) and A/(A+T),
    using the third codon position of genes. This index measures the degree of 
    replication-induced bias from Chargaff's second parity rule.

    Rocha et al. modified B1 index to only use >fourfold degenerate codons,
    and to use T/(A+T) in place of A/(A+T).

 Usage: 
    $bias = B1($genome);

 References:
   1. Lobry JR and Sueoka N (2002) "Asymmetric directional mutation pressures in bacteria",
      Genome Biology, 3(10):0058
   2. Rocha EPC et al. (2006) "Similar compositional biases are caused by very different
      mutational effects", Genome Research, 16:1537-1547

  Options:
   -method         choose the method of 'lobry' or 'rocha' (default: 'rocha')

  Author: Kazuharu Arakawa

  History:
    20110223-01 initial posting

=cut



sub B1{
    &opt_default('method'=>'rocha');
    my @args   = opt_get(@_);
    my $gb     = opt_as_gb(shift @args);
    my %option = opt_val();

    my $data = {};

    for my $cds ($gb->cds()){
	my $seq    = $gb->get_geneseq($cds);
	my $strand = query_strand($gb, $cds);

	for (my $i = 0; $i < length($seq); $i +=3){
	    if($option{'method'} eq 'lobry'){
		$data->{$strand}->{substr($seq, $i+2, 1)}++;
	    }else{
		my $codon = substr($seq, $i, 2);
		if($codon eq 'gc' || $codon eq 'gg' || 
		   $codon eq 'cc' || $codon eq 'ac' || $codon eq 'gt'  
		   || $codon eq 'ct' || $codon eq 'tc' || $codon eq 'cg'
		   ){

		    $data->{$strand}->{substr($seq, $i+2, 1)}++;		    
		}
	    }
	}
    }

    my $nuc = $option{'method'} eq 'lobry' ? 'a' : 't';
    
    return sqrt( 
		   ($data->{'leading'}->{'g'} /($data->{'leading'}->{'g'}+$data->{'leading'}->{'c'}) 
		  - $data->{'lagging'}->{'g'} /($data->{'lagging'}->{'g'}+$data->{'lagging'}->{'c'})) ** 2 +
		   ($data->{'leading'}->{$nuc}/($data->{'leading'}->{'a'}+$data->{'leading'}->{'t'}) 
		  - $data->{'lagging'}->{$nuc}/($data->{'lagging'}->{'a'}+$data->{'lagging'}->{'t'})) ** 2 );
}


=head2 B2

  Name: B2   -   calculate strand bias of bacterial genome using B2 index

  Description:
    This method calculate strand bias of bacterial genome using B2 index,
    proposed by Lobry and Sueoka in Reference 1. Basic idea of B2 index is 
    to calculate the distance from neutral parity state (0.5, 0.5), when the bias of
    the coding regions is plotted in a coordinate system with axes representing 
    G/(G+C) and A/(A+T), using the third codon position of genes. This index measures 
    the degree of transcription- and translation-associated effects of bias from 
    Chargaff's second parity rule.

 Usage: 
    $bias = B2($genome);

 References:
   1. Lobry JR and Sueoka N (2002) "Asymmetric directional mutation pressures in bacteria",
      Genome Biology, 3(10):0058

  Options:
    None.

  Author: Kazuharu Arakawa

  History:
    20110223-01 initial posting

=cut


sub B2{
    &opt_default();
    my @args   = opt_get(@_);
    my $gb     = opt_as_gb(shift @args);
    my %option = opt_val();
    my %data;

    for my $cds ($gb->cds()){
	my $seq    = $gb->get_geneseq($cds);

	for (my $i = 0; $i < length($seq); $i +=3){
	    $data{substr($seq, $i+2, 1)}++;
	}
    }

    return sqrt( ($data{'g'}/($data{'g'}+$data{'c'}) - 0.5) ** 2 + ($data{'a'}/($data{'a'}+$data{'t'}) - 0.5) ** 2 );
}





=head2 delta_gcskew

  Name: delta_gcskew   -   calculate strand bias of bacterial genome using delta GC skew index

  Description:
    This method calculate strand bias of bacterial genome using delta GC skew index,
    first proposed by Rocha et al in Reference 1, and further extended  
    in Reference 2. Basic idea of delta GC skew index is to calculate the 
    difference of GC skew in coding regions residing in leading and lagging strands.
    Reference 1 calculates delta GC skew index using the third codon position of genes,
    and Reference 2  modified to only use >fourfold degenerate codons.

 Usage: 
    $bias = delta_gcskew($genome);

 References:
   1. Rocha EPC et al. (2001) "Ongoing Evolution of Strand Composition in Bacterial Genomes",
      Molecular Biology and Evolution, 18(9):1789-1799
   2. Rocha EPC et al. (2006) "Similar compositional biases are caused by very different
      mutational effects", Genome Research, 16:1537-1547

  Options:
   -method      choose the nucleotides to use 'degenerate', 'gc3', or 'all' (default: 'degenerate')
   -at          1 when observing AT skew instead of GC skew (default: 0)
   -purine      1 when observing purine (AG/TC) skew (default: 0)
   -keto        1 when observing keto (TG/AC) skew (default: 0)

  Author: Kazuharu Arakawa

  History:
    20110223-01 initial posting

=cut



sub delta_gcskew{
    &opt_default('method'=>'degenerate');
    my @args   = opt_get(@_);
    my $gb     = opt_as_gb(shift @args);
    my %option = opt_val();

    my %data;

    for my $cds ($gb->cds()){
	my $seq    = $gb->get_geneseq($cds);
	my $strand = query_strand($gb, $cds);

	if($option{'method'} eq 'all'){
	    $data{$strand} .= $seq;
	}elsif($option{'method'} eq 'gc3'){
	    for (my $i = 0; $i < length($seq); $i +=3){
		$data{$strand} .= substr($seq, $i+2, 1);
	    }
	}else{
	    for (my $i = 0; $i < length($seq); $i +=3){
		my $codon = substr($seq, $i, 2);
		if($codon eq 'gc' || $codon eq 'gg' || 
		   $codon eq 'cc' || $codon eq 'ac' || $codon eq 'gt'  
		   || $codon eq 'ct' || $codon eq 'tc' || $codon eq 'cg'
		   ){
			$data{$strand} .= substr($seq, $i+2, 1);
		    }
	    }
	}
    }

    my %send;
    for my $key (keys %option){
	$send{'-' . $key} = $option{$key};
    }

    my @lead = gcskew($data{'leading'}, %send, -output=>"null", -window=>length($data{'leading'}));
    my @lagg = gcskew($data{'lagging'}, %send, -output=>"null", -window=>length($data{'lagging'}));

    return $lead[0] - $lagg[0];
}







1;
