#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# This file is part of G-language Genome Analysis Environment package
#
#     Copyright (C) 2001-2013 Keio University
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# 
#   $Id: Util.pm,v 1.1.1.1 2002/04/02 20:25:43 gaou Exp $
#
# G-language GAE is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
# 
# G-language GAE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with G-language GAE -- see the file COPYING.
# If not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# 
#END_HEADER
#
# written by Kazuharu Arakawa <gaou@sfc.keio.ac.jp> at
# G-language Project, Institute for Advanced Biosciences, Keio University.
#

package G::Seq::Util;

use SubOpt;
use G::Messenger;
use G::Seq::Primitive;
use G::Tools::Graph;
use G::DB::SDB;
use G::IO;
use G::IO::Annotation;

use strict;
use base qw(Exporter);
use autouse 'Cwd'=>qw(getcwd);
use SelfLoader;
use File::ShareDir ':ALL';

our @EXPORT = qw(
		 seqinfo
		 longest_ORF
		 molecular_weight
		 view_cds
		 generate_oligomers
		 oligomer_translation
		 filter_cds_by_atg
		 find_king_of_gene
		 refseq2gbk
		 );


#:::::::::::::::::::::::::::::::::
#       Perldoc
#:::::::::::::::::::::::::::::::::


=head1 NAME

  G::Seq::Util - Miscellaneous analysis methods related to sequence analysis.

=head1 DESCRIPTION

    This class is a part of G-language Genome Analysis Environment, 
    collecting miscellaneous sequence analysis methods.

=cut

__DATA__

#::::::::::::::::::::::::::::::
#        Methods Start
#::::::::::::::::::::::::::::::

=head2 generate_oligomers

  Name: generate_oligomers   -   generate all oligomers of given length

  Description:
    This method retuns a list of all oligomers of given length. 
    For example, for length 2, this method returns a list of all 
    di-nucleotides (aa,at,ag,ac,ta,tt,tg,tc,ga,gt,gg,gc,ca,ct,cg,cc),
    and for length 8, returns all 65536 octamers.

 Usage: 
    @oligomers = generate_oligomers($length)

 Options:
    None.

  Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20100114-01 initial posting

=cut

sub generate_oligomers{
    my $num = shift || 8;

    local *transferase = sub{
	my @result;
	foreach my $key (@_){
	    push(@result, $key . $_) for (qw/a t g c/);
	}
	return @result;
    };

    my @result = ('');
    for (1..$num){
	@result = &transferase(@result);
    }
    return @result;
}


=head2 seqinfo

  Name: seqinfo   -   prints out basic nucleotide sequence statistics

  Description:
    This method prints out basic compositional statistics of the 
    given nucleotide sequence, in a format similar to the one printed
    right after calling load(). Returns the number of A, T, G, and C bases.

 Usage: 
    ($a, $t, $g, $c) = seqinfo($genome)

 Options:
    Supply something as the second argument to suppress standard output.
    ex. ($a, $t, $g, $c) = seqinfo($genome, 1)
    This is useful for simple counting of the four nucleotides.

  Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20100108-01 added the second argument suppresing output.
    20080428-01 integrated into G::IO::Handler::seq_info()
    20020207-01 initial posting

=cut


sub seqinfo {
    return opt_as_gb(shift @_)->seq_info(shift @_);
}


=head2 longest_ORF

  Name: longest_ORF   -   find longest ORFs within a given sequence

  Description:
    This method searches for putative coding regions by longest ORF
    method, the longest spanning sequence seqment starting from "atg"
    and ending with "tag", "taa", or "tgg" in the same coding frame.

    Returning object is the G-language genome data object, so in 
    Shell, typing the following after calling this method shows the
    list of putative ORFs:
         $annotated_data->find(-type=>"CDS");

 Usage: 
    $genome = longest_ORF($sequence);

 Options:
   none

  Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20070914-01 rewrite, and moved from G::Seq::ORF
    20010829-01 initial posting

=cut


sub longest_ORF{
    my $this = opt_as_gb(shift @_);
    my $new = new G::IO("blessed");
    annotate_with_LORF($new, $this);

    return $new;
}

=head2 molecular_weight

  Name: molecular_weight   -   calculates the molecular weight of given sequence or molecule

  Description:
    This method calculates the molecular weight of the given sequence, either ssDNA, dsDNA, RNA, Peptide,
    or that of molecular compounds from chemical formula.

    For RNA, 
      M.W. = (An x 329.2) + (Un x 306.2) + (Cn x 305.2) + (Gn x 345.2) + 159
    For ssDNA,
      M.W. = (An x 313.2) + (Tn x 304.2) + (Cn x 289.2) + (Gn x 329.2) + 79.0
    For dsDNA,
      M.W. of dsDNA = (nucleotides x 607.4) + 157.9

    For peptides and chemical formula, mass is calculated by the cumulative addition of each 
    amino acid or atoms. For example, 

      molecular_weight("C6H12O6") #glucose

    returns 180.0633881166

 Usage: 
    $mass = molecular_weight($sequence);

 Options:
    -type        automatically guessed, but can be optionally specified
                 to be "ssDNA" (or just "DNA"), "dsDNA", "RNA", "peptide", or "molecule"

  Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20130130-01 major update to include DNA, RNA, peptide, and molecules in general
    20011029-01 initial posting

=cut

sub molecular_weight {
    my @args   = opt_get(@_);
    my $seq    = shift @args;
    my $strand = opt_val("strand");
    my $type   = lc(opt_val("type"));

    unless(length($type)){
	if($seq =~ /[0-9]/){
	    $type = "molecule";
	}elsif(lc($seq) =~ /^[atgcn]+$/){
	    $type = "ssdna";
	}elsif(lc($seq) =~ /^[augcn]+$/){
	    $type = "rna";
	}else{
	    $type = "peptide";
	}
    }
    $type = "ssdna" if($type eq 'dna');

    if($type eq "molecule"){
	my %atom;
	for my $line (readFile(dist_file('g-language', 'data/atom.csv')), 1){
	    my ($atomm, $mass) = split(/,/, $line,2);
	    $atom{$atomm} = $mass;
	}

	my $res;
	while($seq =~ /([A-Z][a-z]*)(\d*)/g){
	    my $prod = $2 || 1;
	    $res += $atom{$1} * $prod;
	}
	return $res;
    }else{
	my (%count, $res);
	$count{$_}++ for (split(//, lc($seq)));

	if($type eq "peptide"){
	    require G::Seq::AminoAcid;
	    my %mass = G::Seq::AminoAcid::mass();

	    for my $key (keys %count){
		$res += $mass{uc($key)} * $count{$key};
	    }

	    return $res;
	}elsif($type eq "ssdna"){
	    return $count{'a'} * 313.2 + $count{'u'} * 304.2 + $count{'t'} * 304.2 + $count{'c'}*289.2 + $count{'g'}*329.2 + $count{'n'} * 303.7 + 79;
	}elsif($type eq "rna"){
	    return $count{'a'} * 329.2 + $count{'u'} * 306.2 + $count{'t'} * 306.2 + $count{'c'}*305.2 + $count{'g'}*345.2 + $count{'n'} * 320.5 + 159;
	}elsif($type eq "dsdna"){
	    return length($seq) * 607.4 + 157.9;
	}else{
	    die("Unknown type $type in molecular_weight()");
	}
    }
}


#find_king_of_gene ver.20010608-01
#scripting by Koya Mori(s98982km@sfc.keio.ac.jp)
#This program finds king of gene.
#have fun:)
#(string)=&find_king_of_gene(pointer GENOME,  boolean debug);

sub find_king_of_gene{
    my $nuc=shift;
    my $gene='you have just found the king of genes.'."\n";
    
    system("wget http://www.stagnightout.com/pics/what-to-wear/21280.jpg -O /tmp/afro.jpg -q");
    msg_gimv('/tmp/afro.jpg');
    
    return $gene;
}




# oligomer_translation ver.20011103-01
# Author: Kazuharu Arakawa
# Usage: (string translation) = &_oligomer_translation(string oligomer,
#         int frame);
# Options:
#   none
# Description:
#   This method returns the mixture of translated amino acid sequence
#   with untranslated nucleotide sequence of an oligomer of given
#   reading frame.

sub oligomer_translation {
    my @args = opt_get(@_);
    my $seq = shift @args;
    my $frame = shift @args;
    my $len = length($seq);
    if ($frame > 3){
	$seq = G::Seq::Util::complement($seq);
	$frame -= 3;
    }

    my %CodonTable = (
               'gac', 'D', 'caa', 'Q', 'gca', 'A', 'ctg', 'L',
               'gat', 'D', 'cag', 'Q', 'gcc', 'A', 'ctt', 'L',
               'gaa', 'E', 'agc', 'S', 'gcg', 'A', 'ata', 'I',
               'gag', 'E', 'agt', 'S', 'gct', 'A', 'atc', 'I',
               'aga', 'R', 'tca', 'S', 'gga', 'G', 'att', 'I',
               'agg', 'R', 'tcc', 'S', 'ggc', 'G', 'cca', 'P',
               'cga', 'R', 'tcg', 'S', 'ggg', 'G', 'ccc', 'P',
               'cgc', 'R', 'tct', 'S', 'ggt', 'G', 'ccg', 'P',
               'cgg', 'R', 'aca', 'T', 'gta', 'V', 'cct', 'P',
               'cgt', 'R', 'acc', 'T', 'gtc', 'V', 'atg', 'M',
               'aaa', 'K', 'acg', 'T', 'gtg', 'V', 'tgg', 'W',
               'aag', 'K', 'act', 'T', 'gtt', 'V', 'tgc', 'C',
               'cac', 'H', 'tac', 'Y', 'tta', 'L', 'tgt', 'C',
               'cat', 'H', 'tat', 'Y', 'ttg', 'L', 'taa', '/',
               'aac', 'N', 'ttc', 'F', 'cta', 'L', 'tag', '/',
               'aat', 'N', 'ttt', 'F', 'ctc', 'L', 'tga', '/'
                  );

    my $return = '';
    my $i;
    for ($i = 0; $i < $len; $i ++){
	if ($i < $frame - 1){
	    $return .= substr($seq, $i, $frame - 1) . '-';
	    $i += $frame - 2;
	} elsif ($i + 3 <= $len){
	    $return .= $CodonTable{substr($seq, $i, 3)};
	    $i += 2;
	    $return .= '-' unless ($i >= $len - 1);
	} else {
	    $return .= substr($seq, $i);
	    last;
	}
    }
    return $return;
}




=head2 view_cds

  Name: view_cds   -   displays a graph of nucleotide contents around start and stop codons

  Description:
    This method creates a graph showing the average A,T,G,C contents
    around start/stop codons. This is useful to view consensus around
    start/stop codons and to find characteristic pattern in CDS. 
    
  Usage : 
    view_cds(G instance);

  Options:
    -length    length in bases to show around start/stop codons
               (default: 100)
    -gap       gap shown in graph in between start/stop codon neighbors
               (default: 3)
    -filename  outfile name   (default: view_cds.png for graph, 
               view_cds.csv for file)
    -output    "f" for file, "g" for graph, "show" to display graph. 
               (default: "show")

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20070914-01 code optimization
    20070707-01 moved to G::Seq::Util from G::Seq::GCskew
    20010906-01 initial posting

=cut



sub view_cds{
    opt_default(length=>100, filename=>"view_cds.png", 
		  gap=>3, output=>"show", application=>"gimv");
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);
    my $data = {};
    my @pos;
    my $numcds = scalar $gb->cds();
    my $i = 0;
    my $gap = opt_val("gap");
    my $length = opt_val("length");
    my $filename = opt_val("filename");
    my $output = opt_val("output");
    my $application = opt_val("application");

    $filename = "view_cds.csv" if ($output eq "f" &&
				   opt_val("filename") eq "view_cds.png");
    for my $nuc (qw/a t g c/){
	for ($i = 0; $i < $length * 4 + 6 + $gap; $i++){
	    $data->{$nuc}->[i] = 0;
	}
    }

    foreach my $cds ($gb->cds()){
	my $seq = $gb->around_startcodon($cds, $length, $length);
	
	for ($i = 0; $i < length($seq); $i ++){
	    $data->{substr($seq, $i, 1)}->[$i] += 100/$numcds;
	}

	$seq  = $gb->around_stopcodon($cds, $length, $length);

	for ($i = 0; $i < length($seq); $i ++){
	    $data->{substr($seq, $i, 1)}->[$i + length($seq) + $gap] += 100/$numcds;
	}
    }
    
    for ($i = 1; $i <= $length * 4 + 6 + $gap; $i ++){
	push(@pos, $i);
    }

    if ($output eq "g" || $output eq "show"){
	_UniMultiGrapher(
			 \@pos, -x => "position", -y => "percentage",
			 [@{$data->{a}}], [@{$data->{t}}], [@{$data->{g}}], [@{$data->{c}}],
			 -x1=>"A", -x2=>"T",
			 -x3=>"G", -x4=>"C",
			 -filename => $filename,
			 -title => "Base Contents Around Start/Stop Codons"
			 );
	msg_gimv("graph/$filename") if($output eq "show");
    }elsif ($output eq "f"){
	open(OUT, '>data/' . $filename);
	print OUT "position,A,T,G,C\n";
	
	for ($i = 0; $i < $length * 4 + 6 + $gap; $i ++){
	    printf OUT "%d,%3.2f,%3.2f,%3.2f,%3.2f\n", $i + 1, 
	    $data->{a}[$i], $data->{t}[$i], $data->{g}[$i], $data->{c}[$i];
	}
	close(OUT);
    }
}





=head2 filter_cds_by_atg

  Name: filter_cds_by_atg   -   filter CDS by the presence of ATG near start codon

  Description:
    This method filters out the list of CDSs in a given genome, when
    ATG is present near the start codon within the same coding frame.
    Returns the list of CDSs that do not have nearby ATG, and sets
    $genome->{$cds}->{on} to 0 for all CDSs that contain nearby ATG.

 Usage: 
    @list_of_feature_ids = filter_cds_by_atg($gb)

 Options:
   -upstream      length of upstream region (default:50)
   -downstream    length of downstream region (default:50)
   -codon         start codon to search (default: "atg")

  Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
          originally by Koya Mori, but re-written.

  History:
    20070914-01 rewrite by Kazuharu Arakawa. renamed and moved to G::Seq::Util
    20010623-01 initial posting by Koya Mori as "eliminate_atg"

=cut


sub filter_cds_by_atg{
    opt_default(upstream=>50, downstream=>50, codon=>"atg");
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);

    my $upstream = int(opt_val("upstream") / 3);
    my $downstream = int(opt_val("downstream") / 3);
    my $codon = opt_val("codon");
    die ("codon must be 3 letters") if (length($codon) != 3);

    foreach my $cds ($gb->cds()){
	my $seq = $gb->around_startcodon($cds, $upstream * 3, $downstream * 3, "without");

	for(my $j = 0; $j <= length($seq) - 3; $j +=3){
	    if(substr($seq, $j, 3) eq $codon){
		$gb->{$cds}->{on} = 0;
		last;
	    }
	}
    }

    return $gb->cds();
}



=head2 refseq2gbk

  Name: refseq2gbk   -   convert RefSeq and GenBank IDs of prokaryotes 

  Description:
    This method converts the RefSeq and GenBank IDs of prokaryotic genomes
    based on the list provided by NCBI at
    ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/prokaryotes.txt
    Version suffix is ignores. For example, NC_000913.1 is converted to U00096.

 Usage: 
    $newid = refseq2gbk($id);

 Options:
    None.

  Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20140512-01 initial posting.

=cut



sub refseq2gbk{
    my $id = shift;

    $id =~ s/\.\d+//g;

    require LWP::Simple;

    my $url = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/prokaryotes.txt';
    my $dir = _sdb_path() . '/data/prokaryote.txt';
    my $rc = LWP::Simple::mirror($url, $dir);
    die("refseq2gbk: cannot retrieve data from NCBI.") unless(-e $dir);

    my %refseq2gbk;
    for my $line (readFile($dir)){
	my @F = split(/\t/, $line);

	$F[8] =~ s/-//g;
	$F[9] =~ s/-//g;

	$F[8] =~ s/\.\d+//g;
	$F[9] =~ s/\.\d+//g;

	if($F[8] =~ /,/ || $F[9] =~ /,/){
	    
	    my $i = 0;
	    my @right = split(/,/, $F[9]);
	    for my $tmp (split(/,/, $F[8])){
		$refseq2gbk{$tmp} = $right[$i];
		$refseq2gbk{$right[$i]} = $tmp;
		$i ++;
	    }
	}else{
	    $refseq2gbk{$F[8]} = $F[9];
	    $refseq2gbk{$F[9]} = $F[8];
	}
    }

#    p %refseq2gbk;

    return $refseq2gbk{$id};
}


1;
