#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# This file is part of G-language Genome Analysis Environment package
#
#     Copyright (C) 2001-2013 Keio University
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# 
#   $Id: Align.pm,v 1.2 2002/07/30 17:40:56 gaou Exp $
#
# G-language GAE is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
# 
# G-language GAE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with G-language GAE -- see the file COPYING.
# If not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# 
#END_HEADER
#

package G::Seq::AminoAcid;

use SubOpt;
use G::Messenger;

use strict;
use base qw(Exporter);
use SelfLoader;

our @EXPORT = qw(
		 calc_pI
		 amino_info
		 one2three
		 three2one
);

__DATA__

#:::::::::::::::::::::::::::::::::
#       Perldoc
#:::::::::::::::::::::::::::::::::

=head1 NAME

    G::Seq::AminoAcid - Analysis methods related to amino acids.

=head1 DESCRIPTION

    This class is a part of G-language Genome Analysis Environment, 
    collecting sequence analysis methods related to amino acids.

=cut

#:::::::::::::::::::::::::::::::::
#       Let the code begin...
#:::::::::::::::::::::::::::::::::



=head2 G::Seq::AminoAcid::charge

  Name: G::Seq::AminoAcid::charge   -   returns the charge of a given amino acid

  Description:
    This method returns the charge of a given amino acid.

  Usage: 
    $charge = G::Seq::AminoAcid::charge($aminoacid);
    eg. G::Seq::AminoAcid::charge('A'); # returns 0

  Options:
    none

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20010404-01 initial posting

=cut

sub charge {
    # Look up the charge value this module assigns to a one-letter
    # amino acid code.
    #
    # Argument : a one-letter code (matched case-sensitively), or nothing.
    # Returns  : the table entry for that code (undef if the code is not
    #            listed) when a non-empty argument is given; otherwise
    #            the whole code => charge table as a flat list.
    my ($residue) = @_;

    my %charge_of = (
        # negative entries
        D => -1,   E => -1,
        # positive entries
        K => 1,    R => 1,    H => 0.5,
        # ambiguity codes B and Z carry half a negative charge, X none
        B => -0.5, Z => -0.5, X => 0,
        # all remaining codes are listed with zero charge
        A => 0, C => 0, F => 0, G => 0, I => 0,
        L => 0, M => 0, N => 0, P => 0, Q => 0,
        S => 0, T => 0, V => 0, W => 0, Y => 0,
    );

    # No (or empty) key: the caller wants the full table.
    return %charge_of unless length $residue;

    return $charge_of{$residue};
}

sub pK {
    # Look up a pK value from this module's table.
    #
    # Argument : one of the keys C, D, E, H, K, R, Y, N_term, C_term
    #            (matched case-sensitively), or nothing.
    # Returns  : the pK stored for that key (undef if the key is not
    #            listed) when a non-empty argument is given; otherwise
    #            the whole key => pK table as a flat list.
    my ($group) = @_;

    my %pK_of;
    # residue entries
    @pK_of{qw(C   D   E   H   K    R    Y)}
        =    (8.5, 3.9, 4.1, 6.5, 10.8, 12.5, 10.1);
    # chain termini
    @pK_of{qw(N_term C_term)} = (8.6, 3.6);

    # No (or empty) key: the caller wants the full table.
    return %pK_of unless length $group;

    return $pK_of{$group};
}

sub dayhoff {
    # Look up the Dayhoff reference value for a one-letter amino acid
    # code.  amino_info divides a residue's mole fraction by this value
    # to produce its "DayhoffStat" column.
    #
    # Argument : a one-letter code (matched case-sensitively), or nothing.
    # Returns  : the table entry for that code (undef if the code is not
    #            listed) when a non-empty argument is given; otherwise
    #            the whole code => value table as a flat list.
    my ($residue) = @_;

    my %dayhoff_of = (
        A => 8.6,
        B => 0.0,
        C => 2.9,
        D => 5.5,
        E => 6.0,
        F => 3.6,
        G => 8.4,
        H => 2.0,
        I => 4.5,
        K => 6.6,
        L => 7.4,
        M => 1.7,
        N => 4.3,
        P => 5.2,
        Q => 3.9,
        R => 4.9,
        S => 7.0,
        T => 6.1,
        V => 6.6,
        W => 1.3,
        X => 0.0,
        Y => 3.4,
        Z => 0.0,
    );

    # No (or empty) key: the caller wants the full table.
    return %dayhoff_of unless length $residue;

    return $dayhoff_of{$residue};
}

=head2 G::Seq::AminoAcid::residue

  Name: G::Seq::AminoAcid::residue   -   returns the mass of a given amino acid residue

  Description:
    This method returns the mass of a given amino acid residue.

  Usage: 
    $mass = G::Seq::AminoAcid::residue($aminoacid);
    eg. G::Seq::AminoAcid::residue('A'); # returns 71.0786

  Options:
    none

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20010404-01 initial posting

=cut

sub residue{
    # Look up the residue mass for a one-letter amino acid code.
    #
    # Argument : a one-letter code (matched case-sensitively), or nothing.
    # Returns  : the table entry for that code (undef if the code is not
    #            listed) when a non-empty argument is given; otherwise
    #            the whole code => mass table as a flat list.
    my ($code) = @_;

    my %residue_mass_of = (
        A => 71.0786,
        B => 114.5960,
        C => 103.1386,
        D => 115.0884,
        E => 129.1152,
        F => 147.1762,
        G => 57.0518,
        H => 137.1408,
        I => 113.1590,
        K => 128.1736,
        L => 113.1590,
        M => 131.1926,
        N => 114.1036,
        P => 97.1164,
        Q => 128.1304,
        R => 156.1870,
        S => 87.0780,
        T => 101.1048,
        V => 99.1322,
        W => 186.2128,
        X => 144.0000,
        Y => 163.1756,
        Z => 128.6228,
    );

    # No (or empty) key: the caller wants the full table.
    return %residue_mass_of unless length $code;

    return $residue_mass_of{$code};
}

=head2 G::Seq::AminoAcid::mass

  Name: G::Seq::AminoAcid::mass   -   returns the mass of a given amino acid

  Description:
    This method returns the mass of a given amino acid.

  Usage: 
    $mass = G::Seq::AminoAcid::mass($aminoacid);
    eg. G::Seq::AminoAcid::mass('A'); # returns 89.09

  Options:
    none

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20010404-01 initial posting

=cut

sub mass{
    # Look up the mass of a (free) amino acid by one-letter code.
    #
    # Argument : a one-letter code (matched case-sensitively), or nothing.
    # Returns  : the table entry for that code (undef if the code is not
    #            listed -- note that B has no entry) when a non-empty
    #            argument is given; otherwise the whole code => mass
    #            table as a flat list.
    my ($code) = @_;

    my %mass_of = (
        A => 89.09,
        C => 121.16,
        D => 133.10,
        E => 147.13,
        F => 165.19,
        G => 75.07,
        H => 155.16,
        I => 131.17,
        K => 146.19,
        L => 131.17,
        M => 149.21,
        N => 132.12,
        P => 115.13,
        Q => 146.15,
        R => 174.20,
        S => 105.09,
        T => 119.12,
        V => 117.15,
        W => 204.22,
        Y => 181.19,
        # ambiguity codes share one placeholder value
        X => 146,
        Z => 146,
    );

    # No (or empty) key: the caller wants the full table.
    return %mass_of unless length $code;

    return $mass_of{$code};
}

=head2 G::Seq::AminoAcid::hydropathy

  Name: G::Seq::AminoAcid::hydropathy   -   returns the hydropathy of given amino acid

  Description:
    This method returns the hydropathy of a given amino acid.

  Usage: 
    $hydropathy = G::Seq::AminoAcid::hydropathy($aminoacid);
    eg. G::Seq::AminoAcid::hydropathy('A'); # returns 1.8;

  Options:
    none

  Author: 
    Haruo Suzuki (hs568@cornell.edu)

  References:
   1. Kyte J, Doolittle RF (1982) "A simple method for displaying the 
      hydropathic character of a protein", J Mol Biol, 157(1):105-32. 

  History:
    20091204-01 initial posting

=cut

sub hydropathy{
    # Look up the hydropathy value for a one-letter amino acid code.
    #
    # Argument : a one-letter code (matched case-sensitively), or nothing.
    # Returns  : the table entry for that code (undef if the code is not
    #            listed -- only the twenty codes below are, so B, X and Z
    #            have no entry) when a non-empty argument is given;
    #            otherwise the whole code => hydropathy table as a flat
    #            list.
    my ($code) = @_;

    my %hydropathy_of = (
        A =>  1.8,
        C =>  2.5,
        D => -3.5,
        E => -3.5,
        F =>  2.8,
        G => -0.4,
        H => -3.2,
        I =>  4.5,
        K => -3.9,
        L =>  3.8,
        M =>  1.9,
        N => -3.5,
        P => -1.6,
        Q => -3.5,
        R => -4.5,
        S => -0.8,
        T => -0.7,
        V =>  4.2,
        W => -0.9,
        Y => -1.3,
    );

    # No (or empty) key: the caller wants the full table.
    return %hydropathy_of unless length $code;

    return $hydropathy_of{$code};
}

=head2 one2three

  Name: one2three   -   converts one letter amino acid code to three letter code

  Description:
    This method converts one letter amino acid code to three letter code.

  Usage: 
    $amino = one2three($aminoacid);
    eg. one2three('A'); # returns 'Ala';

  Options:
    none

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20101105-01 added support for amino acid sequence
    20010404-01 initial posting

=cut

sub one2three{
    # Translate one-letter amino acid codes into three-letter codes.
    #
    # Argument : a string of one-letter codes (any case; it is
    #            upper-cased before lookup), or nothing.
    # Returns  : the concatenated three-letter codes when a non-empty
    #            argument is given -- characters without a table entry
    #            contribute nothing to the result; otherwise the whole
    #            one-letter => three-letter table as a flat list.
    my ($sequence) = @_;

    my %three_for = (
        A => "Ala",
        B => "Asx",
        C => "Cys",
        D => "Asp",
        E => "Glu",
        F => "Phe",
        G => "Gly",
        H => "His",
        I => "Ile",
        K => "Lys",
        L => "Leu",
        M => "Met",
        N => "Asn",
        P => "Pro",
        Q => "Gln",
        R => "Arg",
        S => "Ser",
        T => "Thr",
        U => "Sec",
        V => "Val",
        W => "Trp",
        X => "Xaa",
        Y => "Tyr",
        Z => "Glx",
        '/' => "TER",
    );

    # No (or empty) sequence: the caller wants the full table.
    return %three_for unless length $sequence;

    my @letters = split(//, uc($sequence));
    return join('', map { $three_for{$_} } @letters);
}

=head2 three2one

  Name: three2one   -   converts three letter amino acid code to one letter code

  Description:
    This method converts three letter amino acid code to one letter code.

  Usage: 
    $amino = three2one($aminoacid);
    eg. three2one('Ala'); # returns 'A';

 Options:
    none

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20101105-01 added support for amino acid sequence
    20010404-01 initial posting

=cut

sub three2one{
    # Translate three-letter amino acid codes into one-letter codes.
    #
    # Argument : a string of three-letter codes, optionally separated by
    #            other characters (e.g. 'AlaGly', 'Ala-Gly', 'ALA GLY'),
    #            or nothing.
    # Returns  : the concatenated one-letter codes when a non-empty
    #            argument is given; otherwise the three-letter =>
    #            one-letter table (the reverse of one2three()'s table) as
    #            a flat list.
    #
    # The string is scanned left to right.  Whenever the three characters
    # at the current position form a known code, its one-letter
    # equivalent is emitted and the scan jumps past the whole code;
    # otherwise the scan advances by a single character, so separators
    # and unrecognised text are skipped.  Codes are matched
    # case-insensitively, mirroring one2three(), which upper-cases its
    # input.
    my $key = shift;

    return reverse one2three() unless (length $key);

    # Build the lookup once, keyed by the upper-cased three-letter code.
    my %one2three = one2three();
    my %one_for;
    foreach my $one (keys %one2three){
        $one_for{uc($one2three{$one})} = $one;
    }

    my $ret = '';
    my $len = length($key);
    my $pos = 0;

    while($pos < $len){
        my $one = $one_for{uc(substr($key, $pos, 3))};

        if(defined $one){
            $ret .= $one;
            $pos += 3;    # consume the whole code, do not re-test its tail
        }else{
            $pos ++;      # separator or unknown character: slide by one
        }
    }

    return $ret;
}


=head2 calc_pI

  Name: calc_pI   -   calculates the isoelectric point of given amino acid sequence

  Description:
    This method calculates the isoelectric point of given amino acid sequence.

  Usage: 
    $pI = calc_pI($aminoseq);

 Options:
    none

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20010404-01 initial posting

=cut

sub calc_pI{
    # Estimate the isoelectric point of an amino acid sequence.
    #
    # Argument : whatever opt_as_gb() accepts; the sequence is read from
    #            the resulting object's {SEQ} field and upper-cased
    #            before counting.
    # Returns  : the pH at which the search below stops.
    #
    # The net charge at a trial pH is the sum of the positive
    # contributions (H, K, R and the N-terminus) minus the negative ones
    # (C, D, E, Y and the C-terminus), each weighted by the number of
    # such residues and using the pK values from pK().  The trial pH
    # starts at 7.0 and is moved up when the net charge is positive and
    # down otherwise, by a step that starts at 3.5 and halves every
    # iteration.  The search stops as soon as the net charge, rounded to
    # five decimals, equals that of the previous iteration (the initial
    # "previous" value is 0).
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);
    my %pK = pK();

    # Upper-case once, then count each titratable residue.  tr/// with an
    # empty replacement list only counts; it does not modify $seq.
    my $seq = uc($gb->{SEQ});
    my %num = (
        C => ($seq =~ tr/C//),
        D => ($seq =~ tr/D//),
        E => ($seq =~ tr/E//),
        H => ($seq =~ tr/H//),
        K => ($seq =~ tr/K//),
        R => ($seq =~ tr/R//),
        Y => ($seq =~ tr/Y//),
    );

    # ratio / (1 + ratio) with ratio = 10 ** (first - second).
    # Kept as a lexical closure so no package-level symbol is touched.
    my $fraction = sub {
        my ($first, $second) = @_;
        my $cr = 10 ** ($first - $second);
        return $cr / (1 + $cr);
    };

    my $pI = 7.0;
    my $step = 3.5;
    my $tmpcharge = 0;

    while(1){
        my $charge =
          - $fraction->($pI, $pK{C_term})
          - $num{C} * $fraction->($pI, $pK{C})
          - $num{D} * $fraction->($pI, $pK{D})
          - $num{E} * $fraction->($pI, $pK{E})
          - $num{Y} * $fraction->($pI, $pK{Y})
          + $num{H} * $fraction->($pK{H}, $pI)
          + $num{K} * $fraction->($pK{K}, $pI)
          + $num{R} * $fraction->($pK{R}, $pI)
          + $fraction->($pK{N_term}, $pI);

        # Converged: the charge no longer changes at five decimals.
        last if(sprintf("%.5f", $charge) == sprintf("%.5f", $tmpcharge));

        if($charge > 0){
            $pI += $step;
        }else{
            $pI -= $step;
        }

        $tmpcharge = $charge;
        $step /= 2;
    }

    return $pI;
}


    
=head2 amino_info

  Name: amino_info   -   prints out basic amino acid sequence statistics

  Description:
    This method prints out basic compositional statistics of the 
    given amino acid sequence in a human readable manner.

    Displayed information includes the following:
      Molecular weight, number of residues, average residue weight, charge,
      isoelectric point, number/mole/DayhoffStat of each amino acid, and
      percentage of Tiny (A+C+G+S+T), Small (A+B+C+D+G+N+P+S+T+V), 
      Aliphatic (I+L+V), Aromatic (F+H+W+Y), Non-polar (A+C+F+G+I+L+M+P+V+W+Y), 
      Polar (D+E+H+K+N+Q+R+S+T+Z), Charged (B+D+E+H+K+R+Z), Basic (H+K+R),
      Acidic (B+D+E+Z) residues.

  Usage: 
    null = amino_info($sequence);

 Options:
    none

  Author: 
    Misa Nakanishi (dora@g-language.org)
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20010404-01 initial posting

=cut

sub amino_info{
    # Report basic composition statistics of an amino acid sequence via
    # msg_send().
    #
    # Argument : whatever opt_as_gb() accepts; the sequence is read from
    #            the resulting object's {SEQ} field and its name from
    #            {LOCUS}{id}.
    # Returns  : the return value of the final msg_send() call (nothing
    #            when the sequence is empty).
    #
    # Output, in order: a header with molecular weight, residue count,
    # average residue weight, summed charge and isoelectric point; one
    # row per distinct character with its count, mole percentage and
    # DayhoffStat (mole fraction divided by the dayhoff() value, 0 when
    # that value is missing or zero); and one row per property class
    # with the combined count and mole percentage of its members.
    # Characters are upper-cased before counting.
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);

    # NOTE(review): this value is not used below; the call is kept
    # unchanged in case opt_val() does more than read -- confirm.
    my $output = opt_val("output");

    my $len = length($gb->{SEQ});

    # Every percentage below divides by the sequence length, so an empty
    # sequence is reported and skipped instead of dying on division by
    # zero.
    unless($len){
        msg_send("amino_info: sequence is empty, nothing to report\n");
        return;
    }

    my %amino;
    my ($mw, $chrg);
    my %charge = charge();
    my %mass = mass();
    my %dayhoff = dayhoff();
    my %one2three = one2three();

    foreach my $char (split(//, $gb->{SEQ})){
        my $aa = uc($char);
        $amino{$aa}++;
        $mw += $mass{$aa};
        $chrg += $charge{$aa};
    }

    msg_send(sprintf("AminoInfo of %s from 1 to %d\n\n",
                     $gb->{LOCUS}->{id}, $len));
    msg_send(sprintf("Molecular weight = %.2f Residues = %d\n",
                     "$mw", $len));
    msg_send(sprintf("Average Weight = %.2f Charge = %s\n",
                     $mw / $len, "$chrg"));
    msg_send(sprintf("Isoelectric Point = %.4f\n",
                     calc_pI($gb->{SEQ})));

    # Per-residue table.
    my $row = "%-15.15s %-15.15s %-15.15s %15.15s\n";
    msg_send(sprintf($row, "Residue", "Number", "Mole\%", "DayhoffStat"));

    foreach my $char (sort keys %amino){
        my $fraction = $amino{$char} / $len;

        msg_send(sprintf($row,
                         sprintf("%s = %s", $char, $one2three{$char}),
                         $amino{$char},
                         sprintf("%.3f", $fraction * 100),
                         sprintf("%.3f", $dayhoff{$char} ? ($fraction / $dayhoff{$char} * 100) : 0)
                 ));
    }

    # Property-class table: each entry is the row's format string
    # followed by the one-letter codes whose counts are added together.
    msg_send("\nProperty        Residues                Number          Mole\%\n");

    my @classes = (
        ["Tiny            (A+C+G+S+T)             %-15.15s %.3f\n", qw(A C G S T)],
        ["Small           (A+B+C+D+G+N+P+S+T+V)   %-15.15s %.3f\n", qw(A B C D G N P S T V)],
        ["Aliphatic       (I+L+V)                 %-15.15s %.3f\n", qw(I L V)],
        ["Aromatic        (F+H+W+Y)               %-15.15s %.3f\n", qw(F H W Y)],
        ["Non-polar       (A+C+F+G+I+L+M+P+V+W+Y) %-15.15s %.3f\n", qw(A C F G I L M P V W Y)],
        ["Polar           (D+E+H+K+N+Q+R+S+T+Z)   %-15.15s %.3f\n", qw(D E H K N Q R S T Z)],
        ["Charged         (B+D+E+H+K+R+Z)         %-15.15s %.3f\n", qw(B D E H K R Z)],
        ["Basic           (H+K+R)                 %-15.15s %.3f\n", qw(H K R)],
        ["Acidic          (B+D+E+Z)               %-15.15s %.3f\n", qw(B D E Z)],
    );

    my $sent;
    foreach my $class (@classes){
        my ($format, @members) = @$class;

        # Codes that never occurred have no entry in %amino; count them as 0.
        my $count = 0;
        foreach my $aa (@members){
            $count += ($amino{$aa} || 0);
        }

        $sent = msg_send(sprintf($format, $count, $count / $len * 100));
    }

    return $sent;
}

 
1;

