#!/usr/bin/env perl

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# This file is part of G-language Genome Analysis Environment package
#
#     Copyright (C) 2001-2007 Keio University
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# 
#   $Id: ORF.pm,v 1.2 2002/07/30 17:40:56 gaou Exp $
#
# G-language GAE is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
# 
# G-language GAE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with G-language GAE -- see the file COPYING.
# If not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# 
#END_HEADER
#

package G::Seq::ORF;

use strict;
use warnings;

use SubOpt;
use G::Seq::Util;
use G::Messenger;

# The subroutines of this module are stored below __DATA__ and compiled on
# their first call by SelfLoader, which exports its AUTOLOAD into this
# package.
use SelfLoader;

require Exporter;

# Only Exporter is inherited.  AutoLoader used to be listed here as well,
# but it is never loaded and plays no part in a SelfLoader based module.
our @ISA = qw(Exporter);

# Functions exported into the caller's namespace by default.
our @EXPORT = qw(
    longest_ORF
    find_identical_gene
    pseudo_atg
);
our @EXPORT_OK;
our $VERSION = '0.01';


__DATA__

#::::::::::::::::::::::::::::::
#        Methods Start
#::::::::::::::::::::::::::::::


# longest_ORF ver.20010829-01
# Author: Kazuharu Arakawa
# Usage: &longest_ORF(pointer G instance); 
# Options:
# -output   "f" for file output
# -length   minimum number of amino acids in ORF (default:20)
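# Description:
#  Searches both strands of the sequence for ORFs that start with atg and
#  end with tag, taa or tga, and reports those longer than -length.
#  With -output "f" the ORFs are also written to longestORF.csv.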
# Requirements:
#  SubOpt.pm

sub longest_ORF {
    # Reports the open reading frames (ORFs) found on both strands of the
    # sequence held by a G instance.
    #
    # Arguments: a G instance (anything opt_as_gb accepts), then options
    #   -length  minimum number of amino acids in a reported ORF (default 20)
    #   -output  "f" additionally writes the ORFs to longestORF.csv
    # Returns:   nothing; results are sent through msg_send (and to the file).
    #
    # An ORF starts at an 'atg' and runs to the first stop codon (tag, taa
    # or tga) in the same reading frame.  An 'atg' lying inside an ORF that
    # was already found in its own frame is skipped, because it could only
    # give a shorter ORF with the same stop codon.
    #
    # Reported columns: running number, start codon, stop codon, length in
    # bases (stop codon included), start and end as 1-based coordinates on
    # the original sequence, and the strand ("direct" or "complement").
    opt_default(length => 20);
    my @args = opt_get(@_);
    my $gb   = opt_as_gb(shift @args);

    my $min_len    = opt_val("length");
    my $output     = opt_val("output");
    my $to_file    = (defined $output && $output eq "f");
    my $genome_len = length($gb->{SEQ});
    my $count      = 0;

    # $out stays undefined when no file was requested or it could not be
    # opened, so nothing is ever printed to an unopened handle.
    my $out;
    if ($to_file) {
        if (open(my $fh, '>', 'longestORF.csv')) {
            $out = $fh;
            print {$out} "number, start codon, stop codon, length, start, end,",
                         "direction\n";
        }
        else {
            msg_error($!);
        }
    }

    for my $strand (0, 1) {
        # NOTE(review): the coordinate mapping below assumes _complement
        # returns the reverse complement -- confirm against its definition.
        my $seq       = $strand ? _complement($gb->{SEQ}) : $gb->{SEQ};
        my $direction = $strand ? 'complement' : 'direct';
        my $seq_len   = length($seq);

        # For each of the three reading frames: index of the last base of
        # the most recent ORF found in that frame.
        my @frame_end = (-1, -1, -1);

        # Start at -1 so that an 'atg' at index 0 is found as well.
        my $start = -1;
        while (0 <= ($start = index($seq, 'atg', $start + 1))) {
            my $frame = $start % 3;
            next if $start < $frame_end[$frame];
            $count++;

            # Walk codon by codon from the start codon to the first stop.
            # $end is the index of the last base of the stop codon.
            my ($end, $stopcodon);
            for (my $pos = $start + 3; $pos + 3 <= $seq_len; $pos += 3) {
                my $codon = substr($seq, $pos, 3);
                if ($codon eq 'tag' || $codon eq 'taa' || $codon eq 'tga') {
                    $stopcodon = $codon;
                    $end       = $pos + 2;
                    last;
                }
            }

            if (!defined $end) {
                # No stop codon up to the end of the sequence: no later
                # 'atg' in this frame can close either, so skip them all.
                $frame_end[$frame] = $seq_len;
                next;
            }
            $frame_end[$frame] = $end;

            # The length includes the stop codon, so "more than -length
            # codons" means "at least -length amino acids".
            my $orf_len = $end - $start + 1;
            next unless $orf_len / 3 > $min_len;

            # Index p on the reverse strand is base (length - p) of the
            # original sequence in 1-based coordinates.
            my ($from, $to) = $strand
                ? ($genome_len - $end, $genome_len - $start)
                : ($start + 1, $end + 1);

            msg_send(sprintf("%5d: atg %s %5d %7d %7d  %s\n",
                             $count, $stopcodon, $orf_len,
                             $from, $to, $direction));

            if ($out) {
                printf {$out} "%d,atg,%s,%d,%d,%d,%s\n",
                    $count, $stopcodon, $orf_len, $from, $to, $direction;
            }
        }
    }

    if ($out) {
        # Buffered write errors only show up when the handle is closed.
        close($out) or msg_error($!);
    }
    return;
}


# pseudo_atg
# Usage: (pointer array before, pointer array after) =
#        &pseudo_atg(pointer G instance);
# Options:
# -length   number of flanking bases collected on each side (default:10)
# Description:
#  For every CDS, looks for 'atg' codons inside the gene that are in frame
#  with the start codon (the start codon itself is not included) and
#  collects the bases immediately before and immediately after each of
#  them.  The flanks are cut short at the ends of the genome sequence.
#  Entry n of the first array and entry n of the second array belong to
#  the same 'atg'.
# Requirements:
#  SubOpt.pm

sub pseudo_atg {
    opt_default(length => 10);
    my @args = opt_get(@_);
    my $gb   = opt_as_gb(shift @args);
    my $len  = opt_val("length");
    my (@before, @after);

    # CDS entries are numbered from 1 without gaps; stop at the first
    # missing or empty one.  (The former defined(%hash) test is a fatal
    # error since Perl 5.22.)
    for (my $i = 1; $gb->{"CDS$i"} && %{ $gb->{"CDS$i"} }; $i++) {
        my $cds      = $gb->{"CDS$i"};
        my $seq      = $gb->get_geneseq("CDS$i");
        my $cdsstart = $cds->{start} - 1;    # 0-based index of first base
        my $cdsend   = $cds->{end} - 1;      # 0-based index of last base

        # Searching from index 3 onwards leaves out the start codon.
        my $start = 2;
        while (0 <= ($start = index($seq, 'atg', $start + 1))) {
            next unless ($start % 3 == 0);

            my ($bef, $aft);
            if ($cds->{direction} eq 'direct') {
                # Genome index of the 'a' of this atg.
                my $atg      = $cdsstart + $start;
                my $bef_from = $atg < $len ? 0 : $atg - $len;

                $bef = substr($gb->{SEQ}, $bef_from, $atg - $bef_from);
                # substr returns only the part inside the string, so no
                # special case is needed at the end of the genome.
                $aft = substr($gb->{SEQ}, $atg + 3, $len);
            }
            else {
                # Gene index s corresponds to genome index $cdsend - s, so
                # the atg occupies genome indices $atg_hi - 2 .. $atg_hi.
                # NOTE(review): assumes get_geneseq returns the gene in its
                # own reading direction and _complement returns the reverse
                # complement -- confirm.
                my $atg_hi = $cdsend - $start;

                # Bases before the atg in gene direction lie above it on
                # the genome.
                $bef = _complement(substr($gb->{SEQ}, $atg_hi + 1, $len));

                # Bases after the atg in gene direction lie below it on the
                # genome and end directly next to it ($aft_end is exclusive).
                my $aft_end  = $atg_hi - 2;
                my $aft_from = $aft_end < $len ? 0 : $aft_end - $len;
                $aft = _complement(substr($gb->{SEQ}, $aft_from,
                                          $aft_end - $aft_from));
            }

            push(@before, $bef);
            push(@after,  $aft);
        }
    }
    return (\@before, \@after);
}


# find_identical_gene ver.20010829-01
# Author: Kazuharu Arakawa
# Usage: pointer array identical = &find_identical_gene(pointer G instance1,
#        pointer G instance2);
# Options:
#  -print  1 for standard output, and 0 for none.
# Description:
#   This program finds identical genes in 2 genomes.
#   Returns a reference to an array of CDS numbers of genome 1; a number
#   is listed once for every CDS of genome 2 that has the same sequence.
# Requirements:
#   SubOpt.pm

sub find_identical_gene {
    # Finds the CDSs of genome 1 whose gene sequence is identical to the
    # gene sequence of a CDS of genome 2.
    #
    # Arguments: two G instances (anything opt_as_gb accepts), then options
    #   -print  true to report every match through msg_send
    # Returns:   reference to an array of CDS numbers of genome 1, in
    #            ascending order; a number appears once for every matching
    #            CDS of genome 2.
    my @args = opt_get(@_);
    my $gb1  = opt_as_gb(shift @args);
    my $gb2  = opt_as_gb(shift @args);
    my $verbose = opt_val("print");

    # Index genome 2 once: gene sequence => CDS numbers in ascending order.
    # This replaces re-reading every CDS of genome 2 for every CDS of
    # genome 1.  CDS entries are numbered from 1 without gaps; the scan
    # stops at the first missing or empty one.  (The former defined(%hash)
    # test is a fatal error since Perl 5.22.)
    my %cds2_with_seq;
    for (my $n = 1; $gb2->{"CDS$n"} && %{ $gb2->{"CDS$n"} }; $n++) {
        push @{ $cds2_with_seq{ $gb2->get_geneseq("CDS$n") } }, $n;
    }

    my @identical = ();
    for (my $cds1 = 1; $gb1->{"CDS$cds1"} && %{ $gb1->{"CDS$cds1"} }; $cds1++) {
        my $matches = $cds2_with_seq{ $gb1->get_geneseq("CDS$cds1") };
        next unless $matches;

        for my $cds2 (@$matches) {
            push(@identical, $cds1);
            next unless $verbose;

            my $query = $gb1->{"CDS$cds1"};
            my $hit   = $gb2->{"CDS$cds2"};

            msg_send(sprintf("[%d:%d-%d] ", $cds1,
                             $query->{start}, $query->{end}));

            # Location of the genome 2 copy.
            if ($hit->{"direction"} eq "complement") {
                msg_send(sprintf("complement(%d..%d)\n",
                                 $hit->{start}, $hit->{end}));
            }
            else {
                msg_send(sprintf("%d..%d\n", $hit->{start}, $hit->{end}));
            }

            # Gene name and product come from the feature entry the
            # genome 2 CDS points to; either may be absent.
            my $feature = $gb2->{ "FEATURE" . $hit->{feature} } || {};
            msg_send(sprintf("           gene=\"%s\"\n", $feature->{gene}))
                if $feature->{gene};
            msg_send(sprintf("           product=\"%s\"\n",
                             $feature->{product}))
                if $feature->{product};
        }
    }
    return \@identical;
}


sub DESTROY {
    # Destructor that releases nothing: it only takes the object off the
    # argument list and hands it back.
    # NOTE(review): presumably kept so that object destruction finds a real
    # DESTROY instead of going through AUTOLOAD -- confirm.
    my ($self) = shift @_;
    return $self;
}

1;
__END__
# Below is the stub of documentation for your module. You better edit it!

=head1 NAME

G::Seq::ORF - ORF search and gene comparison functions for the G-language Genome Analysis Environment

=head1 SYNOPSIS

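  use G::Seq::ORF;

  # report ORFs on both strands; -output "f" also writes longestORF.csv
  longest_ORF($gb, -length => 20, -output => "f");

  # bases before and after the in-frame internal atg codons of every CDS
  my ($before, $after) = pseudo_atg($gb, -length => 10);

  # CDS numbers of $gb1 whose sequence is also a CDS of $gb2
  my $identical = find_identical_gene($gb1, $gb2, -print => 1);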

=head1 DESCRIPTION

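G::Seq::ORF provides three functions that work on G-language genome
instances. All of them are exported by default.

=over 4

=item longest_ORF

Searches both strands of the sequence for ORFs that start with atg and end
with tag, taa or tga. ORFs longer than C<-length> (default 20) are reported,
and with C<-output "f"> they are also written to F<longestORF.csv>.

=item pseudo_atg

For every CDS, collects the C<-length> (default 10) bases before and after
each atg inside the gene that is in frame with the start codon. Returns two
array references, one for the preceding and one for the following bases.

=item find_identical_gene

Compares the CDS sequences of two genomes and returns a reference to an
array of the CDS numbers of the first genome that have an identical CDS in
the second genome. With C<-print> set, every match is also reported.

=back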

=head1 AUTHOR

Kazuharu Arakawa

=head1 SEE ALSO

perl(1).

=cut
