#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# This file is part of G-language Genome Analysis Environment package
#
#     Copyright (C) 2001-2014 Keio University
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# 
#   $Id: G::IO::Handler.pm,v 1.1 2002/07/30 17:44:27 gaou Exp $
#
# G-language GAE is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
# 
# G-language GAE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with G-language GAE -- see the file COPYING.
# If not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# 
#END_HEADER
#
# written by Kazuharu Arakawa <gaou@sfc.keio.ac.jp> at
# G-language Project, Institute for Advanced Biosciences, Keio University.
#

package G::IO::Handler;

use strict;
use Carp;

use G::Messenger;
use G::Seq::Primitive;
use G::Seq::AminoAcid;
use G::DB::SDB;
use File::ShareDir ':ALL';

use autouse 'File::Temp'=>qw(tempfile);
use autouse 'LWP::Simple'=>qw(mirror);

#::::::::::::::::::::::::::::::
#          Perldoc
#::::::::::::::::::::::::::::::


=head1 NAME

 G::IO::Handler - Internal class with basic sequence manipulation methods

=head1 SYNOPSIS

 require G::IO::Handler;
 use base qw(G::IO::Handler);
   
=head1 DESCRIPTION

 Intended for internal use only. Super class for the core. Provides 
 the native methods.

=head1 AUTHOR

Kazuharu Arakawa, gaou@sfc.keio.ac.jp

=cut


#::::::::::::::::::::::::::::::
#    Let the code begin...
#::::::::::::::::::::::::::::::

#::::::::: Internal :::::::::::

sub set_gene_aliases{
    my $this = shift;

    foreach my $feat ($this->feature()){
	next unless ($this->{$feat}->{type} =~ /CDS|RNA/);
	
	if(length $this->{$feat}->{gene}){
	    $this->{$this->{$feat}->{gene}} = $this->{$feat};
	}
	
	if(length $this->{$feat}->{locus_tag}){
	    $this->{$this->{$feat}->{locus_tag}} = $this->{$feat};
	}
	
	if($this->{$feat}->{type} eq 'CDS'){
	    $this->{'CDS' . $this->{$feat}->{cds}} = $this->{$feat};
	}
    }
}

#::::::::: Class Method ::::::::::

=head2 $gb->filepath()

   Name: $gb->filepath()   -   get the location path of input file

   Description:
         Returns the file path used to load the sequence.

   REST: 
      http://rest.g-language.org/ecoli/filepath

=cut

sub filepath {
    return $_[0]->{FILENAME};
}

=head2 $gb->clone()

   Name: $gb->clone()   -   create a copy of the G instance

   Description:
         Returns cloned G instance, which is a new G instance with
         identical data. 

   REST: 
      http://rest.g-language.org/NC_000913/clone

=cut

sub clone {
    my $this = shift;
    my $sdbPath = _sdb_path();

    _set_sdb_path('/tmp');

    my $tmpfile = "GINTERNAL-" . time() . rand();
    sdb_save($this,$tmpfile);
    my $new = sdb_load($tmpfile);

    _set_sdb_path($sdbPath);
    unlink($tmpfile);

    return $new;
}


=head2 $gb->annotate()

   Name: $gb->annotate()   -   reannotate the genome using Restauro-G

   Description:
      This method reannotates the entire genome, if coding sequences are already available.
      Added features includes:
      rs_des: description of protein
      rs_xr:  database cross references, including EMBL, RefSeq, KEGG, BioCyc, BRENDA, GO, InterPro, Pfam
      rs_com: comments
  Usage: 
      $new_gb = $gb->annotate();

   REST: 
      http://rest.g-language.org/NC_000913/annotate

   References:
      1. Tamaki S, Arakawa K, Kono N, Tomita M (2007) "Restauro-G: A Rapid Genome Re-Annotation 
         System for Comparative Genomics", Genomics Proteomics Bioinformatics, 5(1): 53-58.

   Author:
      Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

   History:
      20120830-01 initial posting

=cut

sub annotate {
    my $this = shift;

    my($fh, $outfile) = tempfile(undef, SUFFIX=>'.gbk');
    $this->make_gb($outfile);

    require LWP::UserAgent;
    my $ua = LWP::UserAgent->new;
    my $id = $ua->post('http://rest.g-language.org/upload/upl.pl', 'Content_Type'=>'form-data', 'Content'=>['file'=>[$outfile]])->content;
    mirror("http://rest.g-language.org/$id/restauro", $outfile);
    my $new = new G::IO($outfile, "no cache", "no msg");
    return $new;
}

=head2 $gb->disclose()

   Name: $gb->disclose()   -   create a tab delimited list of all features

   Description:
         Returns tab delimited list of all feature table.
         Used for Bio::Glite.

   REST: 
      http://rest.g-language.org/NC_000913/disclose

=cut


sub disclose {
    my $gb = shift;

    my $return = '';
    foreach my $level1 (keys %{$gb}){
	next if ($level1 eq 'SEQ');
	next unless($level1 =~ /LOCUS|FEATURE|HEADER/);
	if(ref($gb->{$level1}) =~ /HASH/){
	    foreach my $level2 (keys %{$gb->{$level1}}){
		$return .= join("\t", $level1, $level2, $gb->{$level1}->{$level2}) . "\n";
	    }
	}else{
	    $return .= join("\t", $level1, $gb->{$level1}) . "\n";
	}
    }

    return $return;
}


=head2 $gb->find()

   Name: $gb->find()   -   search through the genome data object with keywords

   Description:
         This method provides powerful means to search within the G-language genome
         data object with keywords. Given a set of keywords, this method returns
         the list of feature IDs corresponding to the search query. In G-language Shell,
         search results are also directly printed out (you can specify -print=>1 option
         in API mode to print within your program).

         eg. @features = $gb->find('RNA', 'tyrosine');    # multiple keywords are allowed.

         Keywords can be specific to each of the feature attributes:

         eg. $gb->find(-type=>'CDS', -product=>'metabolism', 'subunit');

         Regular expressions are allowed for keywords:

         eg. $gb->find(-type=>'CDS', -EC_number=>'^2.7.');

   REST: 
      http://rest.g-language.org/NC_000913/product=metabolism/

=cut


sub find {
    my $this = shift;

    my @args = @_;
    my (@keywords, %keyhash, @results);
    my $i = 0;
    my $print = 0;

    while(defined $args[$i]){
        if (substr($args[$i], 0, 1) eq '-' && substr($args[$i], 1, 1) !~ /[0-9]/){
	    if (substr($args[$i],1) eq 'print'){
		$print = 1;
		$i +=2;
	    }else{
		$keyhash{substr($args[$i],1)} = $args[$i + 1];
		$i += 2;
	    }
        }else{
	    push(@keywords, $args[$i]);
            $i ++;
        }
    }

    foreach my $feat ($this->feature()){
	my $flag = 0;

	foreach my $key (keys %keyhash){
	    my $val = $keyhash{$key};

	    unless($this->{$feat}->{$key} =~ /$val/i){
		$flag = 1;
		last;
	    }
	}

	next if ($flag);

	foreach my $key (@keywords){
	    unless(join('%%%___%%%', values(%{$this->{$feat}})) =~ /$key/i){
		$flag = 1;
		last;
	    }
	}
    
	push(@results, $feat) unless($flag);
    }

    if(msg_ask_interface() eq 'Shell' || $print){
	foreach my $feat (@results){
	    my $gene = $this->{$feat}->{gene} || $this->{$feat}->{locus_tag} || $feat;
	    my $ec = $this->{$feat}->{EC_number};
	    $ec =~ s/\s+/,/g;
	    $ec = '(' . $ec . ')' if (length $ec);

	    msg_send(
		     sprintf("     %s\t%s\t%s\t%s..%s\t%s\t%s %s\n", $feat, $gene, $this->{$feat}->{type}, 
			     $this->{$feat}->{start}, $this->{$feat}->{end}, $this->{$feat}->{direction}, $this->{$feat}->{product}, $ec)
		     );
	}
    }

    return @results;
}


=head2 $gb->getseq()

   Name: $gb->getseq()   -   get nucleotide sequence of the given positions (Perl coordinates)

   Description:
         Given the start and end positions (starting from 0 as in Perl),
         returns the sequence specified.

         eg. $gb->getseq(2,4); # returns the 3rd, 4th, and 5th nucleotides.

   Options:
       -circular   when the first position is larger than the second position,
                   retrieves the sequece spanning across the end of the circular
                   chromosome. (ex: $gb->getseq(4639670, 5, -circular))

   REST: 
      http://rest.g-language.org/NC_000913/getseq/1/3


=cut

sub getseq {
    if (scalar(@_) < 3){
	carp("Not enough arguments.");
	return;
    }
    my $this = shift;
    my $start = shift;
    my $end = shift;
    my $option = shift;

    if($start < $end){
	return substr($this->{SEQ}, $start, $end-$start+1);
    }else{
	if($option =~ /circ/){
	    return substr($this->{SEQ}, $start) . substr($this->{SEQ}, 0, $end + 1);
	}else{
	    my ($start2, $end2) = sort {$a <=> $b} ($start, $end);
	    return substr($this->{SEQ}, $start2, $end2-$start2+1);
	}
    }
}


=head2 $gb->get_gbkseq()

   Name: $gb->get_gbkseq()   -   get nucleotide sequence of the given positions (GenBank coordinates)

   Description:
         Given the start and end positions (starting from 1 as in 
         Genbank), returns the sequence specified.

         eg. $gb->get_gbkseq(1,3); # returns the 1st, 2nd, and 3rd nucleotides.

   Options:
       -circular   when the first position is larger than the second position,
                   retrieves the sequece spanning across the end of the circular
                   chromosome. (ex: $gb->getseq(4639670, 5, -circular))

   REST: 
      http://rest.g-language.org/NC_000913/get_gbkseq/1/3

=cut

sub get_gbkseq {
    if (scalar(@_) < 3){
	carp("Not enough arguments.");
	return;
    }
    return getseq($_[0], $_[1] - 1, $_[2] - 1, $_[3]);
}


=head2 $gb->get_cdsseq()

   Name: $gb->get_cdsseq()   -   get nucleotide sequence of the given CDS

   Description:
         Given a CDS ID, returns the CDS sequence. 
         'complement' is properly parsed.

         eg. $gb->get_cdsseq('recA'); # returns the 'recA' gene sequence.

   REST: 
      http://rest.g-language.org/NC_000913/get_cdsseq/recA


=cut

sub get_cdsseq {
    if (scalar(@_) < 2){
	carp("Not enough arguments.");
	return;
    }
    my $this = shift;
    my $object = shift;

    my $cdsseq = '';

    if($this->{$object}->{start} > $this->{$object}->{end}){
	$cdsseq = substr($this->{SEQ}, $this->{$object}->{start} - 1) . 
	    $this->get_gbkseq(1, $this->{$object}->{end});
    }else{
	$cdsseq = $this->get_gbkseq($this->{$object}->{start}, 
				    $this->{$object}->{end});
    }

    $cdsseq = &complement($cdsseq) 
	if ($this->{$object}->{direction} eq 'complement');

    return $cdsseq;
}

=head2 $gb->around_startcodon()

   Name: $gb->around_startcodon()   -   get the sequence around the startcodon of the given CDS

   Description:
         Given a CDS ID, lengths before startcodon and after start codon, 
         returns the sequence around of start codon.

         eg. $gb->around_startcodon('FEATURE5239', 100, 100); 
             # returns 100 bp sequence before and after the start codon of 'CDS1',
             # including the start codon itself

   Options:
         Optional Fourth argument containing a string "without" returns only the sequence
         before and after the start codon, and without the stat codon itself.
         eg. $gb->around_startcodon('FEATURE5239', 100, 100, "without");

   REST: 
      http://rest.g-language.org/NC_000913/around_startcodon/recA/200/200


=cut


sub around_startcodon {
    if (scalar(@_) < 2){
	carp("Not enough arguments.");
	return;
    }
    my $gb = shift;
    my $cds = shift;
    my $before = shift || 0;
    my $after = shift || 0;
    my $option = shift;

    my $seq  = $gb->before_startcodon($cds, $before);
    $seq .= $gb->startcodon($cds) unless($option =~ /without/);
    $seq .= $gb->after_startcodon($cds, $after);

    return $seq;
}


=head2 $gb->around_stopcodon()

   Name: $gb->around_stopcodon()   -   get the sequence around the stopcodon of the given CDS

   Description:
         Given a CDS ID, lengths before stopcodon and after stop codon, 
         returns the sequence around of stop codon.

         eg. $gb->around_stopcodon('FEATURE5239', 100, 100); 
             # returns 100 bp sequence before and after the stop codon of 'CDS1',
             # including the stop codon itself

   Options:
         Optional Fourth argument containing a string "without" returns only the sequence
         before and after the stop codon, and without the stat codon itself.
         eg. $gb->around_stopcodon('FEATURE5239', 100, 100, "without");

   REST: 
      http://rest.g-language.org/NC_000913/around_startcodon/recA/200/200

=cut

sub around_stopcodon {
    if (scalar(@_) < 2){
	carp("Not enough arguments.");
	return;
    }
    my $gb = shift;
    my $cds = shift;
    my $before = shift || 0;
    my $after = shift || 0;
    my $option = shift;

    my $seq  = $gb->before_stopcodon($cds, $before);
    $seq .= $gb->stopcodon($cds) unless($option =~ /without/);
    $seq .= $gb->after_stopcodon($cds, $after);

    return $seq;
}


=head2 $gb->before_startcodon()

   Name: $gb->before_startcodon()   -   get the upstream sequence of the given CDS

   Description:
         Given a CDS ID and length, returns the sequence upstream of 
         start codon.

         eg. $gb->before_startcodon('CDS1', 100); 
             # returns 100 bp sequence upstream of the start codon of 'CDS1'.

   Options:
         Second argument specifying the length of sequence to retrieve is
         optional. (default: 100).

   REST: 
      http://rest.g-language.org/NC_000913/before_startcodon/recA/200


=cut


sub before_startcodon {
    if (scalar(@_) < 2){
	carp("Not enough arguments.");
	return;
    }
    my $this = shift;
    my $object = shift;
    my $length = shift || 100;

    if ($this->{$object}->{direction} eq 'complement'){
	return complement(substr($this->{SEQ}, $this->{$object}->{end}, $length));
    }else{
	my $start = $this->{$object}->{start} - 1 - $length;
	if ($start < 0){
	    $start = 0;
	    $length = $this->{$object}->{start} - 1;
	}
	return substr($this->{SEQ}, $start, $length);
    }
}

=head2 $gb->after_startcodon()

   Name: $gb->after_startcodon()   -   get the sequence downstream of start codon of the given CDS

   Description:
         Given a CDS ID and length, returns the sequence downstream of 
         start codon.

         eg. $gb->after_startcodon('CDS1', 100); 
             # returns 100 bp sequence downstream of the start codon of 'CDS1'.

   Options:
         Second argument specifying the length of sequence to retrieve is
         optional. (default: 100).

   REST: 
      http://rest.g-language.org/NC_000913/after_startcodon/recA/200

=cut

sub after_startcodon {
    if (scalar(@_) < 2){
	carp("Not enough arguments.");
	return;
    }
    my $this = shift;
    my $object = shift;
    my $length = shift || 100;

    if ($this->{$object}->{direction} eq 'complement'){
	my $start = $this->{$object}->{end} - 1 - 3 - $length + 1;
	if($start < 0){
	    $start = 0;
	    $length = $this->{$object}->{end} - 3;
	}
	return complement(substr($this->{SEQ}, $start, $length));
    }else{
	return substr($this->{SEQ}, $this->{$object}->{start} + 3 - 1, $length);
    }
}


=head2 $gb->before_stopcodon()

   Name: $gb->before_stopcodon()   -   get the sequence upstream of stop codon of the given CDS

   Description:
         Given a CDS ID and length, returns the sequence upstream of 
         stop codon.

         eg. $gb->before_stopcodon('CDS1', 100); 
             # returns 100 bp sequence upstream of the stop codon of 'CDS1'.

   Options:
         Second argument specifying the length of sequence to retrieve is
         optional. (default: 100).

   REST: 
      http://rest.g-language.org/NC_000913/before_stopcodon/recA/200


=cut


sub before_stopcodon {
    if (scalar(@_) < 2){
	carp("Not enough arguments.");
	return;
    }
    my $this = shift;
    my $object = shift;
    my $length = shift || 100;

    if ($this->{$object}->{direction} eq 'complement'){
	return complement(substr($this->{SEQ}, $this->{$object}->{start} + 3 - 1, $length));
    }else{
	my $start = $this->{$object}->{end} - 3 - 1 - $length + 1;
	if($start < 0){
	    $start = 0;
	    $length = $this->{$object}->{end} - 3;
	}
	return substr($this->{SEQ}, $start, $length);
    }
}

=head2 $gb->after_stopcodon()

   Name: $gb->after_stopcodon()   -   get the downstream sequence of the given CDS

   Description:
         Given a CDS ID and length, returns the sequence downstream of 
         stop codon.

         eg. $gb->after_stopcodon('CDS1', 100); 
             # returns 100 bp sequence downstream of the stop codon of 'CDS1'.

   Options:
         Second argument specifying the length of sequence to retrieve is
         optional. (default: 100).

   REST: 
      http://rest.g-language.org/NC_000913/after_stopcodon/recA/200


=cut

sub after_stopcodon {
    if (scalar(@_) < 2){
	carp("Not enough arguments.");
	return;
    }
    my $this = shift;
    my $object = shift;
    my $length = shift || 100;

    if ($this->{$object}->{direction} eq 'complement'){
	my $start = $this->{$object}->{start} - 1 - $length;
	if($start < 0){
	    $start = 0;
	    $length = $this->{$object}->{start};
	}
	return complement(substr($this->{SEQ}, $start, $length));
    }else{
	return substr($this->{SEQ}, $this->{$object}->{end} +1 - 1, $length);
    }
}

=head2 $gb->startcodon()

   Name: $gb->startcodon()   -   get the start codon of the given CDS

   Description:
         Given a CDS ID, returns the start codon.

         eg. $gb->startcodon("FEATURE$i"); # returns 'atg'

   REST: 
      http://rest.g-language.org/NC_000913/startcodon/recA


=cut

sub startcodon {
    if (scalar(@_) < 2){
	carp("Not enough arguments.");
	return;
    }
    return substr($_[0]->get_geneseq($_[1]), 0, 3);
}


=head2 $gb->stopcodon()

   Name: $gb->stopcodon()   -   get the stop codon of the given CDS

   Description:
         Given a CDS ID, returns the stop codon.

         eg. $gb->stopcodon("FEATURE$i"); # returns 'tag'

   REST: 
      http://rest.g-language.org/NC_000913/stopcodon/recA


=cut

sub stopcodon {
    if (scalar(@_) < 2){
	carp("Not enough arguments.");
	return;
    }
    return substr($_[0]->get_geneseq($_[1]), -3, 3);
}

=head2 $gb->pos2feature()

   Name: $gb->pos2feature()   -   get a feature ID from position

   Description:
         Given a GenBank position (sequence starting from position 1) 
         returns the feature ID (ex. FEATURE123) of the feature at
         the given position. If multiple features exist for the given
         position, the first feature to appear is returned. Returns 
         NULL if no feature exists.

         When two positions are specified, all features within given 
         range is returned as an array of feature IDs.

   REST: 
      http://rest.g-language.org/NC_000913/pos2feature/1024


=cut

sub pos2feature {
    if (scalar(@_) < 2){
	carp("Not enough arguments.");
	return;
    }
    my $this = shift;
    my $pos = shift;
    my $pos2 = shift;

    if(length($pos2) > 1){
	my @genes = ();

	foreach my $feat ($this->feature()){
	    next if ($feat eq 'FEATURE0');

	    if ($this->{$feat}->{end} >= $pos && $pos2 >= $this->{$feat}->{start}){
		push(@genes, $feat);
	    }elsif ($pos2 < $this->{$feat}->{start}){
		return @genes;
	    }
	}

    }else{

	foreach my $feat ($this->feature()){
	    next if ($feat eq 'FEATURE0');

	    if ($pos >= $this->{$feat}->{start} && $pos <= $this->{$feat}->{end}){
		return $feat;
	    }elsif ($pos < $this->{$feat}->{start}){
		return '';
	    }
	}
    }
}

=head2 $gb->gene2id()

   Name: $gb->gene2id()   -   get a feature ID from canonical gene name

   Description:
         Given a GenBank gene name, returns the feature ID (ex. FEATURE123). 
         Returns NULL if no gene exists.

   REST: 
      http://rest.g-language.org/NC_000913/gene2id/recA

=cut

sub gene2id {
    if (scalar(@_) < 2){
	carp("Not enough arguments.");
	return;
    }
    return length($_[0]->{$_[1]}->{feature}) ? 'FEATURE' . $_[0]->{$_[1]}->{feature} : '';
}

sub feature2id {
    if (scalar(@_) < 2){
	carp("Not enough arguments.");
	return;
    }

    return 'FEATURE' . $_[0]->{$_[1]}->{feature};
}

=head2 $gb->pos2gene()

   Name: $gb->pos2gene()   -   get a feature ID of CDS from position

   Description:
         Given a GenBank position (sequence starting from position 1) 
         returns the feature ID (ex. FEATURE123) of the gene at
         the given position. If multiple genes exists for the given
         position, the first gene to appear is returned. Returns 
         NULL if no gene exists.

         When two positions are specified, all genes within given 
         range is returned as an array of feature IDs.

   REST: 
      http://rest.g-language.org/NC_000913/pos2gene/1024


=cut

sub pos2gene {
    if (scalar(@_) < 2){
	carp("Not enough arguments.");
	return;
    }
    my $this = shift;
    my $pos = shift;
    my $pos2 = shift;

    if(length($pos2) > 1){
	my @genes = ();

	foreach my $feat ($this->cds()){
	    if ($this->{$feat}->{end} >= $pos && $pos2 >= $this->{$feat}->{start}){
		push(@genes, $feat);
	    }elsif ($pos2 < $this->{$feat}->{start}){
		return @genes;
	    }
	}

    }else{
	foreach my $feat ($this->cds()){
	    if ($pos >= $this->{$feat}->{start} && $pos <= $this->{$feat}->{end}){
		return $feat;
	    }elsif ($pos < $this->{$feat}->{start}){
		return '';
	    }
	}
    }
}


=head2 $gb->get_geneseq()

   Name: $gb->get_geneseq()   -   get nucleotide sequence of the given gene

   Description:
         Given a CDS ID, returns the CDS sequence, or the exon sequence
         If introns are present.
         'complement' is properly parsed, and introns are spliced out.
         Codon start position is used when codon_start key is available.

         eg. $gb->get_geneseq('recA'); # returns the 'recA' gene sequence or exon.

   REST: 
      http://rest.g-language.org/NC_000913/get_geneseq/recA


=cut

sub get_geneseq {
    if (scalar(@_) < 2){
	carp("Not enough arguments.");
	return;
    }
    my $this = shift;
    my $object = shift;

    my $geneseq = $this->get_gbkseq($this->{$object}->{start}, 
				   $this->{$object}->{end});
    if ($this->{$object}->{join}){
	$geneseq = $this->get_exon($object);
    }elsif ($this->{$object}->{direction} eq 'complement'){
	$geneseq = &complement($geneseq);
    }

    my $codon_start = $this->{$object}->{codon_start} || 1;
    return substr($geneseq, $codon_start - 1);
}

=head2 $gb->get_intron()

   Name: $gb->intron()   -   get a list of intron sequences of the given CDS

   Description:
         Given a CDS ID, returns the intron sequences as array of 
         sequences.

         eg. $gb->get_intron('CDS1'); 
             # returns ($1st_intron, $2nd_intron,..)

   REST: 
      http://rest.g-language.org/NC_000913/get_intron/recA

=cut

sub get_intron {
    if (scalar(@_) < 2){
	carp("Not enough arguments.");
	return;
    }
    my $this = shift;
    my $cds = shift;

    return unless (length $this->{$cds}->{join});

    my @join = split(/\.\./, $this->{$cds}->{join});
    shift @join;
    pop @join;
    my @seq;

    foreach my $line (@join){
	$line =~ s/c//g;
	my ($start, $end) = split(/,/, $line, 2);
	my $tmp = $this->get_gbkseq($start + 1, $end - 1);
	$tmp = '' if($end - 2  - $start < 0);
	push (@seq, $tmp);
    }

    return @seq;
}

=head2 $gb->get_exon()

   Name: $gb->get_exon()   -   get a concatenated exon sequence of the given CDS

   Description:
         Given a CDS ID, returns the exon sequence.
         'complement' is properly parsed, and introns are spliced out.

         eg. $gb->get_exon('CDS1'); returns the 'CDS1' exon.

   REST: 
      http://rest.g-language.org/NC_000913/get_exon/recA

=cut

sub get_exon {
    if (scalar(@_) < 2){
	carp("Not enough arguments.");
	return;
    }
    my $this = shift;
    my $cds = shift;

    return unless (length $this->{$cds}->{join});

    my $seq = '';

    foreach my $line (split(/,/, $this->{$cds}->{join})){
	my $complement = $line =~ tr/c//d;
	my ($start, $end) = split(/\.\./, $line, 2);
	my $tmp = $this->get_gbkseq($start, $end);
	$tmp = complement($tmp) if ($complement);
	$seq .= $tmp;
    }

    $seq = complement($seq) if ($this->{$cds}->{direction} eq 'complement');
    return $seq;
}


=head2 $gb->get_exons()

   Name: $gb->get_exons()   -   get a list of exon sequences of the given CDS

   Description:
         Given a CDS ID, returns the sequences of exon sequence.
         'complement' is properly parsed, and introns are spliced out.

         eg. $gb->get_exons('CDS1'); returns the array of exon sequences.

   REST: 
      http://rest.g-language.org/NG_016275/get_exons/IL10RA

=cut

sub get_exons {
    if (scalar(@_) < 2){
	carp("Not enough arguments.");
	return;
    }
    my $this = shift;
    my $cds = shift;

    return unless (length $this->{$cds}->{join});

    my @seq;

    foreach my $line (split(/,/, $this->{$cds}->{join})){
	my $complement = $line =~ tr/c//d;
	my ($start, $end) = split(/\.\./, $line, 2);
	my $tmp = $this->get_gbkseq($start, $end);
	$tmp = complement($tmp) if ($complement);
	
	if($this->{$cds}->{direction} eq 'complement'){
	    unshift(@seq, complement($tmp));
	}else{
	    push(@seq, $tmp);
	}
    }

    return @seq;
}



=head2 $gb->seq_info()

   Name: $gb->seq_info()   -   display basic statistics about the data

   Description:
         Prints the basic information of the genome to STDOUT.

   REST: 
      http://rest.g-language.org/NC_000913/seq_info

=cut

sub seq_info {
    my $this = shift;
    my $nomsg = shift;
    my $type = shift;

    my $length = length($this->{SEQ});

    my $a = $this->{SEQ} =~ tr/a/a/;
    my $t = $this->{SEQ} =~ tr/t/t/;
    my $g = $this->{SEQ} =~ tr/g/g/;
    my $c = $this->{SEQ} =~ tr/c/c/;
    my $symbol = $this->{SEQ} =~ tr/0123456789 _-/0123456789 _-/;
    my $others = $length - $a - $t - $g - $c;
    my $msg;

    unless($nomsg){
	$msg .= sprintf "\n\nAccession Number: %s\n", $this->{LOCUS}->{id} if(length($this->{LOCUS}->{id}));
	$msg .= sprintf "\nDefinition: %s\n", $this->{DEFINITION} if(length($this->{DEFINITION}));
	$msg .= sprintf "\n  Merged LOCUS Count : %9d\n" , scalar(grep{/LOCUS/} keys(%$this)) - 2 if(exists($this->{LOCUS2}));
	$msg .= sprintf "\n  Length of Sequence : %9d\n" , $length;
	
	if($length == 0){
	    $msg .= sprintf "\n  No Sequence Found.\n\n";
	}elsif($symbol > 5){
	    $msg .= sprintf "\n  Sequence seems to be invalid (neither nucleotide nor amino acid).\n\n";
	}elsif($others > $a + $t + $g + $c && $type != 1){
	    amino_info($this);
	    return;
	}else{
	    $msg .= sprintf "           A Content : %9d (%.2f\%)\n" , $a , $a / $length * 100;
	    $msg .= sprintf "           T Content : %9d (%.2f\%)\n" , $t , $t / $length * 100;
	    $msg .= sprintf "           G Content : %9d (%.2f\%)\n" , $g , $g / $length * 100;
	    $msg .= sprintf "           C Content : %9d (%.2f\%)\n" , $c , $c / $length * 100;
	    $msg .= sprintf "              Others : %9d (%.2f\%)\n" , $others,  $others / $length * 100;
	    $msg .= sprintf "          AT Content :    %.2f\%\n"    , ($a + $t) / $length * 100;
	    $msg .= sprintf "          GC Content :    %.2f\%\n\n"  , ($g + $c) / $length * 100;

	    my ($genes, $cds, $trna, $rrna) = (scalar($this->gene()), scalar($this->cds()), scalar($this->tRNA()), scalar($this->rRNA()));
	    if($cds > 0){
		$msg .= sprintf "      Number of genes :    %d (CDSs: %s, tRNAs: %s, rRNAs: %s) \n"    , $genes, $cds, $trna, $rrna;
	    }
	}
	&msg_send($msg);
    }

    return ($a, $t, $g, $c);
}

=head2 $gb->disable_pseudogenes()

   Name: $gb->disable_pseudogenes()   -   turns all pseudogenes off

   Description:
         Turns off all pseudogenes by setting $genome->{$feature}->{on}
         to 0 when $genome->{$feature}->{pseudo} is true.

   REST: 
      http://rest.g-language.org/NC_000913/disable_pseudogenes

=cut


sub disable_pseudogenes {
    my $this = shift;

    foreach my $feature ($this->feature()){
	$this->{$feature}->{on} = 0 if ($this->{$feature}->{pseudo});
    }
}

=head2 $gb->feature()

   Name: $gb->feature()   -   get a list of feature IDs

   Description:
         Returns the array of all feature IDs.
         Features are ignored when $gb->{$feature}->{on} is 0.

         eg.
           foreach ($gb->feature()){
               $gb->get_cdsseq($_);
           }
           #prints all feature sequences.

         Optionally, feature type can be supplied to return only the
         specifies features.

         eg. $gb->feature("tRNA"); # returns feature IDs only for tRNAs

         Option of "all" always returns all features regardless of the
         value of $gb->{$feature}->{on}.

   REST: 
      http://rest.g-language.org/NC_000913/feature

=cut

sub feature {
    my $this = shift;
    my $type  = shift;
    my $opt = shift;

    if($type eq 'all'){
	$opt = 'all';
	$type = '';
    }

    my $i = -1;
    my @feature;

    while(defined($this->{"FEATURE" . ($i + 1)})){
        $i ++;

	if(length($type)){
	    next unless ($this->{"FEATURE$i"}->{type} eq $type);
	}

	if ($opt ne 'all' && defined $this->{"FEATURE$i"}->{on}){
	    next if ($this->{"FEATURE$i"}->{on} == 0);
	}

        push (@feature, "FEATURE$i");
    }

    return @feature;
}

=head2 $gb->rRNA()

   Name: $gb->rRNA()   -   get a list of feature IDs of rRNAs

   Description:
         Returns the array of all feature IDs of rRNAs.

   Options:
         Second optional argument can specify the rRNA type 
         (5S, 16S, 23S, 5.8S, 18S, 28S). When used in scalar context,
         only the longest match is returned.

         ex: 
            @SSU = $gb->rRNA('16S');
            $longestSSU = $gb->rRNA('16S');

   REST: 
      http://rest.g-language.org/NC_000913/rRNA/16S


=cut

sub rRNA {
    my $this = shift;
    my $option = shift;

    if($option =~ /\d+S|SSU|LSU/){
	my %rRNA;

	foreach ($this->feature('rRNA')){
	    if($this->{$_}->{product} =~ /([0-9\.]+S)/i){
		push(@{$rRNA{uc($1)}}, $_);
	    }elsif($this->{$_}->{gene} =~ /rrf/){
		push(@{$rRNA{'5S'}}, $_);
	    }elsif($this->{$_}->{gene} =~ /rrl/){
		push(@{$rRNA{'23S'}}, $_);
	    }elsif($this->{$_}->{gene} =~ /rrs/){
		push(@{$rRNA{'16S'}}, $_);
	    }elsif($this->{$_}->{gene} =~ /([0-9\.]+S)/i){
		push(@{$rRNA{uc($1)}}, $_);
	    }elsif($this->{$_}->{locus_tag} =~ /([0-9]+S)/i){
		push(@{$rRNA{uc($1)}}, $_);
	    }elsif($this->{$_}->{note} =~ /([0-9\.]+S)+?/){
		push(@{$rRNA{uc($1)}}, $_);
	    }elsif($this->{$_}->{product} =~ /(SSU|small subunit) ribosomal RNA/i){
		push(@{$rRNA{'SSU'}}, $_);
	    }elsif($this->{$_}->{product} =~ /(LSU|large subunit) ribosomal RNA/i){
		push(@{$rRNA{'LSU'}}, $_);
	    }
	}

	if(defined $rRNA{$option}){
	    my @result = sort {length($this->get_geneseq($b))<=>length($this->get_geneseq($a))} @{$rRNA{$option}};
	    return wantarray ? @result : $result[0];
	}
    }else{
	return feature($this, 'rRNA', $option);
    }
}


=head2 $gb->tRNA()

   Name: $gb->tRNA()   -   get a list of feature IDs of tRNAs

   Description:
         Returns the array of all feature IDs of tRNAs.

   REST: 
      http://rest.g-language.org/NC_000913/tRNA


=cut

sub tRNA {
    return feature($_[0], 'tRNA', $_[1]);
}


=head2 $gb->gene()

   Name: $gb->gene()   -   get a list of feature IDs of genes

   Description:
         Returns the array of all feature IDs of genes.

   REST: 
      http://rest.g-language.org/NC_000913/gene


=cut

sub gene {
    return feature($_[0], 'gene', $_[1]);
}

=head2 $gb->cds()

   Name: $gb->cds()   -   get a list of CDS IDs

   Description:
         Returns the array of all feature IDs of CDS.
         Features are ignored when $gb->{FEATURE$i}->{on} OR
         $gb->{CDS$i}->{on} is 0.

         !CAUTION! the object name is actually the FEATURE ID,
         to enable access to all feature values. However, most of the
         time you do not need to be aware of this difference.

         eg.
           foreach ($gb->cds()){
               $gb->get_geneseq($_);
           }
           #prints all gene sequences.

         Option of "all" always returns all features regardless of the
         value of $gb->{$feature}->{on}.

   REST: 
      http://rest.g-language.org/NC_000913/cds


=cut

sub cds {
    return feature($_[0], 'CDS', $_[1]);
}


=head2 $gb->next_feature()

   Name: $gb->next_feature()   -   get the next feature ID

   Description:
         Given a feature ID, returns the ID of the next feature.
         Second argument can be used to specify the type of the 
         next feature.

         eg. $gb->next_feature(FEATURE1234); # returns 'FEATURE1235'
             $gb->next_feature(FEATURE1234, 'tRNA'); 
             # returns next feature ID whose type is 'tRNA'

   REST: 
      http://rest.g-language.org/NC_000913/next_feature/recA


=cut

sub next_feature{
    my $this = shift;
    my $feature = shift || 'FEATURE0';
    my $opt = shift;

    my @features = $this->feature($opt);
    return undef if ($feature eq $features[-1]);

    $feature = $this->{$feature}->{left} if ($feature =~ /^INTER/);

    my $i = $this->{$feature}->{feature};
    $i ++;

    while(%{$this->{"FEATURE$i"}}){
	my $feat = "FEATURE$i";
	$i ++;

	if(length($opt)){
	    next unless($this->{$feat}->{type} eq $opt);
	}

	return $feat;
    }
}


=head2 $gb->previous_feature()

   Name: $gb->previous_feature()   -   get the previous feature ID

   Description:
         Given a feature ID, returns the ID of the previous feature.
         Second argument can be used to specify the type of the 
         next feature.

         eg. $gb->previous_feature(FEATURE1234); # returns 'FEATURE1233'
             $gb->previous_feature(FEATURE1234, 'tRNA'); 
             # returns previous feature ID whose type is 'tRNA'

   REST: 
      http://rest.g-language.org/NC_000913/previous_feature/recA

=cut


sub previous_feature{
    my $this = shift;
    my $feature = shift || 'FEATURE0';
    my $opt = shift;

    return undef if ($feature eq 'FEATURE0');
    $feature = $this->{$feature}->{right} if ($feature =~ /^INTER/);

    my $i = $this->{$feature}->{feature};
    $i --;

    while(%{$this->{"FEATURE$i"}}){
	my $feat = "FEATURE$i";
	$i --;

	if(length($opt)){
	    next unless($this->{$feat}->{type} eq $opt);
	}

	return $feat;
    }
}

=head2 $gb->next_cds()

   Name: $gb->next_cds()   -   get the feature ID of next CDS

   Description:
         Given a feature ID, returns the ID of the next cds.
         This is same as $gb->next_feature($featureID, 'CDS');

   REST: 
      http://rest.g-language.org/NC_000913/next_cds/recA


=cut

sub next_cds{
    return next_feature($_[0], $_[1], 'CDS');
}

=head2 $gb->previous_cds()

   Name: $gb->previous_cds()   -   get the feature ID of previous CDS

   Description:
         Given a feature ID, returns the ID of the previous cds.
         This is same as $gb->previous_feature($featureID, 'CDS');

   REST: 
      http://rest.g-language.org/NC_000913/previous_cds/recA


=cut

sub previous_cds{
    return previous_feature($_[0], $_[1], 'CDS');
}

=head2 $gb->intergenic()

   Name: $gb->intergenic()   -   get a list of IDs of intergenic regions

   Description:
         Returns the array of all IDs of intergenic regions. 
         Here "intergenic region" is defined as the region in a genome
         between coding and stable RNA genes. 

   REST: 
      http://rest.g-language.org/NC_000913/intergenic

=cut

sub intergenic {
    my $this = shift;
    my $opt = shift || '';
    my $i = 0;
    my @cds;

    set_intergenic($this) unless(defined($this->{INTER1}));

    while($this->{"INTER" . ($i + 1)}){
	$i ++;

	if ($opt ne 'all'){
	    next if ($this->{"INTER$i"}->{on} == 0);
	}
	
	push (@cds, "INTER$i");
    }

    return @cds;
}
	  

sub set_intergenic{
    my $gb = shift;
    return if(defined($gb->{INTER1}));

    my $num = 1;
    my $i = 0;
    my $interface = msg_ask_interface();
    msg_interface("NULL");
    my @cds = $gb->find(-type=>"CDS|RNA");

    foreach my $feature (@cds){
	if($i == 0 && $gb->{$feature}->{start} < $gb->{$feature}->{end}){
	    $gb->{"INTER$num"}->{start} = 1;
	    $gb->{"INTER$num"}->{end} = $gb->{$feature}->{start} - 1;
	    $gb->{"INTER$num"}->{direction} = "direct";
	    $gb->{"INTER$num"}->{left} = undef;
	    $gb->{"INTER$num"}->{right} = $feature;
	    $gb->{"INTER$num"}->{on} = 1;
	    $num++;
	}elsif($i == $#cds && $gb->{$cds[-1]}->{end} + 1 < $gb->{SEQ}){
	    $gb->{"INTER$num"}->{start} = $gb->{$cds[-1]}->{end} + 1;
	    $gb->{"INTER$num"}->{end} = length $gb->{SEQ};
	    $gb->{"INTER$num"}->{direction} = "direct";
	    $gb->{"INTER$num"}->{left} = $feature;
	    $gb->{"INTER$num"}->{right} = undef;
	    $gb->{"INTER$num"}->{on} = 1;
	    $num++;
	}elsif($i > 0){
	    if($gb->{$cds[$i - 1]}->{end} < $gb->{$feature}->{start}){
		$gb->{"INTER$num"}->{start} = $gb->{$cds[$i - 1]}->{end} + 1;
		$gb->{"INTER$num"}->{end} = $gb->{$feature}->{start} - 1;
		$gb->{"INTER$num"}->{direction} = "direct";
		$gb->{"INTER$num"}->{left} = $cds[$i - 1];
		$gb->{"INTER$num"}->{right} = $feature;
		$gb->{"INTER$num"}->{on} = 1;
		$num++;
	    }
	}
	$i++;
    }

    msg_interface($interface);
}

=head2 $gb->id()

   Name: $gb->id()   -   get the sequence ID from G instance

   Description:
         Returns the sequence ID. Same as $gb->{LOCUS}->{id};

   REST: 
      http://rest.g-language.org/ecoli/id

=cut

sub id{
    return $_[0]->{LOCUS}->{id};
}

=head2 $gb->seq()

   Name: $gb->seq()   -   get the sequence data from G instance

   Description:
         Returns the entire sequence. Same as $gb->{SEQ};

   REST: 
      http://rest.g-language.org/ecoli/seq

=cut

sub seq{
    return $_[0]->{SEQ};
}



sub hairpin_cut {
    system('firefox http://www.toychan.net/afro/');
    return "\n==============\n!!!Afro Man!!!===============\n\n";
}





=head2 $gb->del_key()

   Name: $gb->del_key()   -   delete a data object from G instance

   Description:
         Given a object, deletes it from the G instance structure
         eg. $gb->del_key('FEATURE1'); # deletes 'FEATURE1' hash

   REST: 
      http://rest.g-language.org/NC_000913/del_key/FEATURE1

=cut


sub del_key {
    $_[0]->{$_[1]}->{on} = 0;
    return 1;
}


=head2 $gb->relocate_origin()

   Name: $gb->relocate_origin()   -   create a G instance starting at given position

   Description:
         Returns a G instance starting at given position, assuming circular
         chromosome. All information, including the sequence and feature 
         annotations are moved. Note that the given position is Perl position
         and NOT GenBank position. GenBank position -1 equals Perl position.

   Usage:
      $new = $gb->relocate_origin($position);

      This method would probably be most useful in conjunction with
      find_ori_ter(), to create a G instance starting from the 
      origin of replication, as follows:

        ($ori, $ter) = find_ori_ter($gb);
        $new = $gb->relocate_origin($ori);

      Several of related methods can be concatenated. For example, 
      to create a GenBank file of complementary DNA strand starting
      from the origin of replication, do the following:

        $gb->reverse_strand()->relocate_origin($ori)->output("out.gbk");

   REST: 
      http://rest.g-language.org/NC_000913/relocate_origin


=cut

sub relocate_origin{
    require G::IO;
    my $this = new G::IO("blessed");
    my $tmp = shift;
    my $gb = $tmp->clone();
    my $pos = shift;
    croak("New start position \(in Perl coordinate\) must be given.\n") unless($pos =~ /\d/);

    $this->{LOCUS} = $gb->{LOCUS};
    $this->{HEADER} = $gb->{HEADER};
    $this->{COMMENT} = $gb->{COMMENT};
    $this->{SEQ} = substr($gb->{SEQ}, $pos) . substr($gb->{SEQ}, 0, $pos);
    $this->{FEATURE0} = $gb->{FEATURE0};

    my (@before, @after);
    my @features = $gb->feature();
    shift @features;
    foreach my $feature (@features){
	if($gb->{$feature}->{start} >= $pos + 1){
	    push(@after, $feature);
	}else{
	    push(@before, $feature);
	}
    }

    my $f = 0;
    my $c = 0;
    foreach my $feature (@after, @before){
	$f ++;
	$this->{"FEATURE$f"} = $gb->{$feature};
	$this->{"FEATURE$f"}->{feature} = $f;
	if($gb->{$feature}->{type} eq 'CDS'){
	    $c ++;
	    $this->{"FEATURE$f"}->{cds} = $c;
	}

	if($gb->{$feature}->{end} >= $pos + 1 && $gb->{$feature}->{start} < $pos + 1){
	    $this->{"FEATURE$f"}->{start} += length($gb->{SEQ}) - $pos;
	    $this->{"FEATURE$f"}->{end} -= $pos;

	    if(length $this->{"FEATURE$f"}->{join}){
		msg_error("Warning: overriding join definition for FEATURE$f.\n");
		msg_error("  this is likely to destroy positional features of this gene entry.\n");
	    }

	    $this->{"FEATURE$f"}->{join} = sprintf("%d\.\.%d,%d\.\.%d", $this->{"FEATURE$f"}->{start},
						   length($gb->{SEQ}), 1, $this->{"FEATURE$f"}->{end});
	    next;
	}

	my $lng = length($gb->{SEQ}) - $pos;
	if($gb->{$feature}->{start} >= $pos + 1){
	    $lng = -$pos;
	}

	$this->{"FEATURE$f"}->{start} += $lng;
	$this->{"FEATURE$f"}->{end} += $lng;

	if(defined $this->{"FEATURE$f"}->{"join"}){
	    my @join = split(/\,/,$this->{"FEATURE$f"}->{"join"});
	    my @num = ();
	    my @new_join = ();

	    foreach(@join){
		if(tr/c/c/){
		    @num = split(/\.\./,$_);
		    push (@new_join, sprintf ("c%d\.\.%d", $num[0] + $lng, $num[1] + $lng));
		} else {
		    @num = split(/\.\./,$_);
		    push (@new_join, sprintf ("%d\.\.%d",  $num[0] + $lng, $num[1] + $lng));
		}
	    }
	    $this->{"FEATURE$f"}->{join} = join(',', @new_join);
	}
    }
    
    $this->set_gene_aliases();

    return $this;
}

=head2 $gb->reverse_strand()

   Name: $gb->reverse_strand()   -   create a G instance on complementary DNA strand

   Description:
         Returns a G instance for the complementary DNA strand. 
         All information, including the sequence and feature annotations
         is switched to reflect that of the complementary DNA strand. 
         In other words, gene order, direction of genes (either direct or 
         complement), and positions are reversed. 

   Usage: 
      $new = $gb->reverse_strand();

   REST: 
      http://rest.g-language.org/NC_000913/reverse_strand

=cut


sub reverse_strand{
    require G::IO;
    my $this = new G::IO("blessed");
    my $tmpG = shift;
    my $gb = $tmpG->clone();

    $this->{LOCUS} = $gb->{LOCUS};
    $this->{HEADER} = $gb->{HEADER};
    $this->{COMMENT} = $gb->{COMMENT};
    $this->{SEQ} = complement($gb->{SEQ});
    $this->{FEATURE0} = $gb->{FEATURE0};

    my @feat = $gb->feature();
    shift @feat;
    my (@features, @tmp);
    foreach my $feature (@feat){
	if($gb->{$feature}->{type} eq 'gene'){
	    unshift(@features, @tmp);
	    @tmp = ($feature);
	}else{
	    push(@tmp, $feature);
	}
    }
    unshift(@features, @tmp);

    my $f = 0;
    my $c = 0;
    my $lng = length($gb->{SEQ}) + 1;
    foreach my $feature (@features){
	$f ++;
	$this->{"FEATURE$f"} = $gb->{$feature};
	$this->{"FEATURE$f"}->{feature} = $f;

	if($gb->{$feature}->{type} eq 'CDS'){
	    $c ++;
	    $this->{"FEATURE$f"}->{cds} = $c;
	}

	my ($start, $end) = ($lng - $gb->{$feature}->{end}, $lng - $gb->{$feature}->{start});
	$this->{"FEATURE$f"}->{start} = $start;
	$this->{"FEATURE$f"}->{end}   = $end;
	$this->{"FEATURE$f"}->{direction} = $gb->{$feature}->{direction} eq 'direct' ? 'complement' : 'direct';

	if(defined $this->{"FEATURE$f"}->{"join"}){
	    my @join = split(/\,/,$this->{"FEATURE$f"}->{"join"});
	    my @num = ();
	    my @new_join = ();

	    foreach(@join){
		if(tr/c/c/){
		    @num = split(/\.\./,$_);
		    push (@new_join, sprintf ("c%d\.\.%d", $lng - $num[1], $lng - $num[0]));
		} else {
		    @num = split(/\.\./,$_);
		    push (@new_join, sprintf ("%d\.\.%d",  $lng - $num[1], $lng - $num[0]));
		}
	    }
	    $this->{"FEATURE$f"}->{join} = join(',', reverse(@new_join));
	}
    }
    $this->set_gene_aliases();

    return $this;
}


sub _interpret_format {
    my $filename = shift;
    my $ref = ref $filename;

    if ($filename =~ /^(.*?):(.*)/){
	unless(lc($1) =~ /(swall|swiss|sw|uniprot|genbank|genpept|embl|refseq)/){
	    croak("Unsupported database name for USA. Supported databases are\n" .
		"swiss, sw, uniprot, genbank, genpept, embl, or refseq\n");
	}
	return "usa";
    }elsif (lc($filename) =~ /\.(gb|gbk|gbank|genbank|nuccore)$/i){
	return 'genbank';
    }elsif (lc($filename) =~ /\.(fasta|fast|seq|fst|fa|fsa|nt|aa|ffn|fna)/i){
	return 'fasta';
    }elsif (lc($filename) =~ /\.(fastq|fq)/i){
	return 'fastq';
    }elsif (lc($filename) =~ /\.(embl)/i){
	return 'embl';
    }elsif ($filename =~ /^NC_\d+$/i ||
	    $filename =~ /^NP_\d+$/i ||
	    $filename =~ /^(?:[A-Z]|[A-Z][A-Z]).\d+$/i ||
	    $filename =~ /^NC_\d+\.\d+$/i ||
	    $filename =~ /^NP_\d+\.\d+$/i ||
	    $filename =~ /^(?:[A-Z]|[A-Z][A-Z]).\d+\.\d+$/i
	    ){
	return 'RefSeq';
    }elsif(-e $filename){
	my @tmp = split(/\./, $filename);
	my $format = $filename =~ /\./ ? pop(@tmp) : '';

	if (length $format){
	    return $format;
	}else{
	    open(FILE, $filename);
	    while(<FILE>){
		if(/^>/){
		    return 'fasta';
		}elsif(/^LOCUS/){
		    return 'genbank';
		}elsif(/^ID/){
		    return 'embl';
		}else{
		    carp("Unknown file format. Interpreting $filename as GenBank...\n");
		    return 'genbank';
		}
		last;
	    }
	    close(FILE);
	}
    }else{
	my $id = readFile("http://rest.g-language.org/gbp/orgname.cgi?id=$filename", 1);
	return ("named:$id") if(length($id));
    }

}

=head2 $gb->output()

   Name: $gb->output()   -   output the G instance data to file

   Description:
         Given a filename and an option, outputs the G-language data object 
         to the specified file in a flat-file database of a given format.
         The options are the same as those of new().  Default format is 'GenBank'.

         eg. $gb->output("my_genome.embl", "EMBL");
             $gb->output("my_genome.gbk"); # with GenBank you can ommit the option.
             $gb->output("my_genome.gff3");

   REST: 
      http://rest.g-language.org/NC_000913/output/embl (convert input genome to EMBL format)


=cut

sub output {
    my $gb = shift;
    my $file = shift;
    my $option = shift;

    $option = _interpret_format($file) unless(length $option);

    if (lc($option) eq 'genbank'){
	$gb->make_gb($file);
    }elsif(length $option){
	my $in;

	my ($fh, $outfile) = tempfile(undef, SUFFIX=>'.gbk');
	$gb->make_gb($outfile);

	require LWP::UserAgent;
	my $ua = LWP::UserAgent->new;
            my $response = $ua->post("http://rest.g-language.org/emboss/",
                                     Content_Type=>'multipart/form-data',
                                     Content=>[file1 => [$outfile], 'arg'=>"seqret/osformat2=$option/-feature"]
                                     );

	if ($response->is_success) {
	    writeFile($response->decoded_content, $file);
	} else {
	    die("Error in output:\n" . $response->status_line . "\n File format \"$option\"of \"$file\" is not supported.");
	}

	unlink($outfile);
    }else{
	&msg_error("G::output - Unknown format to output.");
    }
}




=head2 $gb->set_generationtime()

   Name: $gb->set_generationtime()   -   add generation time data for bacterial genomes

   Description:
         This method adds generation time data for bacterial genomes,
         based on the data in Reference 1.

         $gb->{FEATURE0}->{generationTime} is the generation time (hour), and
         $gb->{feature0}->{optimumGrowthTemperature} is the optimum growth temperature in Celsius.

  References: 
    1. Vieira-Silva S and Rocha RPC (2010) "The systemic imprint of growth and its uses in 
       ecological (meta)genomics", PLoS Genetics, 6(1):e1000808

  REST: 
      http://rest.g-language.org/NC_000913/set_generationtime

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  HISTORY:
     20110224-01 initial posting

=cut


sub set_generationtime {
    my $gb = shift;

    my $filename = dist_file('g-language', 'data/generationtime.dat');
    for my $line (readFile($filename, 1)){
	next unless($line =~ $gb->{LOCUS}->{id});

	my @F = split(/\t/, $line);
	$gb->{FEATURE0}->{'generationTime'} = $F[3];
	$gb->{FEATURE0}->{'optimumGrowthTemperature'} = $F[2];
    }

    unless(length($gb->{FEATURE0}->{'generationTime'})){
	msg_error("No generation time data for this species.\n\n");
    }

    return 1;
}

=head2 $gb->set_essentiality()

   Name: $gb->set_essentiality()   -   add gene essentiality data for E.coli genome 

   Description:
         This method adds gene essentiality data for E.coli genome,
         based on the data in PEC database (http://www.shigen.nig.ac.jp/ecoli/pec/index.jsp)
         Only works for U00096 or NC_000913 E.coli K-12 MG1655 flatfile.

         $gb->{$cds}->{essentiality} is 1 for essential, -1 for non-essential genes.

  References:
    1. Hashimoto M et al. (2005) "Cell size and nucleoid organization of engineered 
       Escherichia coli cells with a reduced genome", Mol Microbiol, 55(1):137-149

  REST: 
      http://rest.g-language.org/NC_000913/set_essentiality

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  HISTORY:
     20100123-01 bundled the data in distribution, moved the function to G::IO::Handler
     20070219-01 initial posting

=cut

sub set_essentiality {
    my $gb = shift;

    if ($gb->{LOCUS}->{id} eq 'U00096' || $gb->{LOCUS}->{id} eq 'NC_000913'){

        my $filename = dist_file('g-language', 'data/PECData.dat');
	my $i = 1;
	open(FILE, $filename) || die($!);
	while(<FILE>){
	    chomp;
	    my @hoge = split(/\t/);
	    my @list = ($hoge[0], split(/,/, $hoge[1]));

	    my $id;
	    foreach my $gene (@list){
		$id = $gb->gene2id($gene);
		last if(length $id);
	    }
	    
	    if($hoge[7] == 1){
		$gb->{$id}->{essentiality} = 1;		
	    }elsif($hoge[7] == 2){
		$gb->{$id}->{essentiality} = -1;
	    }
	}
	close(FILE);
   }else{
       msg_error("No essentiality data for this species.\n\n");
   }

    return 1;
}

1;



