#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# This file is part of G-language Genome Analysis Environment package
#
#     Copyright (C) 2001-2014 Keio University
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# 
#   $Id: G::IO.pm,v 1.1 2002/07/30 17:44:27 gaou Exp $
#
# G-language GAE is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
# 
# G-language GAE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with G-language GAE -- see the file COPYING.
# If not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# 
#END_HEADER
#
# written by Kazuharu Arakawa <gaou@sfc.keio.ac.jp> at
# G-language Project, Institute for Advanced Biosciences, Keio University.
#

package G::IO;

use strict;
use base qw(G::IO::Handler G::IO::GenBankI G::IO::GenBankO G::DB::Handler);

use G::Messenger;
use G::DB::SDB;
use G::IO::Annotation;

require G::IO::FastaI;
require G::IO::FastQI;
require G::IO::EmblI;

use Carp;
use LWP::Simple;
use File::Basename;
use File::ShareDir ':ALL';
use autouse 'File::Temp'=>qw(tempfile);

our $VERSION = '1.9.0';

#::::::::::::::::::::::::::::::
#          Perldoc
#::::::::::::::::::::::::::::::

=head1 NAME

G::IO - G-language System v.2 core class

=head1 SYNOPSIS

 use G::IO;
 use base qw(G::IO);
   
=head1 DESCRIPTION

 Inherits all necessary classes.
 Intended for internal use only.
 This is the central core module of G-language GAE.

=head1 AUTHOR

Kazuharu Arakawa, gaou@sfc.keio.ac.jp

=cut

#::::::::::::::::::::::::::::::
#          Variables
#::::::::::::::::::::::::::::::

my $loaded = 0;
my @instances;

#::::::::::::::::::::::::::::::
#    Let the code begin...
#::::::::::::::::::::::::::::::

sub new {
    my $pkg = shift;
    my $filename = shift;

    if(length($filename) < 1){
	$filename = dist_file('g-language', 'genomes/ecoli.gbk');
	msg_error("WARNING: no genome file specified. Loading Escherichia coli K12 MG1655 genome as a sample.\n");
    }elsif(
	   lc($filename) eq 'ecoli' || 
	   lc($filename) eq 'bsub'  || 
	   lc($filename) eq 'pyro'  || 
	   lc($filename) eq 'mgen'  || 
	   lc($filename) eq 'cyano' ||
	   lc($filename) eq 'bbur'  ||
	   lc($filename) eq 'lambda'  ||
	   lc($filename) eq 'plasmidf'
	   ){
	$filename = dist_file('g-language', "genomes/$filename.gbk");
    }

    my @options = @_;
    my $this = {};
    my $tmp = {};

    $filename =~ s/\~\//sprintf("%s\/",$ENV{HOME})/e;
    
    bless $this;

    my $no_msg                 = 0;
    my $multiple_locus         = 0;
    my $multiple_locus_len     = 0;
    my $longest_ORF_annotation = 0;
    my $glimmer_annotation     = 0;
    my $no_cache               = 0;
    my $force_cache            = 0;
    my $gzip                   = 0;
    my $format                 = '';
    my $locus_msg              = '';

    return $this if ($filename eq 'blessed');

    $gzip = $filename =~ s/\.gz$//;
    
    foreach my $opt_tmp (@options){
	next if(length($opt_tmp) < 1);

	my $opt = lc($opt_tmp);

	if ($opt =~ /no msg/){
	    $no_msg = 1;
	}elsif ($opt =~ /multiple locus\D*(\d*)/){
	    $multiple_locus = 1;
	    $multiple_locus_len = $1;
#	    msg_error("Multiple locus len: $multiple_locus_len");
	    $no_cache = 1;
	}elsif ($opt =~ /longest orf/){
	    $longest_ORF_annotation = 1;
	}elsif ($opt =~ /glimmer annotation/){
	    $glimmer_annotation = 1;
	}elsif ($opt =~ /no cache/){
	    $no_cache = 1;
	}elsif ($opt =~ /force cache/){
	    $force_cache = 1;
	    $no_cache = 1;
	}elsif (   $opt eq 'fasta' || $opt eq 'embl' 
		|| $opt eq 'swiss' || $opt eq 'scf' 
		|| $opt eq 'pir'   || $opt eq 'gcg' 
		|| $opt eq 'raw'   || $opt eq 'ace'
		|| $opt eq 'game'  || $opt eq 'phd'
		|| $opt eq 'qual'  || $opt eq 'bsml' 
		|| $opt eq 'fastq' || $opt eq 'genbank' ){
	    $format = $opt;
	}else{
#	    warn("Warning: Unknown Option $opt at \"new G\"\n");
	}
    }
    
    $this->loaded_msg() unless ($no_msg);

    $format = G::IO::Handler::_interpret_format($filename) unless(length $format);

    if($format =~ /^named:(.*)$/){
	$format = 'usa';
	$filename = "genbank:$1";
    }elsif($format eq 'RefSeq'){
	$format = 'usa';
	$filename = "genbank:$filename";
    }elsif($format eq 'fastq' || $format eq 'fasta'){
	$no_cache = 1 unless($force_cache);
    }

    my @stat = stat($filename);
    my $basename = basename($filename) if (-e $filename);
    my $cachedFile = sprintf("%s/.glang/cache-%s-%s-%s", $ENV{HOME}, $basename, $stat[7], $stat[9]);

    if (-e $cachedFile && $no_cache == 0){
	$this = sdb_load(sprintf("cache-%s-%s-%s", $basename, $stat[7], $stat[9]));
	$this->seq_info() unless ($no_msg);
	my $now = time();
	utime $now, $now, $cachedFile;
    }elsif ($format eq 'usa'){
	my ($dbname, $entryname) = split(/:/, $filename, 2);

	if($dbname eq 'swiss' || $dbname eq 'sw' || $dbname eq 'uniprot' || $dbname eq 'swissprot' || $dbname eq 'swall'){
	    $dbname = 'uniprot';
	}

	if($dbname eq 'refseq' || $dbname eq 'genbank'){
	    $dbname = 'nuccore';
	}

	msg_error("Retrieving sequence $entryname from " . uc($dbname) . "...\n");

	if($dbname eq 'nuccore' || $dbname eq 'embl'){
	    my $path = _sdb_path() . $entryname . '.' . $dbname;
	    mirror('http://togows.dbcls.jp/entry/' . $dbname . '/'. $entryname, $path);
	    if($no_msg){
		$this = new G::IO($path, 'no msg');
	    }else{
		$this = new G::IO($path);
	    }
	    croak('Sequence not found in database at http://togows.dbcls.jp/entry/' . $dbname . '/'. $entryname) unless(length($this->{SEQ}));
	}elsif($dbname eq 'uniprot'){
	    my $path = _sdb_path() . $entryname . '.' . $dbname;
	    mirror('http://togows.dbcls.jp/entry/' . $dbname . '/'. $entryname, $path);
	    if($no_msg){
		$this = new G::IO($path, 'no msg', 'swiss');
	    }else{
		$this = new G::IO($path, 'swiss');
	    }
	    croak('Sequence not found in database at http://togows.dbcls.jp/entry/' . $dbname . '/'. $entryname) unless(length($this->{SEQ}));
	}else{
	    die("Unsupported USA: $filename\n");
	}
    }else{

	if ($format eq 'embl'){
	    $this = new G::IO::EmblI;
	}elsif ($format eq 'fasta'){
	    $this = new G::IO::FastaI;
	}elsif ($format eq 'fastq'){
	    $this = new G::IO::FastQI;
	}elsif ($format ne 'genbank'){

	    my($fh, $outfile) = tempfile(undef, SUFFIX=>'.gbk');

	    require LWP::UserAgent;
	    my $ua = LWP::UserAgent->new;
	    my $response = $ua->post("http://rest.g-language.org/emboss/",
				     Content_Type=>'multipart/form-data',
				     Content=>[file1 => [$filename], 'arg'=>'seqret/osformat2=genbank/-feature']
				     );

	    if ($response->is_success) {
		print $fh $response->decoded_content;
	    } else {
		die $response->status_line;
	    }

	    $filename = $outfile;
	    $locus_msg = 'no msg';
	    $format = 'genbank';
	}

	my $fh = $this->open_gb($filename . ($gzip ? '.gz' : ''));

	if ($multiple_locus){
	    my $tmp = $this->clone();
	    undef(%$this);

	    multi_locus($this, $tmp, $multiple_locus_len);
	    
	    $this->set_gene_aliases();
	    $this->seq_info() unless($no_msg);
	}else{
	    $this->next_locus($no_msg);

	    if($force_cache || $no_cache == 0){
		sdb_save($this, sprintf("cache-%s-%s-%s", $basename, $stat[7], $stat[9]));
	    }
	}
    }

    if ($longest_ORF_annotation){
	my $new = new G::IO("blessed");
	bless $new;
	annotate_with_LORF($new, $this);
	return $new;
    }elsif ($glimmer_annotation){
	my $new = new G::IO("blessed");
	bless $new;
	open(FASTA, '>/tmp/out.fasta') || die ($!);
	printf FASTA ">%s\n%s\n", $this->{LOCUS}->{id}, $this->{SEQ};
	close(FASTA);
	run_glimmer($this, '/tmp/out.fasta');
	annotate_with_glimmer($new, '/tmp/out.fasta');
	unlink('/tmp/out.fasta');
	return $new;
    }

    shift @instances;
    push(@instances, $this);

    return $this;
}

sub lastInstance {
    return $instances[0];
}

sub loaded_msg {
    my $this = shift;

    $loaded ++;
    return if ($loaded > 1);

    my $print =
	qq(
	     __/__/__/__/__/__/__/__/__/__/__/__/__/__/__/__/__/__/__/__/__/
                
                   G-language  Genome Analysis Environment v.$VERSION

 
                             http://www.g-language.org/

              Please cite: 
                 Arakawa K. et al. (2003) Bioinformatics.
                 Arakawa K. et al. (2006) Journal of Pestice Science.
	         Arakawa K. et al. (2008) Genes, Genomes and Genomics.

 	      License: GNU General Public License
	      Copyright (C) 2001-2014 G-language Project
	      Institute for Advanced Biosciences, Keio University, JAPAN 

	     __/__/__/__/__/__/__/__/__/__/__/__/__/__/__/__/__/__/__/__/__/
	   \n);

    &msg_error($print);
}


sub multi_locus{
    my $this = shift;
    my $gb  = shift;
    my $len = shift || 0;
    my $lng;
    my $i = 0;
    my $f = 1;
    my $c = 1;
    
#    msg_error("Multiple locus Length Sub: $len");

    do{
        my $F = 1;
        $lng = length($this->{"SEQ"});
        $this->{"LOCUS$i"}          = $gb->{"LOCUS"};
	$this->{"LOCUS$i"}->{start} = $lng + 1;
	$this->{"LOCUS$i"}->{end}   = $lng + 1 + length($gb->{SEQ});
        $this->{"HEADER$i"}         = $gb->{"HEADER"};
        $this->{"COMMENT$i"}        = $gb->{"COMMENT"};

        while(exists($gb->{"FEATURE$F"})){
            $this->{"FEATURE$f"}              = $gb->{"FEATURE$F"};
            $this->{"FEATURE$f"}->{"start"}   = $gb->{"FEATURE$F"}->{"start"} + $lng;
            $this->{"FEATURE$f"}->{"end"}     = $gb->{"FEATURE$F"}->{"end"} + $lng;
            $this->{"FEATURE$f"}->{"locus"}   = $i;
	    $this->{"FEATURE$f"}->{"feature"} = $f;

	    if($gb->{"FEATURE$F"}->{"type"} eq "CDS"){
                if(defined $gb->{"FEATURE$F"}->{"join"}){
                    my @join = split(/\,/,$gb->{"FEATURE$F"}->{"join"});
		    my @num = ();
		    my @new_join = ();

                    foreach(@join){
                        if(tr/c/c/){
                            @num = split(/\.\./,$_);
                            push (@new_join, sprintf ("c%d\.\.%d", $num[0] + $lng, $num[1] + $lng));
                        } else {
                            @num = split(/\.\./,$_);
                            push (@new_join, sprintf ("%d\.\.%d",  $num[0] + $lng, $num[1] + $lng));
                        }
                    }
                    $this->{"FEATURE$f"}->{join} = join(',', @new_join);
                }
		$this->{"FEATURE$f"}->{"cds"} = $c;
                $c++;
            }
            $f++;
            $F++;
        }
        $this->{"SEQ"} .= $gb->{"SEQ"} . 'n' x $len;
        $i++;

    }while($gb->next_locus("no msg"));

    $this->{"FEATURE0"}->{"type"}  = "source";
    $this->{"FEATURE0"}->{"start"} = 1;
    $this->{"FEATURE0"}->{"end"}   = length($this->{"SEQ"});
    $this->{"LOCUS"}   = $this->{"LOCUS1"};
    $this->{"HEADER"}  = $this->{"HEADER1"};
    $this->{"COMMENT"} = $this->{"COMMENT1"};
}

1;

