#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# This file is part of G-language Genome Analysis Environment package
#
#     Copyright (C) 2001-2013 Keio University
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# 
#   $Id: Operon.pm,v 1.2 2002/07/30 17:40:56 gaou Exp $
#
# G-language GAE is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
# 
# G-language GAE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with G-language GAE -- see the file COPYING.
# If not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# 
#END_HEADER
#


package G::Seq::Operon;

use SubOpt;
use G::Messenger;

use LWP::Simple;

use strict;
use base qw(Exporter);
use SelfLoader;

our @EXPORT = qw(
		 set_operon
);


__DATA__

=head1 NAME

G::Seq::Operon - Retrieves operon information for bacterial genomes

=head1 DESCRIPTION

    This class is a part of G-language Genome Analysis Environment, 
    collecting sequence analysis methods related to operons.

=cut


######################################
#         Main Methods               #
######################################


=head2 set_operon

  Name: set_operon   -   set operon information from RegulonDB/DOOR

  Description:
    This program retrieves the operon information from 
      1. RegulonDB for Escherichia coli K12 chromosome
      2. DOOR for other bacteria and plasmids
    and adds it as annotation to the given genome data.

    Up to three attributes are added to each CDS hash.
        $genome->{$cds}->{operon}
    contains the name of the operon to which the gene belongs, and 
        $genome->{$cds}->{operonN}
    contains the rank order of the gene within the operon.
        $genome->{$cds}->{operonEvidence}
    contains evidence of the operon information (available for E.coli only)

  Usage:
    set_operon($gb);

 Options:
   None.

  References:
   1. Gama-Castro S et al. (2008) "RegulonDB (version 6.0): gene regulation model 
      of Escherichia coli K-12 beyond transcription, active (experimental) annotated 
      promoters and Textpresso navigation.", Nucleic Acids Res. 1;36(Database issue):D120-4

   2. Mao F et al. (2009) "DOOR: a database for prokaryotic operons", 
      Nucleic Acids Res. 1;37(Database issue):D459-D463

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
   20100617-01 bug fix for format changes
   20090322-01 added support for DOOR (stopped using Operon Database)
   20090313-01 added support for other organisms using Operon Database
   20090313-02 modified to match latest version of RegulonDB format (6.0)
   20070829-01 patched to match latest version of RegulonDB format
               (patch by Hiroyuki Nakamura <t04632hn@sfc.keio.ac.jp>)
   20061003-01 updated to use data from RegulonDB
   20020207-01 initial posting

=cut


# set_operon($gb)
#
# Annotates the CDS entries of a genome object with operon membership.
#
#   Input:   a genome object (anything accepted by opt_as_gb), read through
#            $gb->{LOCUS}->{id}, $gb->gene2id() and $gb->cds().
#   Output:  the same genome object, with these keys set on CDS hashes:
#              operon          name of the operon the gene belongs to
#              operonN         1-based rank of the gene within its operon,
#                              or 0 when no operon name was assigned
#              operonEvidence  evidence string (RegulonDB branch only)
#   Dies:    when the mirrored data file does not exist after the download
#            attempt, or when it cannot be opened.
#
# The data file is mirrored into $ENV{HOME}/.glang/data/ with
# LWP::Simple::mirror().  The return status of mirror() is deliberately not
# inspected: only the existence of the local file is checked, so a previously
# mirrored copy is still used when the download fails.
sub set_operon {
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);

    my $locus_id = $gb->{LOCUS}->{id};

    if ($locus_id eq 'U00096' || $locus_id eq 'NC_000913'){

        # --- Escherichia coli K-12: operon table from RegulonDB ---
        my $url  = "http://regulondb.ccg.unam.mx:80/data/OperonSet.txt";
        my $path = $ENV{HOME} . '/.glang/data/OperonSet.txt';
        mirror($url, $path);
        die("setOperon: cannot retrieve data from RegulonDB.") unless(-e $path);

        # Counts separator lines seen so far; '#' header lines are echoed
        # through msg_error() only before the first separator.
        my $separators_seen = 0;

        # Three-argument open on a lexical handle: the path is never
        # interpreted as a mode/pipe and no global FILE handle is touched.
        # A lexical line variable keeps the caller's $_ intact.
        open(my $fh, '<', $path) || die($!);
        while (my $line = <$fh>) {
            chomp $line;

            if ($line =~ /_______________________/) {
                $separators_seen++;
                next;
            }
            elsif ($line =~ /^\#(.*)/) {
                msg_error($1, "\n") unless($separators_seen);
                next;
            }

            # Columns: operon name, gene count, direction, gene list, evidence.
            my ($operon, $num, $direction, $genes, $evidence) = split(/\t/, $line, 5);

            # Only operons with two or more genes are annotated; blank or
            # short lines have no count and are skipped here as well.
            next unless(defined $num && $num >= 2);

            # CDS id => start position, for the genes resolved in this operon.
            my %start_of;

            # Each entry of the gene list has the form "gene|locus_tag".
            foreach my $genepair (split(/,/, defined $genes ? $genes : '')){
                my ($gene, $locustag) = split(/\|/, $genepair, 2);

                # Resolve by locus tag first, then fall back to the gene name.
                my $cds = $gb->gene2id($locustag);
                $cds = $gb->gene2id($gene) unless(length $cds);

                if($cds){
                    $gb->{$cds}->{operon}         = $operon;
                    $gb->{$cds}->{operonEvidence} = $evidence;
                    $start_of{$cds}               = $gb->{$cds}->{start};
                }
            }

            # Rank genes by start position: ascending for 'forward' operons,
            # descending for everything else.
            my $sign = (defined $direction && $direction eq 'forward') ? 1 : -1;
            my $rank = 1;
            foreach my $cds (sort { $sign * ($start_of{$a} <=> $start_of{$b}) } keys %start_of){
                $gb->{$cds}->{operonN} = $rank;
                $rank ++;
            }
        }
        close($fh);

    }else{

        # --- Any other replicon: per-accession operon table from DOOR ---
        my $url  = 'http://csbl1.bmb.uga.edu/OperonDB/downloadNCoperon.php?NC_id=' . $locus_id;
        my $path = $ENV{HOME} . '/.glang/data/Operon' . $locus_id . '.txt';
        mirror($url, $path);
        die("No Operon data for this species.\n\n") unless(-e $path);

        # operon name => gene identifiers in file order.
        my %genes_of;

        # The path embeds the LOCUS id taken from the input record, so the
        # three-argument form matters here: the id can never act as an open
        # mode or pipe.
        open(my $fh, '<', $path) || die($!);
        while (my $line = <$fh>) {
            chomp $line;

            # Whitespace-separated columns; the first is the operon name and
            # the third the gene identifier.  The second column is not used.
            my ($operonName, undef, $gene) = split(/\s+/, $line, 4);

            # Skip blank or truncated lines instead of recording an empty
            # operon name or gene, which would create a bogus $gb->{''} entry.
            next unless(defined $operonName && length $operonName);
            next unless(defined $gene && length $gene);

            push(@{$genes_of{$operonName}}, $gene);
        }
        close($fh);

        foreach my $operonName (keys %genes_of){
            my @list = @{$genes_of{$operonName}};

            # NOTE(review): the gene identifier from the file is used directly
            # as a key of $gb; identifiers that are not present in the genome
            # object are autovivified here -- confirm against the DOOR format.
            @list = reverse(@list) if($gb->{$list[0]}->{direction} eq 'complement');

            my $rank = 1;
            foreach my $cds (@list){
                $gb->{$cds}->{operon}  = $operonName;
                $gb->{$cds}->{operonN} = $rank;
                $rank ++;
            }
        }
    }

    # Every CDS left without an operon name gets rank 0.
    foreach my $cds ($gb->cds()){
        $gb->{$cds}->{operonN} = 0 unless(length $gb->{$cds}->{operon});
    }

    return $gb;
}





1;

