#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# This file is part of G-language Genome Analysis Environment package
#
#     Copyright (C) 2001-2013 Keio University
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# 
#   $Id: Tandem.pm,v 1.1.1.1 2002/04/02 20:25:42 gaou Exp $
#
# G-language GAE is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
# 
# G-language GAE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with G-language GAE -- see the file COPYING.
# If not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# 
#END_HEADER
#

package G::Seq::Tandem;

use SubOpt;
use G::Messenger;

use GD;

use strict;
use base qw(Exporter);
use SelfLoader;

our @EXPORT = qw(
		 find_tandem
		 foreach_tandem
);


__DATA__

#::::::::::::::::::::::::::::::
#        Perldoc
#::::::::::::::::::::::::::::::

=head1 NAME

  G::Seq::Tandem - Sequence analysis methods related to tandem repeats.

=head1 DESCRIPTION

           This class is a part of G-language Genome Analysis Environment,
           collecting basic sequence analysis methods.

=cut

#::::::::::::::::::::::::::::::
#        Methods Start
#::::::::::::::::::::::::::::::


#find_tandem ver.20010705-01
#scripting by Koya Mori(s98982km@sfc.keio.ac.jp)
#This program finds tandem repeats in a DNA sequence.
#(arrayref Repeat)=find_tandem(pointer GENOME,  -PatLength=>int,  -output=>"stdout"|"f"|"n",  -filename=>string);
sub find_tandem{
    # Scan a sequence for tandem repeats whose unit is exactly PatLength
    # characters long.
    #
    # Options (read through SubOpt):
    #   -PatLength  unit length to search for (default 3)
    #   -output     "stdout" (default) hands the hits to _print_tandem,
    #               "f" appends them to -filename, anything else is silent
    #   -filename   file used when -output is "f" (default "tandem.csv")
    #
    # Positional arguments:
    #   1. top-level call: an object whose {SEQ} entry holds the sequence;
    #      nested call: a reference to the string to examine
    #   2. nesting flag.  It is only set (to 1) by the calls this sub makes
    #      to itself; ordinary callers leave it out.
    #
    # Returns:
    #   top-level: reference to an array of hashes with the keys pattern,
    #              patlength, repnumber, startpos and endpos (positions are
    #              1-based, endpos is the last base of the run)
    #   nested:    1 or 0.  The top-level scan only counts units for which
    #              the nested call answered 0, i.e. it skips units that the
    #              nested check flags as repeats of something shorter.
    &opt_default(PatLength=>3,output=>"stdout",filename=>"tandem.csv");
    my @args=opt_get(@_);

    # opt_as_gb() is deliberately not used here: nested calls pass a plain
    # string reference instead of a sequence object.
    my $gb=shift @args;
    my $PatLen=opt_val("PatLength");
    my $filename=opt_val("filename");
    my $output=opt_val("output");
    my $SaikiTrup=shift @args;

    # Normalise the nesting flag once so the rest of the sub never has to
    # compare it against an empty string.  The numeric meaning is kept:
    # anything that counts as 0 is a top-level call, exactly 1 is nested.
    my $mode=$SaikiTrup ? $SaikiTrup+0 : 0;
    my $top_level=($mode==0);
    my $nested=($mode==1);

    my $ref_Genome;
    my $Half;
    if($top_level){
        $ref_Genome=\$gb->{SEQ};
        $Half=int($PatLen/2);   # longest unit a pattern could be built from
    }
    else{
        $ref_Genome=$gb;
        $Half=$PatLen-1;        # try the next shorter unit length
    }

    # End of the nested descent: nothing shorter is left to try.
    return 0 if($nested && $PatLen<1);

    my $seqlen=length($$ref_Genome);
    my %PatRst;          # unit => answer of the nested check (1 = skip it)
    my @Repeat;
    my $pattern;         # these two are read again after the loop
    my $RepeatNumber;
    my $nested_result;   # answer of the single deeper call in nested mode

    for(my $pos=0;$pos<=$seqlen-$PatLen;$pos++){
        $RepeatNumber=1;
        $pattern=substr($$ref_Genome,$pos,$PatLen);
        my $NextPattern=substr($$ref_Genome,$pos+$PatLen,$PatLen);

        # Top-level scans only care about windows followed by an identical
        # window; nested scans examine every window.
        next unless($pattern eq $NextPattern || $nested);

        if($top_level){
            if($pattern=~/n/){
                # Units containing "n" are never reported.
                $PatRst{$pattern}=1;
            }
            elsif(!exists $PatRst{$pattern}){
                # Ask once per distinct unit.  Testing for existence (not
                # truth) also caches the answer 0, so a unit is not
                # re-examined every time it occurs.
                $PatRst{$pattern}=find_tandem(\$pattern,-PatLength=>$Half,1);
            }
        }
        else{
            # The deeper call gets the same string and the same length at
            # every position, so its answer cannot change: compute it once.
            $nested_result=find_tandem($ref_Genome,-PatLength=>$Half,1)
                unless defined $nested_result;
            $PatRst{$pattern}=$nested_result;
        }

        last if($PatRst{$pattern}==1 && $nested);
        next unless($PatRst{$pattern}==0);

        # Window directly in front of the current one; empty when the
        # current window starts less than PatLength characters into the
        # sequence.
        my $BeforePattern=($pos-$PatLen>=0)
            ? substr($$ref_Genome,$pos-$PatLen,$PatLen)
            : '';

        # Count the run only when the preceding window differs.
        if($pattern ne $BeforePattern){
            while($pattern eq $NextPattern){
                $RepeatNumber++;
                $NextPattern=substr($$ref_Genome,$pos+$PatLen*$RepeatNumber,$PatLen);
            }
        }
        next if($RepeatNumber==1);

        if($top_level){
            push @Repeat,{
                pattern=>$pattern,
                patlength=>$PatLen,
                repnumber=>$RepeatNumber,
                startpos=>$pos+1,
                endpos=>$pos+$PatLen*$RepeatNumber,
            };
        }

        # Continue right behind the run (the loop adds the final +1).
        $pos=$pos+$PatLen*$RepeatNumber-1;
    }

    if($top_level){
        if($output eq "f"){
            _print_tandem(\@Repeat,-print=>"f",-filename=>$filename);
        }
        if($output eq "stdout"){
            _print_tandem(\@Repeat);
        }
        return \@Repeat;
    }

    # Nested answer: 1 when the last run examined spans as many characters
    # as the whole string, or when the deeper check already answered 1.
    return 1 if($RepeatNumber*$PatLen==$seqlen || $PatRst{$pattern}==1);
    return 0;
}


#foreach_tandem ver.20010629-01
#scripting by Koya Mori(mory@g-language.org)
#This program finds tandem repeats for every pattern length from MaxLength down to MinLength in a sequence.
#(arrayref Result)=foreach_tandem(pointer GENOME,  -MaxLength=>int,  -MinLength=>int,  -output=>"stdout"|"f"|"n",  -filename=>string);
sub foreach_tandem{
    # Run find_tandem for every unit length from MaxLength down to
    # MinLength and merge the hits into one list ordered by start position.
    # Hits from longer units are collected first; a later hit that lies
    # inside an already collected one is dropped, and one that only
    # overlaps is shortened by whole units (or dropped when too little
    # remains).
    #
    # Options (read through SubOpt):
    #   -MaxLength  longest unit length to try (default 3)
    #   -MinLength  shortest unit length to try (default 2)
    #   -output     "stdout" (default) hands the result to _print_tandem,
    #               "f" appends it to -filename, anything else is silent
    #   -filename   file used when -output is "f" (default "tandem.csv")
    #
    # Positional argument: the sequence, passed through opt_as_gb().
    #
    # Returns a reference to an array of hashes with the keys pattern,
    # patlength, repnumber, startpos and endpos.
    &opt_default(MaxLength=>3,MinLength=>2,output=>"stdout",filename=>"tandem.csv");
    my @args=opt_get(@_);

    # All options are read before find_tandem is called, because that call
    # parses its own option list.
    my $gb=opt_as_gb(shift @args);
    my $MaxLen=opt_val("MaxLength");
    my $MinLen=opt_val("MinLength");
    my $output=opt_val("output");
    my $filename=opt_val("filename");
    my @Result;

    for(my $len=$MaxLen;$len>$MinLen-1;$len--){
        my $Repeat=find_tandem($gb,-PatLength=>$len,-output=>"n");
        my $q=0;   # cursor into @Result, carried over from hit to hit

        foreach my $rep (@$Repeat){
            my $start=$rep->{startpos};
            my $end=$rep->{endpos};
            my $drop=0;

            while(defined($Result[$q])){
                my $hit=$Result[$q];
                # Fetch the successor as a plain element.  Going straight
                # to $Result[$q+1]{...} would create an empty record at the
                # tail of @Result whenever $q is the last index.
                my $next=$Result[$q+1];

                if($hit->{startpos}<=$start && $hit->{endpos}>=$end){
                    # Completely inside an accepted repeat.
                    $drop=1;
                    last;
                }
                elsif($hit->{startpos}<=$start && $hit->{endpos}>=$start
                      && defined($next)
                      && $next->{startpos}<=$end && $next->{endpos}>=$end){
                    # Overlapped on both sides: shorten front and back.
                    $drop=1 unless _tandem_trim_start($rep,$hit->{endpos});
                    $drop=1 unless _tandem_trim_end($rep,$next->{startpos});
                    last;
                }
                elsif($hit->{startpos}<=$start && $hit->{endpos}>=$start && $hit->{endpos}<$end){
                    # Accepted repeat covers the front only.
                    $drop=1 unless _tandem_trim_start($rep,$hit->{endpos});
                    last;
                }
                elsif($hit->{startpos}<=$end && $hit->{endpos}>=$end && $hit->{startpos}>$start){
                    # Accepted repeat covers the back only.
                    $drop=1 unless _tandem_trim_end($rep,$hit->{startpos});
                    last;
                }
                elsif($hit->{startpos}>$end){
                    # Accepted repeat starts behind this hit: stop looking.
                    last;
                }
                else{
                    $q++;
                }
            }

            push @Result,$rep unless $drop;

            # Step back one entry so the next hit is compared with the
            # same accepted repeat again, but never below 0: a negative
            # index would address @Result from its tail (or nothing at
            # all) and the overlap check would be skipped.
            $q-- if($q>0);
        }

        # Hits of this pass were appended at the tail; restore the
        # start-position order before the next pass.
        @Result=sort{$a->{startpos} <=> $b->{startpos}}@Result;
    }

    if($output eq "f"){
        _print_tandem(\@Result,-print=>"f",-filename=>$filename);
    }
    if($output eq "stdout"){
        _print_tandem(\@Result);
    }

    return \@Result;
}

# Shorten a repeat record at its front, one whole unit at a time, until
# its start is no longer below $limit.  The record is only modified when
# more than one unit length remains between the new start and the end;
# returns 1 when it was shortened and 0 when the caller should drop it.
sub _tandem_trim_start{
    my ($rep,$limit)=@_;
    my $unit=$rep->{patlength};
    my $cut=0;

    $cut++ while($cut*$unit+$rep->{startpos}<$limit);

    if($rep->{endpos}-$rep->{startpos}-$cut*$unit>$unit){
        $rep->{repnumber}=$rep->{repnumber}-$cut;
        $rep->{startpos}=$rep->{startpos}+$cut*$unit;
        return 1;
    }
    return 0;
}

# Shorten a repeat record at its back, one whole unit at a time, until
# its end is no longer above $limit.  The record is only modified when
# more than one unit length remains between the start and the new end;
# returns 1 when it was shortened and 0 when the caller should drop it.
sub _tandem_trim_end{
    my ($rep,$limit)=@_;
    my $unit=$rep->{patlength};
    my $cut=0;

    $cut++ while($rep->{endpos}-$cut*$unit>$limit);

    if($rep->{endpos}-$rep->{startpos}-$cut*$unit>$unit){
        $rep->{repnumber}=$rep->{repnumber}-$cut;
        $rep->{endpos}=$rep->{endpos}-$cut*$unit;
        return 1;
    }
    return 0;
}


#_print_tandem ver.20010629-01
#scripting by Koya Mori(mory@g-language.org)
#This program prints the result of find_tandem.
#_print_tandem(pointer TANDEM,  -print=>"f"|"n",  -filename=>string);
sub _print_tandem{
    # Write a list of tandem-repeat records either to a file or through
    # msg_send.
    #
    # Positional argument: reference to an array of hashes with the keys
    # pattern, patlength, repnumber, startpos and endpos.
    #
    # Options (read through SubOpt):
    #   -print     "f" appends comma-separated lines to -filename; any
    #              other value (default "n") sends tab-separated lines
    #              through msg_send
    #   -filename  file to append to (default "tandem.csv")
    #
    # Returns 1 when the records were written, nothing when the output
    # file could not be opened (a warning is issued in that case).
    &opt_default(print=>"n",filename=>"tandem.csv");
    my @args=opt_get(@_);

    my $Repeat=shift @args;
    my $print=opt_val("print");
    my $filename=opt_val("filename");

    if($print eq "f"){
        # Three-argument open with a lexical handle: the file name is
        # never interpreted as part of the mode, and a failure is reported
        # instead of silently discarding every line.
        open(my $fh,'>>',$filename) or do{
            warn "_print_tandem: cannot open '$filename' for appending: $!\n";
            return;
        };
        foreach my $rep (@$Repeat){
            print {$fh} "$rep->{pattern},$rep->{patlength},$rep->{repnumber},$rep->{startpos},$rep->{endpos}\n";
        }
        # Two blank lines separate this batch from the next one appended.
        print {$fh} "\n\n";
        # Buffered write errors only surface when the handle is closed.
        close($fh) or warn "_print_tandem: error closing '$filename': $!\n";
    }
    else{
        foreach my $rep (@$Repeat){
            &msg_send("$rep->{pattern}\t$rep->{patlength}\t$rep->{repnumber}\t$rep->{startpos}..$rep->{endpos}\n");
        }
    }

    return 1;
}


1;
