#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# This file is part of G-language Genome Analysis Environment package
#
#     Copyright (C) 2001-2007 Keio University
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# 
#   $Id: PatSearch.pm,v 1.3 2002/07/30 17:40:56 gaou Exp $
#
# G-language GAE is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
# 
# G-language GAE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with G-language GAE -- see the file COPYING.
# If not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# 
#END_HEADER
#

package G::Seq::PatSearch;

use SubOpt;
use G::Messenger;
use G::Tools::Graph;
use G::Seq::Primitive;

use strict;
use SelfLoader;
require Exporter;

our @ISA = qw(Exporter);

our @EXPORT = qw(
	     palindrome
	     oligomer_search
	     oligomer_counter
	     find_ter
	     find_dnaAbox
	     find_dif
	     baseParingTest
	     nucleotide_periodicity
);

=head1 NAME

  G::Seq::PatSearch - component of G-language Genome Analysis Environment

=head1 DESCRIPTION

    This class is a part of G-language Genome Analysis Environment, 
    collecting sequence analysis methods related to pattern searches
    for oligonucleotides.

=head1 AUTHOR
    
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

=head1 SYNOPSIS

=cut

#__DATA__

#::::::::::::::::::::::::::::::
#        Methods Start
#::::::::::::::::::::::::::::::

=head2 palindrome

 Name: palindrome   -   searches palindrome sequences

  Description:
    Searches palindrome sequences

 Usage: 
    palindrome(sequence); 

 Options:
    -shortest shortest palindrome to search (default:4)
    -loop     longest stem loop to allow (default: 0)
    -gtmatch  if 1, allows g-t match (default: 0)
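    -output   "f" for file output (default: stdout)
    -filename name of the output file used with -output=>"f" (default: palindrome.csv)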
    
  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20010829-01 initial posting

=cut




sub palindrome {
    # Scans a sequence for palindromes (inverted repeats whose two arms
    # base-pair with each other), optionally separated by a loop of up to
    # -loop bases, and reports every hit of at least -shortest bases.
    #
    # Arguments: the sequence (passed through opt_as_gb), followed by the
    #            options -shortest, -loop, -gtmatch, -output ("stdout" or "f")
    #            and -filename (CSV written when -output is "f").
    # Returns:   a hash reference mapping the 0-based start offset of each hit
    #            to the string "left-arm loop right-arm".  A later hit with the
    #            same start offset replaces the earlier one.
    opt_default(gtmatch=>0, loop=>0, shortest=>4, -output=>"stdout", -filename=>"palindrome.csv");
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);

    # Read every option once, up front, instead of on each loop iteration.
    my $shortest = opt_val("shortest");
    my $max_loop = opt_val("loop");
    my $gtmatch  = opt_val("gtmatch");
    my $output   = opt_val("output");
    my $filename = opt_val("filename");

    my $half    = int($shortest / 2);
    my $seq_len = length($gb->{SEQ});

    my %palindrome;
    my $out_fh;

    if ($output eq "f"){
        open($out_fh, '>', $filename) || &msg_error("G::Seq::PatSearch::palindrome() $! $filename");
        print {$out_fh} "Length, start, end, sequence\n";
    }

    # $i is the innermost base of the left arm.  Never start below offset 0:
    # a negative offset would make substr() count from the end of the
    # sequence and pair unrelated bases.
    my $first = $half > 0 ? $half - 1 : 0;

    for (my $i = $first; $i <= $seq_len - 1 - $half - $max_loop; $i++){

        # Try every loop size from the largest allowed down to none.
        for (my $stem = $max_loop; $stem >= 0; $stem--){
            my $j = $i;               # walks outwards to the left
            my $k = $stem + 1 + $i;   # walks outwards to the right
            my $len = 0;              # number of paired bases found so far
            last if ($k > $seq_len - 1);

            while (baseParingTest(substr($gb->{SEQ}, $j, 1),
                                  substr($gb->{SEQ}, $k, 1),
                                  $gtmatch))
            {
                # Count the pair that just matched BEFORE checking the
                # sequence bounds; otherwise a palindrome that reaches either
                # end of the sequence loses its outermost pair.
                $len += 2;
                $j--;
                $k++;
                last if ($j < 0 || $k > $seq_len - 1);
            }

            next if ($len < $shortest);

            my $arm   = $len / 2;
            my $start = $j + 1;
            my $hit   = sprintf("%s %s %s",
                                substr($gb->{SEQ}, $start, $arm),
                                substr($gb->{SEQ}, $start + $arm, $stem),
                                substr($gb->{SEQ}, $start + $arm + $stem, $arm));

            # NOTE(review): the last paired base sits at offset $k - 1; the
            # reported end is kept as $k - 2 so the output format does not
            # change -- confirm which coordinate convention is intended.
            if ($output eq 'stdout'){
                msg_send(sprintf("Length: %2d Position: %7d %7d Sequence: %s\n",
                                 $len, $start, $k - 2, $hit));
            }

            if ($output eq "f"){
                printf {$out_fh} "%d,%d,%d,%s\n", $len, $start, $k - 2, $hit;
            }

            $palindrome{$start} = $hit;
        }
    }
    close($out_fh) if ($output eq "f");

    return \%palindrome;
}





=head2 find_dif

 Name: find_dif   -   finds dif sequence (chromosome partitioning site recognized by XerCD)

  Description:
    Finds dif sequence (chromosome partitioning site recognized by XerCD) in both strands.
    dif is a 28bp sequence element recognized by XerCD located near the replication
    terminus used for chromosome dimer resolution by recombination.

    For E. coli,              5'-GGTGCGCATAATGTATATTATGTTAAAT-3', (Blakely and Sherratt, 1994)
    for Proteobacteria,       5'-RNTKCGCATAATGTATATTATGTTAAAT-3', (Hendrickson and Lawrence, 2007)
    for B. subtilis,          5'-ACTTCCTAGAATATATATTATGTAAACT-3', (Sciochetti et al., 2001)
    for Firmicute,            5'-ACTKYSTAKAATRTATATTATGTWAACT-3', (Hendrickson and Lawrence, 2007)
    for Actinobacteria,       5'-TTSRCCGATAATVNACATTATGTCAAGT-3'. (Hendrickson and Lawrence, 2007)
    
  Usage: 
    @position = find_dif($sequence)
    
  Options:
    -type    ecoli for E.coli dif (default)
             proteobacteria, bsub, firmicute, actinobacteria, for corresponding dif sequences.
    
  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
    
  History:
    20060711-01 initial posting

=cut



sub find_dif{
    # Searches both strands of a sequence for the dif site.
    #
    # Arguments: the sequence (passed through opt_as_gb), followed by options
    #            -type   ecoli (default), proteobacteria, bsub, firmicute or
    #                    actinobacteria; any other value falls back to ecoli
    #            -output "stdout" (default) to print "position match" lines
    # Returns:   the positions of all hits in ascending numeric order
    #            (the number of hits in scalar context).
    opt_default('output'=>'stdout', -type=>'ecoli');
    my @argv = opt_get(@_);
    my $gb = opt_as_gb(shift @argv);
    my $output = opt_val("output");

    # The option is named "type"; "ecoli" is only its default value.
    my $type = lc(opt_val('type'));

    # The actinobacteria pattern follows the consensus quoted in the POD for
    # this function (...ATTATGTCAAGT).
    my %dif_of = (
        ecoli          => 'ggtgcgcataatgtatattatgttaaat',
        proteobacteria => 'rntkcgcataatgtatattatgttaaat',
        bsub           => 'acttcctagaatatatattatgtaaact',
        firmicute      => 'actkystakaatrtatattatgtwaact',
        actinobacteria => 'ttsrccgataatvnacattatgtcaagt',
    );
    my $dif = exists $dif_of{$type} ? $dif_of{$type} : $dif_of{ecoli};

    # The parentheses are essential: "my %data = A, B;" parses as
    # "(my %data = A), B" and silently throws away the second search.
    my %data = (
        oligomer_search($gb, $dif, -return=>"both"),
        oligomer_search($gb, complement($dif), -return=>"both"),
    );

    # Positions are numbers, so sort them numerically rather than as strings.
    my @positions = sort { $a <=> $b } keys %data;

    if($output eq 'stdout'){
        foreach my $pos (@positions){
            msg_send(sprintf("%d %s\n", $pos, $data{$pos}));
        }
    }

    return @positions;
}




=head2 find_ter

 Name: find_ter   -   finds ter sequence (replication termination site)

  Description:
    Finds ter sequence (replication termination site, recognized by Ter protein) in both strands.

    For E. coli,              5'-AGNATGTTGTAAYKAA-3',              (Coskun-Ari and Hill, 1997)
    for B. subtilis,          5'-KMACTAANWNNWCTATGTACYAAATNTTC-3', (Wake, 1997)
    
  Usage: 
    @position = find_ter($sequence)
    
  Options:
    -type    ecoli for E.coli ter (default)
             bsub, for corresponding ter sequence.
    
  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
    
  History:
    20071022-01 initial posting

=cut



sub find_ter{
    # Searches both strands of a sequence for the ter site.
    #
    # Arguments: the sequence (passed through opt_as_gb), followed by options
    #            -type   ecoli (default) or bsub; any other value falls back
    #                    to ecoli
    #            -output "stdout" (default) to print "position match" lines
    # Returns:   the positions of all hits in ascending numeric order
    #            (the number of hits in scalar context).
    opt_default('output'=>'stdout', -type=>'ecoli');
    my @argv = opt_get(@_);
    my $gb = opt_as_gb(shift @argv);
    my $output = opt_val("output");

    # The option is named "type"; "ecoli" is only its default value.
    my $type = lc(opt_val('type'));

    my $ter = ($type eq 'bsub') ? 'kmactaanwnnwctatgtacyaaatnttc'
                                : 'agnatgttgtaaykaa';

    # The parentheses are essential: "my %data = A, B;" parses as
    # "(my %data = A), B" and silently throws away the second search.
    my %data = (
        oligomer_search($gb, $ter, -return=>"both"),
        oligomer_search($gb, complement($ter), -return=>"both"),
    );

    # Positions are numbers, so sort them numerically rather than as strings.
    my @positions = sort { $a <=> $b } keys %data;

    if($output eq 'stdout'){
        foreach my $pos (@positions){
            msg_send(sprintf("%d %s\n", $pos, $data{$pos}));
        }
    }

    return @positions;
}




=head2 find_dnaAbox

 Name: find_dnaAbox   -   finds dnaA box in both strands

  Description:
    Finds dnaA box(TT A/T TNCACA) in both strands.
    
  Usage: 
    @positions = find_dnaAbox($genome)
    
  Options:
    -output    stdout to print data (default: stdout)
    
  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
    
  History:
    20071022-01 updated the code to use oligomer_search()
    20021125-01 initial posting

=cut



sub find_dnaAbox {
    # Searches both strands of a sequence for the dnaA box (ttwtncaca).
    #
    # Arguments: the sequence (passed through opt_as_gb), followed by option
    #            -output "stdout" (default) to print "position match" lines
    # Returns:   the positions of all hits in ascending numeric order
    #            (the number of hits in scalar context).
    opt_default('output'=>'stdout');
    my @argv = opt_get(@_);
    my $gb = opt_as_gb(shift @argv);
    my $output = opt_val("output");
    my $dnaAbox = "ttwtncaca";

    # The parentheses are essential: "my %data = A, B;" parses as
    # "(my %data = A), B" and silently throws away the second search.
    my %data = (
        oligomer_search($gb, $dnaAbox, -return=>"both"),
        oligomer_search($gb, complement($dnaAbox), -return=>"both"),
    );

    # Positions are numbers, so sort them numerically rather than as strings.
    my @positions = sort { $a <=> $b } keys %data;

    if($output eq 'stdout'){
        foreach my $pos (@positions){
            msg_send(sprintf("%d %s\n", $pos, $data{$pos}));
        }
    }

    return @positions;
}



=head2 oligomer_counter

 Name: oligomer_counter   -   counts the number of given oligomers in a sequence

  Description:
    Counts the number of oligomers in a sequence (by windows optionally).
    Oligomer can be specified using degenerate nucleotide alphabet, or by 
    regular expressions.

  Usage: 
    $count = oligomer_counter($genome, $oligomer);

 Options:
    -window      int window size.
                 If specified, seeks oligomer in specified windows
                 Method returns an array of numbers at each windows
                 If not specified, seeks oligomer in the genome
                 Method returns the number of oligomers
    -output      "f" for file output, "g" for graph output
                 Only available when -window option is specified

  Author: 
    Kazuharu Arakawa
    -based on atg7.wind + gcwind [rsaito]

  History:
    20071022-01 oligomer can be now degenerate nucleotide code or regular expressions
    20010829-01 initial posting

=cut




sub oligomer_counter {
    # Counts occurrences of an oligomer, either over the whole sequence or in
    # consecutive non-overlapping windows.
    #
    # Arguments: the sequence (passed through opt_as_gb), the oligomer, then
    #            -window  window size; when 0 (default) the whole sequence is
    #                     counted and a single number is returned
    #            -output  "f" writes oligo_count.csv, "g" draws a graph
    #                     (windowed mode only)
    # Returns:   without -window, the number of hits; with -window, a list
    #            holding one count per complete window.
    opt_default("window"=>0);
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);
    my $seq = shift @args;
    my $requested = opt_val("window");
    my $output = opt_val("output");
    $output = '' unless defined $output;   # -output has no default value

    # No window requested: a single count over the whole sequence.
    return scalar(oligomer_search($gb, $seq)) unless $requested;

    my $seq_len = length($gb->{SEQ});
    my $window  = ($requested <= 0) ? $seq_len : $requested;

    my $out_fh;
    if ($output eq "f"){
        open($out_fh, '>', 'oligo_count.csv') || &msg_error($!);
    }

    my @wincount = ();
    my @winnum = ();
    for (my $i = 0; $i <= int($seq_len / $window); $i++){
        my $partial = substr($gb->{SEQ}, $i * $window, $window);
        last if (length($partial) < $window);   # ignore a trailing partial window

        my $count = 0;
        if (length($seq) == 1 && $seq =~ /[atgc]/){
            # Single unambiguous base: tr/// counts without a regex search.
            $count = $partial =~ tr/a/a/ if ($seq eq 'a');
            $count = $partial =~ tr/t/t/ if ($seq eq 't');
            $count = $partial =~ tr/g/g/ if ($seq eq 'g');
            $count = $partial =~ tr/c/c/ if ($seq eq 'c');
        }else{
            $count = scalar(oligomer_search($partial, $seq));
        }
        push (@wincount, $count);
        push (@winnum, $i * $window);

        # printf, not print: the first argument is a format string.
        printf {$out_fh} "%d,%d\n", $i * $window, $count if ($output eq "f");
    }
    close($out_fh) if ($output eq "f");

    if ($output eq "g"){
        # NOTE(review): nucleotide_periodicity() passes -filename to the
        # grapher while this call passes -outfile; confirm which option name
        # the grapher honours.
        _UniMultiGrapher(\@winnum, \@wincount, -x=>'window(bp)',
                         -y=>'number of oligomer',
                         -title=>'oligomer by window',
                         -outfile=>'oligo_count.png'
                         );
    }
    return (@wincount);
}


=head2 oligomer_search

 Name: oligomer_search   -   searches oligomers in given sequence

  Description:
    Searches for the given oligomer in given sequence. Oligomer can be 
    specified using degenerate nucleotide alphabet, or by regular expressions.
    Performance is optimized for fast searching.

    This method changes the returning value according to the given options.

  Usage:
    @positions = oligomer_search($genome, $oligomer);

    @oligomers = oligomer_search($genome, $oligomer, -return=>"oligo");

    %positions_to_oligomers = oligomer_search($genome, $oligomer, -return=>"both");

    ($number_direct, $number_complement, $number_total, $ratio_direct) = 
           oligomer_search($genome, $oligomer, -return=>"distribution");

    $oligomer can be degenerate nucleotide alphabet or regular expressions.
         ex:  "grtggngg" (degenerate code), or "g[ag]tgg[a-z]gg" (regular expression)
 
 Options:
    -return   "position" to return list of positions where oligomers are found (default),
              "oligo" to return list of oligomers found ordered by positions,
              "both" to return a hash with positions as keys and oligomers as values,
              "distribution" to return four values (see above) about the distribution 
               of given oligomer.

  Author:
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
   
  History:
    20071022-01 initial posting

=cut

sub oligomer_search{
    # Searches a sequence for an oligomer given as plain bases, as degenerate
    # nucleotide code, or as a regular expression.
    #
    # Arguments: the sequence (passed through opt_as_gb), the oligomer, then
    #            -return "position" (default), "oligo", "both" or
    #                    "distribution"
    # Returns:   "position"     list of 0-based match offsets
    #            "oligo"        list of matched strings
    #            "both"         flat list of offset => matched string pairs
    #            "distribution" (direct hits, complement hits, total,
    #                            direct / total); the ratio is 0 when
    #                            nothing was found
    #            In scalar context the list forms yield their element count.
    #
    # NOTE(review): the plain-base "position" search below reports overlapping
    # hits, whereas the regex search reports non-overlapping hits only;
    # confirm whether that difference is intended.
    opt_default(return=>"position");
    my @argv = opt_get(@_);
    my $gb = opt_as_gb(shift @argv);
    my $oligo = lc(shift @argv);
    my $return = opt_val("return");

    my @result;

    # An empty pattern would make the index() loop below spin forever.
    return @result if ($oligo eq '' && $return ne 'distribution');

    if($oligo !~ /[^atgc]/ && $return eq 'position'){
        # Fast path for plain a/t/g/c strings.  Start at -1 so that the first
        # index() call searches from offset 0 and a hit at the very beginning
        # of the sequence is not skipped.
        my $start = -1;
        while(0 <= ($start = index($gb->{SEQ}, $oligo, $start + 1))){
            push(@result, $start);
        }
        return @result;
    }elsif($return eq 'distribution'){
        my $direct = scalar(oligomer_search($gb, $oligo));
        my $comp   = scalar(oligomer_search($gb, complement($oligo)));
        my $total  = $direct + $comp;

        # Avoid a division by zero when the oligomer occurs on neither strand.
        my $ratio  = $total ? $direct / $total : 0;
        return ($direct, $comp, $total, $ratio);
    }

    # A purely alphabetic pattern is treated as degenerate nucleotide code and
    # expanded into character classes; anything else is used as a regular
    # expression as given.
    unless($oligo =~ /[^a-z]/){
        my %class_of = (
            r => '[ag]',  k => '[gt]',  s => '[gc]',  y => '[ct]',
            m => '[ac]',  w => '[at]',  b => '[gct]', h => '[act]',
            n => '[a-z]', d => '[agt]', v => '[acg]',
        );
        $oligo =~ s/([rksymwbhndv])/$class_of{$1}/g;
    }

    while($gb->{SEQ} =~ m/($oligo)/g){
        if($return eq 'oligo'){
            push(@result, $1);
        }elsif($return eq 'both'){
            push(@result, $-[1], $1);
        }else{
            push(@result, $-[1]);
        }
    }

    return @result;
}





=head2 baseParingTest

 Name: baseParingTest   -   checks if the two bases form a pair

  Description:
    Base pairing check. Returns 1 if the two bases pair, and 0 if they do not.
    G-T pairs are also accepted when the third argument is true.

  Usage: 
    boolean $match = baseParingTest(char $first, char $second, boolean $gtmatch);
    
  Options:
    none
    
  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
    
  History:
    20010829-01 initial posting

=cut



sub baseParingTest {
    # Reports whether two single bases can pair with each other.
    #
    # Arguments: two one-character bases (case-insensitive) and a flag that,
    #            when true, also accepts g-t / t-g pairs.
    # Returns:   1 when the bases pair, 0 otherwise.
    # Dies when either of the first two arguments is not exactly one
    # character long.
    my ($left, $right, $allow_gt) = @_;
    $left  = lc($left);
    $right = lc($right);

    unless (length($left) == 1 && length($right) == 1){
        die("First two arguments must be single base (i.e. a, t, g, or c).\n");
    }

    my $pair = $left . $right;

    # a-t and g-c pairs are always accepted.
    my %standard = (at => 1, ta => 1, gc => 1, cg => 1);
    return 1 if $standard{$pair};

    # g-t pairs count only when the caller asks for them.
    my %gt_pair = (tg => 1, gt => 1);
    return 1 if ($allow_gt && $gt_pair{$pair});

    return 0;
}



=head2 nucleotide_periodicity

 Name: nucleotide_periodicity   -   checks the periodicity of certain oligonucleotides

  Description:
    Checks the periodicity of certain nucleotide (best known with AA dinucleotide)
    
  Usage: 
    array data = nucleotide_periodicity(sequence);
    
  Options:
    -nucleotide    nucleotide to search (default:aa)
    -window        window size to seek periodicity (default:50)
    -filename      output filename (default:aa_frequency.png)
    -output        "g" for graph file output only,
                   "show" for graph file output and display.
                   (default: show)
    
  ToDo:
    data output

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
    
  History:
    20070206-01 initial posting

=cut



sub nucleotide_periodicity {
    # Tallies, for every occurrence of an oligonucleotide, the distances at
    # which the same oligonucleotide occurs again within the following window,
    # then graphs the tally.
    #
    # Arguments: the sequence (passed through opt_as_gb), followed by options
    #            -nucleotide oligonucleotide to look for (default: aa)
    #            -window     number of downstream bases examined (default: 50)
    #            -filename   graph file name (default: aa_frequency.png)
    #            -output     "show" (default) also displays the graph
    # Returns:   a list with one counter per distance 0 .. window - 1.
    opt_default("nucleotide"=>"aa", "window"=>50, "filename"=>"aa_frequency.png", "output"=>"show");
    my @argv = opt_get(@_);
    my $gb = opt_as_gb(shift @argv);
    my $nuc = opt_val("nucleotide");
    my $window = opt_val("window");
    my $filename = opt_val("filename");
    my $output = opt_val("output");
    my @data = (0) x $window;

    # An empty pattern would make the index() loops below spin forever.
    if (defined($nuc) && length($nuc)){
        my $start = -1;
        while(0 <= ($start = index($gb->{SEQ}, $nuc, $start + 1))){
            # Look only at the window that begins right after this occurrence.
            my $localSeq = substr($gb->{SEQ}, $start + length($nuc), $window);
            my $innerPos = -1;
            while(0 <= ($innerPos = index($localSeq, $nuc, $innerPos + 1))){
                $data[$innerPos]++;
            }
        }
    }

    # Sub names are case-sensitive: the grapher is spelled _UniMultiGrapher,
    # as in oligomer_counter().
    _UniMultiGrapher([0..($window - 1)], \@data, -filename=>$filename);
    msg_gimv("graph/$filename") if ($output eq 'show');

    return @data;
}



1;


