#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# This file is part of G-language Genome Analysis Environment package
#
#     Copyright (C) 2001-2016 Keio University
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# 
# G-language GAE is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
# 
# G-language GAE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with G-language GAE -- see the file COPYING.
# If not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# 
#END_HEADER
#
# written by Kazuharu Arakawa <gaou@sfc.keio.ac.jp> at
# G-language Project, Institute for Advanced Biosciences, Keio University.
#

package Rcmd::Clustering;

use strict;
use autouse 'Carp'=>qw(croak);
use base qw(Exporter);

use SubOpt;
use G::Messenger;

our @EXPORT = qw(
		 hclust
		 som
		 kmeans
		 );

#:::::::::::::::::::::::::::::::::
#       Perldoc
#:::::::::::::::::::::::::::::::::


=head1 NAME

  Rcmd::Clustering - Interfaces to clustering algorithms of R language.

=head1 DESCRIPTION

    This class is a part of G-language Genome Analysis Environment, 
    collecting interfaces to clustering algorithms of R language.

=cut


#::::::::::::::::::::::::::::::
#    Let the code begin...
#::::::::::::::::::::::::::::::

# Hand the clustering input over to R under the name 'rclust'.
#
# Invoked as a method on the R command object. The positional arguments
# returned by opt_get() are array references: the first one is sent with
# $rcmd->array() as 'rclust', every later one is sent as 'tmp' and stacked
# onto 'rclust' with rbind(), so each Perl array becomes one row.
# When a true -label value is supplied it is dereferenced as an array,
# sent with $rcmd->sarray() as 'label' and installed as the row names.
sub set_clust_data{
    my $rcmd = shift;

    my @series = opt_get(@_);
    my $names  = opt_val("label") || '';

    for my $idx (0 .. $#series){
        if ($idx){
            # Second and later series: scratch variable, then append.
            $rcmd->array('tmp', @{ $series[$idx] });
            $rcmd->exec('rclust <- rbind(rclust, tmp)');
            next;
        }

        # The very first series creates 'rclust' itself.
        $rcmd->array('rclust', @{ $series[$idx] });
    }

    if (length $names){
        $rcmd->sarray('label', @$names);
        $rcmd->exec('dimnames(rclust) <- list(label, NULL)');
    }
}

# Replace 'rclust' in the R session with a random demonstration data set:
# two two-column matrices of normally distributed values (sd 0.3, the
# second one with mean 1) bound together row-wise.
# Invoked as a method on the R command object; the value of
# $rcmd->exec() is passed through to the caller.
sub sample_data_for_clustering{
    my $rcmd = shift;

    my @blobs = (
        'matrix(rnorm(100,sd=0.3),ncol=2)',
        'matrix(rnorm(100,mean=1,sd=0.3),ncol=2)',
    );

    $rcmd->exec('rclust<-rbind(' . join(',', @blobs) . ')');
}


=head2 som()

  Name: som()   -   clustering using Self-Organizing Map

  Description:
    Clustering with Self-Organizing Map (SOM) using R language.
    Installation of GeneSOM library for R language is required.
        run R as a super user - sudo R - and type the following:
        install.packages('som')
    
    Returns a two-dimensional array corresponding to the
    result$visual of som() in R's GeneSOM library.

  Usage:
    @result = som(\@array1, \@array2, \@array3, ..., -label=>\@label);

    Arrays correspond to the columns (data series), and labels for each of
    these arrays can be given by -label option.

  Options:
   -label        labels or names of the data series.
   -xdim         x-dimension of the map (default: 3)
   -ydim         y-dimension of the map (default: 3)
   -filename     output filename of the graph (default: som.pdf)
   -output       output toggle option (default: show)
                 "g" to generate graph without displaying.
   -sampledata   use sample data (default: 0)

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
   20141110-01 exported by default
   20070612-01 converted to Rcmd::Clustering
   20030904-01 initial posting (G::Tools::RCluster)

=cut


# som(\@array1, \@array2, ..., -option => value, ...)
#
# Clusters the given data series with a Self-Organizing Map through the
# R 'som' package. Returns a list of [x, y, qerror] array references
# parsed from the rclust.som$visual table that R prints into the session
# log (an empty list if no such table is found).
#
# Options (handled through SubOpt):
#   -label       array reference of row labels, forwarded to set_clust_data
#   -xdim        x-dimension of the map (default: 3)
#   -ydim        y-dimension of the map (default: 3)
#   -topo        map topology, 'hexa' or 'rect' (default: hexa)
#   -neigh       neighbourhood function, 'gaussian' or 'bubble'
#                (default: gaussian)
#   -filename    name of the PDF written below ./graph (default: som.pdf)
#   -output      'show' plots and displays the graph, 'g' only plots it
#   -sampledata  true to cluster the built-in random sample data instead
#
# Unrecognised -topo / -neigh values fall back to their defaults.
# Dies if the R session log cannot be opened.
sub som{
    my $rcmd = Rcmd->new();
    $rcmd->set_mode('tmp');

    opt_default(filename=>"som.pdf", output=>"show", xdim=>3, ydim=>3, sampledata=>0, topo=>'hexa', neigh=>'gaussian');
    my @args       = opt_get(@_);
    my $xdim       = opt_val("xdim");
    my $ydim       = opt_val("ydim");
    my $filename   = opt_val("filename");
    my $output     = opt_val("output");
    my $sampledata = opt_val("sampledata");
    my $label      = opt_val("label") || '';
    my $topo       = opt_val('topo');
    my $neigh      = opt_val('neigh');

    # Both values are spliced into R source below, so only known keywords
    # are let through; anything else keeps the historical behaviour.
    $topo  = 'hexa'
        unless defined($topo)  && grep { $_ eq $topo }  qw(hexa rect);
    $neigh = 'gaussian'
        unless defined($neigh) && grep { $_ eq $neigh } qw(gaussian bubble);

    if ($sampledata){
        $rcmd->sample_data_for_clustering();
    }else{
        $rcmd->set_clust_data(@args, -label=>$label);
    }

    $rcmd->exec(
                'require(som)',
                "rclust.som<-som(rclust, $xdim, $ydim, topo='$topo', neigh='$neigh')",
                'rclust.som$visual'
                );

    # Collect the printed $visual table from the session log: skip ahead
    # to the header line naming the 'qerror' column, then take every later
    # line that starts with a digit (the row name) as one data row.
    my @result;
    open(my $log_fh, '<', $rcmd->{log})
        or die("Cannot open R log '$rcmd->{log}': $!");
    my $in_table = 0;
    while (my $line = <$log_fh>){
        unless ($in_table){
            $in_table = 1 if $line =~ /qerror/;
            next;
        }

        chomp $line;
        next unless $line =~ /^\d/;

        # First field is the row name printed by R; it is discarded.
        my (undef, $x, $y, $qerror) = split(/\s+/, $line, 4);
        push(@result, [$x, $y, $qerror]);
    }
    close($log_fh);

    if ($output =~ /g/ || $output =~ /show/){
        $rcmd->exec(
                    "pdf('./graph/".$filename."')",
                    'plot(rclust.som)'
                    );

        msg_gimv("graph/$filename") if ($output =~ /show/);
    }

    $rcmd->set_mode();
    return @result;
}






=head2 hclust

  Name: hclust   -   Hierarchical clustering analysis for given arrays

  Descriptions:
    Hierarchical clustering analysis methods for given arrays.

    The 'ward' method is passed to R as 'ward.D2'.

    Installation of amap library for R language is required.
      run R as a super user - sudo R - and type the following:
        install.packages('amap')

  Usage:
    hclust(\@array1_of_values, \@array2_of_values, ...);
      or
    hclust(\@array1_of_values, \@array2_of_values, ..., -label => \@grouping_label);

 Options:
   -output       output toggle option (default: show)
                 "g" to generate graph without displaying.
   -filename     output filename of the clustering graph (default: hclust.pdf)
   -method       the agglomeration method to be used (default: ward).
                   'ward', 'single', 'complete', 'average', 'centroid', 'median' or 'mcquitty'
   -distmethod   the distance measure method (default: correlation)
                   'euclidean', 'maximum', 'manhattan', 'canberra', 'binary', 'kendall'
                   'spearman', 'pearson' (not centered Pearson), 'abspearson' (Absolute Pearson),
                   'correlation' (Centered Pearson) or 'abscorrelation' (Absolute correlation)
                 this option is based on 'Dist' method in 'amap' library in R.
   -label        labels or names of the data series

  Author:
     Kazuki Oshita (cory@g-language.org)

  History:
     20130321-01 complete rewrite by cory, exported by default
     20070612-01 converted to Rcmd::Clustering
     20030904-01 initial posting (G::Tools::RCluster)

=cut


# hclust(\@array1, \@array2, ..., -option => value, ...)
#
# Hierarchical clustering of the given value arrays in R: the arrays become
# the columns of a data frame, rows with incomplete cases are dropped, the
# distance matrix is computed with Dist() from the 'amap' library and the
# dendrogram of hclust() is plotted into ./graph/<filename>.
#
# Options (handled through SubOpt):
#   -output      'show' displays the graph after plotting (default: show)
#   -filename    name of the PDF written below ./graph (default: hclust.pdf)
#   -method      agglomeration method (default: ward, run as 'ward.D2').
#                'ward', 'single', 'complete', 'average', 'centroid',
#                'median' or 'mcquitty'; the legacy spelling 'complate' is
#                still accepted and mapped to 'complete'. Unknown values
#                fall back to 'ward'.
#   -distmethod  distance measure for Dist() (default: correlation).
#                Unknown values fall back to 'pearson', as before.
#   -label       array reference of row names for the data frame
#
# Croaks when no data array is given. Always returns the empty string.
sub hclust {
    opt_default(output => 'show', label => [], filename => 'hclust.pdf', method => 'ward', distmethod => 'correlation');

    my @args       = opt_get(@_);
    my $output     = opt_val('output');
    my @label      = @{ opt_val('label') || [] };
    my $filename   = opt_val('filename');
    my $method     = opt_val('method');
    my $distmethod = opt_val('distmethod');

    croak('hclust: at least one array reference of values is required')
        unless @args;

    # Accepted option value => method name understood by R's hclust().
    my %r_method_for = (
        ward     => 'ward.D2',
        single   => 'single',
        complete => 'complete',
        complate => 'complete',    # historical misspelling, kept for callers
        average  => 'average',
        centroid => 'centroid',
        median   => 'median',
        mcquitty => 'mcquitty',
    );
    $method = 'ward'
        unless defined($method) && exists $r_method_for{$method};
    $method = $r_method_for{$method};

    my %is_dist_method = map { $_ => 1 } qw(
        euclidean maximum manhattan canberra binary pearson
        abspearson correlation abscorrelation spearman kendall
    );
    $distmethod = 'pearson'
        unless defined($distmethod) && $is_dist_method{$distmethod};

    my $rcmd = Rcmd->new();

    # One R vector per data array, named array0, array1, ...
    my @R_names = map { 'array' . $_ } 0 .. $#args;
    for my $i (0 .. $#args) {
        $rcmd->array($R_names[$i], @{ $args[$i] });
    }

    # 'label' list object (if required)
    my $has_label = @label > 0;
    $rcmd->sarray('label', @label) if $has_label;

    # The data frame is sized from the rows that survive the complete-case
    # filter, so the filtered columns assigned below always fit.
    my $frame = 'd.table <- data.frame(' . $R_names[0] . '=seq_len(sum(CMP))';
    $frame .= ', row.names=label[CMP]' if $has_label;
    $frame .= ')';

    my @R_commands = (
        'CMP <- complete.cases(' . join(', ', @R_names) . ')',
        $frame,
    );

    for my $name (@R_names) {
        push @R_commands, $name . ' <- ' . $name . '[CMP]';
        push @R_commands, 'd.table$' . $name . ' <- ' . $name;
    }

    $rcmd->exec(
                @R_commands,

                'library("amap")',
                "pdf('./graph/".$filename."')",
                'hc <- hclust(Dist(d.table, method="'.$distmethod.'"), method="'.$method.'")',
                'plot(hc)',
               );

    msg_gimv('graph/'.$filename) if $output eq 'show';

    return '';
}





=head2 kmeans

  Name: kmeans   -   clustering with K-means method

  Description:
    
    Clustering with the K-means method using R language.
    Number of cluster centers can be given by -centers option (default: 5)
    and number of iterations is given by -iter.max (default: 10).

    Returned value corresponds to result$cluster of kmeans() in R.
    (a vector of cluster numbers to which each point is allocated)
    
  Usage:
    @cluster = kmeans(\@array1, \@array2, \@array3, ..., -label=>\@label);

    Arrays correspond to the columns (data series), and labels for each of
    these arrays can be given by -label option.

  Options:
   -label           labels or names of the data series.
   -centers          number of cluster centers (default: 5)
   -iter.max        number of iterations (default: 10)
   -filename        output filename of the graph (default: kmeans.pdf)
   -output          output toggle option (default: show)
                    "g" to generate graph without displaying.
   -sampledata      use sample data (default: 0)

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
   20070612-01 converted to Rcmd::Clustering
   20030904-01 initial posting (G::Tools::RCluster)

=cut


# kmeans(\@array1, \@array2, ..., -option => value, ...)
#
# K-means clustering of the given data series through R's kmeans().
# Returns whatever $rcmd->exec() yields for the command sequence ending in
# rclust.kmeans$cluster (per the POD: the cluster number of each point).
#
# Options (handled through SubOpt):
#   -label       array reference of row labels, forwarded to set_clust_data
#   -centers     number of cluster centers (default: 5)
#   -iter.max    maximum number of iterations (default: 10)
#   -filename    name of the PDF written below ./graph (default: kmeans.pdf)
#   -output      'show' plots and displays the graph, 'g' only plots it
#   -sampledata  true to cluster the built-in random sample data instead
sub kmeans{
    # Direct method call instead of the indirect-object form 'new Rcmd()',
    # matching how hclust() constructs its R session.
    my $rcmd = Rcmd->new();
    $rcmd->set_mode('tmp');

    opt_default("iter.max"=>10, filename=>"kmeans.pdf", output=>"show", sampledata=>0, centers=>5);
    my @args       = opt_get(@_);
    my $centers    = opt_val("centers");
    my $iter       = opt_val("iter.max");
    my $output     = opt_val("output");
    my $filename   = opt_val("filename");
    my $sampledata = opt_val("sampledata");
    my $label      = opt_val("label") || '';

    if ($sampledata){
        $rcmd->sample_data_for_clustering();
    }else{
        $rcmd->set_clust_data(@args, -label=>$label);
    }

    my @result = $rcmd->exec(
                             'require(stats)',
                             "rclust.kmeans<-kmeans(rclust,$centers,$iter)",
                             'rclust.kmeans$cluster'
                             );

    if ($output =~ /g/ || $output =~ /show/){
        # Points coloured by cluster, cluster centers marked with pch=8.
        $rcmd->exec(
                    "pdf('./graph/".$filename."')",
                    'plot(rclust,col=rclust.kmeans$cluster)',
                    "points(rclust.kmeans\$centers, col=1:$centers,pch=8)"
                    );

        msg_gimv("graph/$filename") if ($output =~ /show/);
    }

    $rcmd->set_mode();
    return @result;
}



1;

