#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# This file is part of G-language Genome Analysis Environment package
#
#     Copyright (C) 2001-2016 Keio University
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# 
# G-language GAE is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
# 
# G-language GAE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with G-language GAE -- see the file COPYING.
# If not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# 
#END_HEADER
#
# written by Kazuharu Arakawa <gaou@sfc.keio.ac.jp> at
# G-language Project, Institute for Advanced Biosciences, Keio University.
#

package Rcmd::Multivariate;

use strict;
use warnings;

use base 'Exporter';
our @EXPORT= qw(
		pca
		lda
		);

use SubOpt;
use G::Messenger;
use Rcmd::Handler;



#:::::::::::::::::::::::::::::::::
#       Perldoc
#:::::::::::::::::::::::::::::::::



=head1 NAME

  Rcmd::Multivariate - Statistical tools for multivariate analysis

=head1 DESCRIPTION

  This class is a part of G-language Genome Analysis Environment, 
  collecting interfaces to clustering algorithms of R language.

=head1 AUTHOR

  Kazuki Oshita, E<lt>cory@g-language.org<gt>

=cut


#::::::::::::::::::::::::::::::
#    Let the code begin...
#::::::::::::::::::::::::::::::



=head2 pca

  Name: pca   -   principal component analysis for given arraies

  Descriptions:
    This method shows eigenvector matrix, factor loadin gmatrix and
    canonical correlation diagram by principal component analysis (PCA).

  Usage:
    pca(\@array1_of_values, \@array2_of_values, ...);
      or
    pca(\@array1_of_values, \@array2_of_values, ..., -label => \@label_of_factors);

  Options:
   -output       output toggle option (default: show)
                 "g" to generate graph without displaying.
   -filename     output filename of the bar plot graph (default: pca_plot.pdf)
   -label        labels or names of the factors (should be the same size as the given array)
   -category     category labels of the arrays (size should be the same as the number of arrays)

   Author:
     Kazuki Oshita (cory@g-language.org)
     Kazuharu Arakawa (gaou@g-language.org)

   History:
     20141111-01 major overhaul upon merging to G-language GAE.
     20130321-01 initial posting
     
=cut

sub pca {
    &opt_default(output => 'show', label => [], category => [], filename => 'pca_plot.pdf');

    my @args     = opt_get(@_);
    my $output   = opt_val('output');
    my @label    = @{opt_val('label')};
    my @category = @{opt_val('category')};
    my $filename = opt_val('filename');

    my $rcmd = Rcmd->new();

    my (%data_table, @R_names);
    for my $i (0 .. $#args) {
	my $R_name= 'Array' . $i;
	push @R_names, $R_name;

	$data_table{$R_name}= $_[$i];
	$rcmd->array($R_name, @{$args[$i]});
    }

    unless($#label > -1){
	push(@label, 'Factor' . $_) for (1..(scalar(@{$args[0]})));
    }
    $rcmd->sarray('label', @label);

    $rcmd->sarray('category', @category) if($#category > -1);
    my @R_commands= (
		     'CMP <- complete.cases('.join(', ', @R_names).')',
		     ('d.table <- data.frame('.$R_names[0].'=1:'.($#{$data_table{$R_names[0]}}+1).', row.names=label)')
		    );

    for my $key (@R_names) {
	push @R_commands, $key.' <- '.$key.'[CMP]';
	push @R_commands, 'd.table$'.$key.' <- '.$key;
    }

    my $col = $#category > -1 ? 'text(scores[,1], scores[,2], col=as.numeric(unclass(factor(category))), labels=category)' : '';
    my $dot = $#category > -1 ? 'n' : 'p", pch=".';

    my @res = $rcmd->exec(
		@R_commands,
                't.table = t(d.table)',
 		'PC <- prcomp(t.table)',
                'summary(PC)$importance[,c(1:6)]',

                'cat("\n[PC1 Top 10 Factors]\n")',
		'pc1 = PC$rotation[,1]',
                'top10 = order(abs(pc1), decreasing=TRUE)[1:10]',
                'top = cbind(label[top10], round(pc1[top10], digits=3))',
                'colnames(top) = c("Factor", "Loading")',
                'rownames(top) = label[top10]',
                'print(top[,2], quote=FALSE, row.names=FALSE)',

                'cat("\n[PC2 Top 10 Factors]\n")',
		'pc2 = PC$rotation[,2]',
                'top10 = order(abs(pc2), decreasing=TRUE)[1:10]',
                'top = cbind(label[top10], round(pc2[top10], digits=3))',
                'colnames(top) = c("Factor", "Loading")',
                'rownames(top) = label[top10]',
                'print(top[,2], quote=FALSE, row.names=FALSE)',

                'scores=PC$x',
		"pdf('./graph/".$filename."')",
                'label1 <- paste("PC1 (", round(summary(PC)$importance[2,1]*100), "% )")',
                'label2 <- paste("PC2 (", round(summary(PC)$importance[2,2]*100), "% )")',
 		'plot(scores[,1], scores[,2], type="' . $dot . '", main="Scatter plot of the PCA scores", xlab=label1, ylab=label2)',
		$col
	       );

    msg_gimv('graph/'.$filename) if $output eq 'show';

    return '';
}



1;
