#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# This file is part of G-language Genome Analysis Environment package
#
#     Copyright (C) 2001-2014 Keio University
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# 
# G-language GAE is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
# 
# G-language GAE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with G-language GAE -- see the file COPYING.
# If not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# 
#END_HEADER
#
# written by Kazuharu Arakawa <gaou@sfc.keio.ac.jp> at
# G-language Project, Institute for Advanced Biosciences, Keio University.
#

package Rcmd::Summary;

use strict;
use warnings;

use base 'Exporter';
our @EXPORT = qw(
		summary
		 );

use SubOpt;
use G::Messenger;
use G::Tools::Statistics;
use Rcmd::Handler;

#:::::::::::::::::::::::::::::::::
#       Perldoc
#:::::::::::::::::::::::::::::::::


=head1 NAME

  Rcmd::Summary - Summary statistics of data via interfaces to R language.

=head1 DESCRIPTION

 Provides statistical summary about a given data, through interface method
 and numerous internal methods.

=head1 AUTHOR

Kazuharu Arakawa, gaou@sfc.keio.ac.jp

=cut

#::::::::::::::::::::::::::::::
#    Let the code begin...
#::::::::::::::::::::::::::::::


=head2 summary

  Name: summary   -   shows statistical summary for given data or data set

  Descriptions:
    This method shows statistical summary for given data or data set. 

    For a single array or array reference:
      Data is nominal: Shows boxplot of categories 
      Data is continuous: Shows histogram of distribution
    For two arrays references:
      Both is nominal: Shows cross table with Cramer's V
      Either is nominal: Shows boxplot of categories and ANOVA
      Both is continuous: Shows correlation matrix
    For more array references:
      All data is continuous: Shows correlation matrix

  Usage:
    summary(@array)
       or
    summary(\@array, \@array, ...);

  Options 
   -output       output toggle option (default: show)
                 "g" to generate graph without displaying.
   -filename     output filename of the bar plot graph (default: *.pdf)

   Additional Options (continuous v.s. continuous):
   -method       "pearson", "spearman", or "kendall" (default:pearson)
                 Method used to calculate the correlation coefficient.
   -cor          show correlation in graph (default: r)
                 'r2' or 'r^2' to show R^2 (coefficient of determination).
   -label        optional array referece of labels for the given arrays

   Author:
     Kazuki Oshita (cory@g-language.org)
     Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

   History:
     20141114-01 merged to G-language GAE with overhaul
     20130321-01 initial posting
     
=cut

sub summary {
    my @args = opt_get(@_);

    if(scalar(@args) == 1 && ref $args[0] eq 'ARRAY'){
	univariate(@_);
    }elsif(ref $args[0] ne 'ARRAY'){
	univariate([@_]);
    }else{
	# array 1 and 2 are continuous scale (0) or nominal scale (1)
	my $a1_is_nominal= _is_nominal($args[0]);
	my $a2_is_nominal= _is_nominal($args[1]);
	
	if ($a1_is_nominal * $a2_is_nominal == 1) {
	    # nominal scale vs nominal scale
	    bivariate_n2n(@_);
	} elsif ($a1_is_nominal == $a2_is_nominal) {
	    # continuous scale vs continuous scale
	    multivariate(@_);
	} else {
	    # nominal scale vs continuous scale
	    bivariate_n2c(@_);
	}
    }

    return 1;
}

sub bivariate_n2n {
    my @args     = opt_get(@_);

    my @array1= @{shift @args};
    my @array2= @{shift @args};

    my $rcmd= Rcmd->new();
    $rcmd->sarray('array1', @array1);
    $rcmd->sarray('array2', @array2);
    
    my $cramer= $rcmd->exec(
			    'cat("[Cross Table]\n")',
			    # complete case about array1 and array2
			    'safe.case <- complete.cases(array1, array2)',
			    'array1 <- array1[safe.case]',
			    'array2 <- array2[safe.case]',
			    # make cross table
			    'cross.table <- table(array1, array2)',
			    # re-format cross table to readable report
			    'ftable(cross.table)',
			    ##  calculate Cramer's V
			    # expected value
			    'ex <- outer(rowSums(cross.table), colSums(cross.table)) / sum(cross.table)',
			    # chi square
			    'chisq <- sum((cross.table - ex)^2 / ex)',
			    # phi coefficient
			    'phi <- sqrt(chisq / sum(cross.table))',
			    # cramer's V
			    'phi / sqrt(min(nrow(cross.table), ncol(cross.table)) - 1)'
			    );
    print "\nCramer\'s V : $cramer\n";
}



sub bivariate_n2c {
    &opt_default(output => 'show', filename => 'bivariate.pdf');

    my @args     = opt_get(@_);
    my $output   = opt_val('output');
    my $filename = opt_val('filename');

    my (@nominal, @continuous);
    if (_is_nominal(\@{$args[0]})) {
	@nominal=    @{shift @args};
	@continuous= @{shift @args};
    } else {
	@continuous= @{shift @args};
	@nominal=    @{shift @args};
    }

    my (@vector, @names, %table);
    # table of nominal array and continuous array
    for my $i (0 .. ($#continuous < $#nominal ? $#continuous : $#nominal)) {
	push @{$table{$nominal[$i]}}, $continuous[$i];
    }

    print "[Summary]\n  Label\tNumber\tMean\tSD\n";
    for my $name (sort {$a cmp $b} keys %table) {
	my @items = @{$table{$name}};
	my $num   = $#items + 1;
	my $mean  = sprintf("%.3f", mean(@items));
	my $sd    = sprintf("%.5f", standard_deviation(@items));

	print "$name\t$num\t$mean\t$sd\n";
	push @vector, 'c('.join(' ,', @items).')';
	push @names,  $name;
    }
    print "\n";

    my $rcmd= Rcmd->new();
    $rcmd->array('continuous', @continuous);
    $rcmd->sarray('nominal',   @nominal);
    $rcmd->exec(
		'anova(aov(continuous ~ factor(nominal)))',
		"pdf('./graph/".$filename."')",
		'boxplot('.join(', ', @vector).', names=c(\''.join('\', \'', @names).'\'))',
		);

    msg_gimv('graph/'.$filename) if $output eq 'show';
}



sub univariate {
    &opt_default(output => 'show', filename => 'distribution.pdf');

    my @args      = opt_get(@_);
    my @array     = @{+shift @args};
    my $output    = opt_val('output');
    my $filename  = opt_val('filename');

    my $N        = scalar(@array);
    printf("[Summary]\n");
    printf("N:              %d\n", $N);

    if (_is_nominal(opt_get(@_))) {
	my %count;
	$count{$_}++ for (@array);

	printf("categories:     %d\n", scalar(keys(%count)));
	# sort given array by descending order
	my @sorted_label= sort { $count{$a} <=> $count{$b} } keys %count;

	my $rcmd= Rcmd->new();
	$rcmd->array('summary', map { $count{$_} } @sorted_label);
	$rcmd->exec(
		    # labelling
		    'label <- c("'.join('", "', @sorted_label).'")'."\n".
		    
		    # draw barplot graph
		    "pdf('./graph/".$filename."')\n".
		    "barplot(summary, names.arg=label)"
		    );

    }else{
	my $mean     = mean(@array);
	my $sd       = standard_deviation(@array);

	# http://www.jmp.com/support/help/Statistical_Details_for_the_Distribution_Platform.shtml
	my $skewness;
	$skewness += ((($array[$_] - $mean)/$sd) ** 3) * $N / (($N-1)*($N-2)) for (0..$#array);

	my $kurtosis;
	$kurtosis += ((($array[$_] - $mean)/$sd) ** 4) for (0..$#array);
	$kurtosis *= ($N * ($N + 1))/(($N-1)*($N-2)*($N-3));
	$kurtosis -= (3*(($N-1)**2))/(($N-2)*($N-3));
	
	my @sorted = sort { $a <=> $b } @array;

	# summary about basic statistics
	printf("Mean:           %.3f\n", $mean);
	printf("Median:         %.3f\n", median(@array));
	print 'Lower 95%', sprintf(" Mean: %.3f\n", mean(@sorted[0..int($N * 0.95)]));
	print 'Upper 95%', sprintf(" Mean: %.3f\n", mean(@sorted[int($N*0.05)..$#array]));
	printf("SD:             %.3f\n", $sd);
	printf("Kurtosis:       %.3f\n", $kurtosis);
	printf("Skewness:       %.3f\n", $skewness);
	
	my $rcmd= Rcmd->new();
	$rcmd->array('array',  @array);
	$rcmd->exec(
		    'pdf("./graph/'.$filename.'")',
		    'par(mfrow=c(1,2))',
		    'hist(array, main="", xlab="")',
		    'boxplot(array)',
		    );
	
    }
    # show graph image (when -output => show)
    msg_gimv('graph/'.$filename) if $output eq 'show';
}




=head2 multivariate

  Name: multivariate   -   shows statistic summary for multi-arraies

  Descriptions:
    Shows statistic summary for bivariate arrays like JMP style.
    This method shows scatter matrix and scatter plot graph.

  Usage:
    multivariate(\@array1_of_values, \@array2_of_values, ...);

  Options:
   -output       output toggle option (default: show)
                 "g" to generate graph without displaying.
   -filename     output filename of the scatter plot (default: scatter_plot.pdf)
   -method       "pearson", "spearman", or "kendall" (default:pearson)
                 Method used to calculate the correlation coefficient.
   -cor          show correlation in graph (default: r)
                 'r2' or 'r^2' to show R^2 (coefficient of determination).
   -label        optional array reference with labels for each of the arrays

   Author:
     Kazuki Oshita (cory@g-language.org)

   History:
     20141113-01 merging to G-language GAE and overhaul
     20130321-01 initial posting
     
=cut

sub multivariate {
    &opt_default(output => 'show', label=>[], method => 'pearson', cor => 'r', filename => 'scatter_plot.pdf');

    my @args     = opt_get(@_);
    my $output   = opt_val('output');
    my $filename = opt_val('filename');
    my $label    = opt_val('label');
    my $method   = opt_val('method'); # [pearson] pearson, kendall, spearman
    my $show_r   = opt_val('cor'); # show R value or R^2 value in plot graph

    # Method for the correlation coefficient.
    $method= 'pearson' if ($method ne 'kendall' && $method ne 'spearman');

    # $show_r is 'R' or 'R^2'
    $show_r= 'r' if (lc($show_r) ne 'r^2' && lc($show_r) ne 'r2');

    my $rcmd= Rcmd->new();

    # create data table
    my (%data_table, @R_names);
    for my $i (0 .. $#args) {
	my $R_name = scalar(@$label) > 1 ? $$label[$i] : 'array'.$i;

	push @R_names, $R_name;

	$data_table{$R_name}= $_[$i];
	$rcmd->array($R_name, @{$args[$i]});
    }

    my @R_commands= (
		     'CMP <- complete.cases('.join(', ', @R_names).')',
		     'd.table <- data.frame('.$R_names[0].'=1:'.($#{$data_table{$R_names[0]}}+1).')',
		    );

    for my $key (@R_names) {
	push @R_commands, $key.' <- '.$key.'[CMP]';
	push @R_commands, 'd.table$'.$key.' <- '.$key;
    }

    $rcmd->exec(
		@R_commands,

		"pdf('./graph/".$filename."')",

		# show R (or R^2) in graph
		'low.panel <- function (x, y, ...) {',
		('  text(mean(par("usr")[1:2]), mean(par("usr")[3:4]), paste("r = ",   signif(cor(x, y), 2)), cex = 2)')x!!   ($show_r eq 'r'),

		('  text(mean(par("usr")[1:2]), mean(par("usr")[3:4]), paste("r^2 = ", signif(cor(x, y)^2, 2)), cex = 2)')x!! (lc($show_r) eq 'r^2' || lc($show_r) eq 'r2'),
		'}',

                'panel.hist <- function(x, ...){',
                   'usr <- par("usr"); on.exit(par(usr))',
                   ' par(usr = c(usr[1:2], 0, 1.5) )',
                   ' h <- hist(x, plot = FALSE)',
                   'breaks <- h$breaks; nB <- length(breaks)',
                   ' y <- h$counts; y <- y/max(y)',
                   'rect(breaks[-nB], 0, breaks[-1], y, col="gray", ...)',
                 '}',

		# draw scatter plot
		'pairs(d.table, upper.panel=low.panel, diag.panel=panel.hist)',

		# cor
                'cat("[Correlation matrix]\n")',
 		"cor(d.table, method='".$method."')",
	       );

    # show scatter plot in X11
    msg_gimv('graph/'.$filename) if $output eq 'show';

    return '';
}


1;
