#!/usr/bin/perl 
#===============================================================================
# Copyright 2012 K.K.DNAFORM
# This file is part of idr_paraclu program.
# Idr_paraclu is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, any later version.
#
# Idr_paraclu is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with idr_paraclu. If not, see <http://www.gnu.org/licenses/>.
#
#         FILE:  removeSimilarCluster.pl
#
#        USAGE:  removeSimilarCluster.pl input output 
#					input				input file
#					output				output file
#
#  DESCRIPTION:  Remove clusters that overlap another cluster by 90% or
#				 more (intersectBed options -s -r -f 0.90).
#				 Of each such pair, the larger cluster is discarded.
#
#      OPTIONS:  ---
# REQUIREMENTS:  ---
#         BUGS:  ---
#        NOTES:  ---
#       AUTHOR:  Ohmiya
#      COMPANY:  
#      VERSION:  1.0
#      CREATED:  2011/10/28
#     REVISION:  ---
#===============================================================================

use strict;
use warnings;

## -----------------------------------------
## Settings and shared state
## -----------------------------------------

## intersectBed program (looked up through PATH)
my $intersectBed = "intersectBedOhmiya";

## overlap fraction passed to intersectBed with -f
my $minOverlap = "0.90";

## removing cluster list : key (see clusterKey) => 1
my %remove = ();

## Build the lookup key of one cluster from its chrom, start, end and
## strand fields.  Missing fields count as empty strings, so short lines
## (blank lines, headers) produce a harmless key instead of warnings.
sub clusterKey {
	return join( "_", map { defined $_ ? $_ : "" } @_ );
}

## Exactly two arguments are required : input file and output file.
if( @ARGV != 2 ) {
	print STDERR "Usage : removeSimilarCluster.pl input output\n";
	exit 1;
}

chomp( my $inputFileName = $ARGV[0] );
chomp( my $outputFileName = $ARGV[1] );

## -----------------------------------------
## Create the removing cluster list
## -----------------------------------------
## The input file is intersected with itself.  The code below assumes a
## 6-column (BED6-like) input, i.e. each -wo record carries the fields of
## cluster A in columns 0-5 and those of cluster B in columns 6-11.
## The command is run in list form, so no shell is involved and file names
## with spaces or shell metacharacters are passed through unchanged.
my @command = ( $intersectBed, "-s", "-wo", "-r",
				"-a", $inputFileName, "-b", $inputFileName,
				"-f", $minOverlap );
open( my $pairs, "-|", @command )
	or die "Can't run $intersectBed : $!";
while( my $record = <$pairs> ) {
	chomp( $record );
	my @field = split( /\s+/, $record );

	## a record too short to hold both clusters cannot be interpreted
	next if @field < 12;

	my $keyA = clusterKey( @field[ 0, 1, 2, 5 ] );
	my $keyB = clusterKey( @field[ 6, 7, 8, 11 ] );

	## a cluster always overlaps itself; that is not a similar pair
	next if $keyA eq $keyB;

	my $lengthA = $field[2] - $field[1];
	my $lengthB = $field[8] - $field[7];

	## Discard the larger cluster of the pair; on equal length, cluster A.
	## NOTE(review): if intersectBed reports an equal-length pair in both
	## orders, each record marks its own A and both clusters are removed.
	## Confirm whether that is intended.
	$remove{ $lengthA < $lengthB ? $keyB : $keyA } = 1;
}
## close() on a pipe also reports a non-zero exit of the child.  Stop here
## rather than writing an output that silently lacks the filtering.
unless( close( $pairs ) ) {
	die $! ? "Error while reading from $intersectBed : $!"
		   : "$intersectBed failed (wait status $?)";
}


## -----------------------------------------
## Remove the larger one of similar clusters 
## -----------------------------------------
## Copy the input to the output, leaving out every line whose key is in
## the removing cluster list.
open( my $in, "<", $inputFileName )
	or die "Can't open $inputFileName : $!";
open( my $out, ">", $outputFileName )
	or die "Can't open $outputFileName : $!";
while( my $line = <$in> ) {
	chomp( $line );
	my @field = split( /\s+/, $line );
	next if exists $remove{ clusterKey( @field[ 0, 1, 2, 5 ] ) };
	print {$out} "$line\n"
		or die "Can't write to $outputFileName : $!";
}
close( $in );
## buffered write errors (e.g. disk full) only surface at close
close( $out ) or die "Can't close $outputFileName : $!";

