#!/usr/bin/perl -w

##############################################################################
# COPY from predict_base_pair.pl 
#Given a set of H-Bond bonding matrice, predict and evaluate beta-strand pair
#Matrix Format: Name, Sequence, SS, BP1, BP2, matrix
#Input Parameters: base pair program, input directory, output file  
#Output file format: name, seq, ss, bp1, bp2, strand-list, true pair list,predicted 
#pair list, number of correct (true num, predict num, correct)
#to the standard output: overall precision and recall.  
#Author: Jianlin Cheng
#Date: 9/22/2004
#############################################################################

if (@ARGV != 3)
{
	die "need 3 params:base pair program, input directory of h-bond probability file, output file\n"; 
}

$predictor = shift @ARGV;
$input_dir = shift @ARGV;
$output_file = shift @ARGV; 

if (! -f $predictor)
{
	die "can't find base pair program: $predictor\n";  
}

if (! -d $input_dir)
{
	die "input directory doesn't exist.\n"; 
}

if ( substr($input_dir, length($input_dir) - 1, 1) ne "/" )
{
        $input_dir .= "/";
}

opendir(DIR, $input_dir) || die "can't open input directory.\n";
@file_list = readdir(DIR);
closedir(DIR); 

open(OUTPUT, ">$output_file") || die "can't create output file.\n"; 

$true_total = 0;
$pre_total = 0; 
$corr_total = 0; 

$naive_pre_total = 0; 
$naive_corr_total = 0; 

#confusion table for strand pairing accuracy
$tp = 0;
$fn = 0;
$fp = 0;
$tn = 0; 
@pre_pair_num = ();
@true_pair_num = (); 

foreach $file(@file_list)
{
	if ($file eq "." || $file eq "..")
	{
		next; 
	}
	$filename = $input_dir . $file;
	open(INPUT, "$filename") || die "can't read input file $filename\n"; 
	$name = <INPUT>;
	$seq = <INPUT>;
	$ss = <INPUT>;
	$bp1 = <INPUT>;
	$bp2 = <INPUT>; 
	close INPUT; 

	$tmp = $file . ".tmp"; 
	$res = system("$predictor $filename > $tmp"); 
	if ($res != 0)
	{
		print "error happens in process file $filename\n"; 
	}
	open(RES, "$tmp") || die "can't read result file, $tmp\n";
	<RES>;
	$strand_list = <RES>;
	<RES>;
	$pre_list = <RES>; 
	<RES>;
	$true_list = <RES>;

	$corr_num = <RES>;
	chomp $corr_num; 
	($other, $corr_num) = split(/: /, $corr_num); 
	$pre_num = <RES>;
	($other, $pre_num) = split(/: /, $pre_num); 
	chomp $pre_num; 
	$true_num = <RES>;
	chomp $true_num; 
	($other, $true_num) = split(/: /, $true_num); 
	$naive_pre = <RES>; 
	chomp $naive_pre; 
	($other, $naive_pre) = split(/: /, $naive_pre); 
	$naive_corr = <RES>; 
	chomp $naive_corr; 
	($other, $naive_corr) = split(/: /, $naive_corr); 

	$corr_total += $corr_num;
	$pre_total += $pre_num;
	$true_total += $true_num; 
	$naive_pre_total += $naive_pre;
	$naive_corr_total += $naive_corr; 
	print OUTPUT  "$name$seq$ss$bp1$bp2$strand_list$true_list$pre_list$true_num $pre_num $corr_num $naive_pre $naive_corr\n\n";  
	`rm $tmp`; 

	#compute the confution table
	chomp $strand_list;
	@strands = split(/, /, $strand_list);  
	$strand_num = @strands;
	$tot = $strand_num * ($strand_num-1) / 2; 
	$x = $pre_num;
	$y = $corr_num;
	$z = $true_num;
	$tp += $y;
	$fn += ($z - $y);
	$fp += ($x - $y);
	$tn += ($tot - $x - $z + $y);
	push @pre_pair_num, $pre_num;
	push @true_pair_num, $true_num; 
}
print OUTPUT "\n#predicted pair num VS true pair num per chain\n";
print OUTPUT "pre_num <- c(", join(",",@pre_pair_num), ")\n";
print OUTPUT "true_num <- c(", join(",", @true_pair_num), ")\n"; 
close OUTPUT; 

print "pair precision: ", $corr_total / $pre_total, "\n"; 
print "pair recall: ", $corr_total / $true_total, "\n"; 

print "naive pair precision: ", $naive_corr_total / $naive_pre_total, "\n";
print "naive pair recall: ", $naive_corr_total / $true_total, "\n"; 

print "paring accuracy: ", ($tp+$tn) / ($tp+$tn+$fp+$fn), "\n"; 
print "paring correlation coefficient:", ($tp*$tn-$fp*$fn) / sqrt( ($tp+$fn) * ($tp+$fp) * ($tn+$fn) * ($tn+$fp) ); 








