#!/usr/bin/perl -w

##############################################################################
# COPY from predict_greedy_pair.pl 
#Given a set of H-Bond bonding matrice
#evaluate: strand pair, direction, alignment, sheet coverage/number 
#Matrix Format: Name, Sequence, SS, BP1, BP2, matrix
#Input Parameters: greedy pair program, input directory, output file  
#Output file format: name, seq, ss, bp1, bp2, strand-list, true pair list,predicted 
#pair list, number of correct (true num, predict num, correct)
#to the standard output: overall precision and recall.  
#Author: Jianlin Cheng
#Date: 9/22/2004
#############################################################################

#compare two sheets
sub comp_sheet 
{
	#params: predicted sheet, true sheet
	my ($sheetA, $sheetB) = @_;
	my @sheetA = split(/\*/, $sheetA);
	my @sheetB = split(/\*/, $sheetB); 
	my $sizeA = @sheetA;
	my $sizeB = @sheetB; 
	my $record1;
	my $record2; 
	#correct num
	my $rcorr = 0;
	foreach $record1(@sheetA)
	{
		#vars: 1:id1,2:pos11,3:pos12,4:id2,5:pos21,6:pos22,7:direction,8:start1,9:end1,10:start2,11:end2 
    		$record1 =~ /^\[\((\d+),(\d+),(\d+)\),\((\d+),(\d+),(\d+)\),.+,<(\d+),(\d+),(\d+),(\d+),(\d+)>\]$/;
		my $id1 = $1;
		my $id2 = $4; 
		foreach $record2(@sheetB)
		{
    			$record2 =~ /^\[\((\d+),(\d+),(\d+)\),\((\d+),(\d+),(\d+)\),.+,<(\d+),(\d+),(\d+),(\d+),(\d+)>\]$/;
			if ( ($id1 == $1 && $id2 == $4) || ($id1 == $4 && $id2 == $1) )
			{
				$rcorr++; 
				last; 
			}
		}
		
	}
	#return precision and recall
	return ($rcorr/$sizeA, $rcorr/$sizeB); 
}

if (@ARGV != 3)
{
	die "need 3 params:base pair program, input directory of h-bond probability file, output file\n"; 
}

$predictor = shift @ARGV;
$input_dir = shift @ARGV;
$output_file = shift @ARGV; 

if (! -f $predictor)
{
	die "can't find base pair program: $predictor\n";  
}

if (! -d $input_dir)
{
	die "input directory doesn't exist.\n"; 
}

if ( substr($input_dir, length($input_dir) - 1, 1) ne "/" )
{
        $input_dir .= "/";
}

opendir(DIR, $input_dir) || die "can't open input directory.\n";
@file_list = readdir(DIR);
closedir(DIR); 

open(OUTPUT, ">$output_file") || die "can't create output file.\n"; 

$true_total = 0;
$pre_total = 0; 
$corr_total = 0; 

$naive_pre_total = 0; 
$naive_corr_total = 0; 

#confusion table for strand pairing accuracy
$tp = 0;
$fn = 0;
$fp = 0;
$tn = 0; 
@pre_pair_num = ();
@true_pair_num = (); 

########################################################
#evaluation of fullly predicted pairs:
#number of true parallel
#t_para + t_anti = true_total 
$t_para = 0;
#number of true antiparallel
$t_anti = 0;
#number of true bridge
$t_brig = 0; 
#number of correctly predicted parallel
$c_para = 0;
#number of correctly predicted anti-parallel
$c_anti = 0;
#number of correctly predicted bridge
$c_brig = 0; 
#evaluation of alighments once the direction is right
@para_dist = ();
@anti_dist = ();
@brig_dist = ();
$corr_ali_num = 0; 
#evaluation of alignments of all pairs assming the direction is known (comparing with Thornton)
@apara_dist = ();
@aanti_dist = ();
@abrig_dist = ();
$acorr_ali_num = 0; 

#evaluate the beta-sheets
@true_sheet_num = ();
@pre_sheet_num = (); 
#precision of sheet prediction
@sheet_pre = ();
#recall of sheet precision
@sheet_rec = (); 
########################################################


foreach $file(@file_list)
{
	if ($file eq "." || $file eq "..")
	{
		next; 
	}
	$filename = $input_dir . $file;
	open(INPUT, "$filename") || die "can't read input file $filename\n"; 
	$name = <INPUT>;
	$seq = <INPUT>;
	$ss = <INPUT>;
	$bp1 = <INPUT>;
	$bp2 = <INPUT>; 
	close INPUT; 

	$tmp = $file . ".tmp"; 
	$res = system("$predictor $filename > $tmp"); 
	if ($res != 0)
	{
		print "error happens in process file $filename\n"; 
	}
	open(RES, "$tmp") || die "can't read result file, $tmp\n";
	<RES>;
	$strand_list = <RES>;
	<RES>;
	$pre_list = <RES>; 
	<RES>;
	$true_list = <RES>;

	$corr_num = <RES>;
	chomp $corr_num; 
	($other, $corr_num) = split(/: /, $corr_num); 
	$pre_num = <RES>;
	($other, $pre_num) = split(/: /, $pre_num); 
	chomp $pre_num; 
	$true_num = <RES>;
	chomp $true_num; 
	($other, $true_num) = split(/: /, $true_num); 
	$naive_pre = <RES>; 
	chomp $naive_pre; 
	($other, $naive_pre) = split(/: /, $naive_pre); 
	$naive_corr = <RES>; 
	chomp $naive_corr; 
	($other, $naive_corr) = split(/: /, $naive_corr); 

	$corr_total += $corr_num;
	$pre_total += $pre_num;
	$true_total += $true_num; 
	$naive_pre_total += $naive_pre;
	$naive_corr_total += $naive_corr; 

	#alignment of predicted pairs
	$ali_pre_pair = <RES>;
	#print $ali_pre_pair; 
	chomp $ali_pre_pair;
	#shift the constraints
	<RES>;
	<RES>;
	#true alignment of true pairs
	$ali_true = <RES>;
	#print $ali_true; 
	chomp $ali_true;
	#predicted alignments of true pairs
	$ali_pre_true = <RES>;
	#print $ali_pre_true;
	chomp $ali_pre_true; 
	

	print OUTPUT  "$name$seq$ss$bp1$bp2$strand_list$true_list$pre_list$true_num $pre_num $corr_num $naive_pre $naive_corr\n$ali_pre_pair$ali_true$ali_pre_true\n";  
	`rm $tmp`; 

	#compute the confution table
	chomp $strand_list;
	@strands = split(/, /, $strand_list);  
	$strand_num = @strands;
	$tot = $strand_num * ($strand_num-1) / 2; 
	$x = $pre_num;
	$y = $corr_num;
	$z = $true_num;
	$tp += $y;
	$fn += ($z - $y);
	$fp += ($x - $y);
	$tn += ($tot - $x - $z + $y);
	push @pre_pair_num, $pre_num;
	push @true_pair_num, $true_num; 

	#compute alignment accuracy of predicted pairs

	($left, $ali_true) = split(/: /, $ali_true);
	@ali_true = split(/, /, $ali_true);
	if (@ali_true != $true_num)
	{
		die "number of true alignments not equal number of true pairs:$name";
	}
	for ($i = 0; $i < @ali_true; $i++)
	{
		$rec = $ali_true[$i]; 
		#vars: 1:id1,2:pos11,3:pos12,4:id2,5:pos21,6:pos22,7:direction,8:start1,9:end1,10:start2,11:end2 
		#print "rec: $rec\n";
		if ($rec =~ /^\[\((\d+),(\d+),(\d+)\),\((\d+),(\d+),(\d+)\),.+,<(\d+),(\d+),(\d+),(\d+),(\d+)>\]$/)
		{
			if ($1 > $4)
			{
				die "assumption violation: $name"; 
			}
			if ($7 == 0)
			{
				$t_brig++;
			}
			elsif ($7 == 1)
			{
				$t_anti++;
			}
			else
			{
				$t_para++; 
			}
		}
		else
		{
			die "alignment format error: $rec,$name"; 
		}
	}

	($left, $ali_pre_pair) = split(/: /, $ali_pre_pair);
	@ali_pre_pair = split(/, /, $ali_pre_pair); 
	for ($i = 0; $i < @ali_pre_pair; $i++)
	{
		$rec = $ali_pre_pair[$i]; 
		#vars: 1:id1,2:pos11,3:pos12,4:id2,5:pos21,6:pos22,7:direction,8:start1,9:end1,10:start2,11:end2 
		if ($rec !~ /^\[\((\d+),(\d+),(\d+)\),\((\d+),(\d+),(\d+)\),.+,<(\d+),(\d+),(\d+),(\d+),(\d+)>\]$/)
		{
			print "$rec\n";
			die "format error: $ali_pre_pair, $name\n"; 
		}
		$pre_id1 = $1; $pre_id2 = $4; $pre_dir = $7; 
		$start1 = $8; $end1 = $9;  
		$start2 = $10; #$end2 = $11; 
		#print "test position:$start1,$end1,$start2,$end2\n";
		#<STDIN>;
		#check assumptions
		if ($start1 > $end1)
		{
			die "the start position of strand 1 should be less than end position of strand1.\n"; 
		}
		if ($pre_id1 > $pre_id2)
		{
			die "assumption violation: $name"; 
		}
		
		for ($j = 0; $j < @ali_true; $j++)
		{
			$rec = $ali_true[$j]; 
			#vars: 1:id1,2:pos11,3:pos12,4:id2,5:pos21,6:pos22,7:direction,8:start1,9:end1,10:start2,11:end2 
			$rec =~ /^\[\((\d+),(\d+),(\d+)\),\((\d+),(\d+),(\d+)\),.+,<(\d+),(\d+),(\d+),(\d+),(\d+)>\]$/;
			if ($pre_id1 == $1 && $pre_id2 == $4)
			{
				if ($pre_dir == $7)
				{
					if ($pre_dir == 0)
					{
						$c_brig++; 
					}
					elsif ($pre_dir == 1)
					{
						$c_anti++; 
					}
					else
					{
						$c_para++; 
					}

					#compute the alignment distance
					$st1 = $8; $en1 = $9;  
					$st2 = $10; #$en2 = $11; 
					if ($st1 > $en1)
					{
						die "start position is greater than end position:$name "; 
					}
					#generate pairs for both alginments
					@x1 = (); @y1 = ();
					@x2 = (); @y2 = (); 
					if ($pre_dir == 0)
					{
						push @x1, $start1;
						push @y1, $start2; 
						push @x2, $st1; 
						push @y2, $st2; 
					}
					else
					{

						$m = $start2; 
						for ($k = $start1; $k <= $end1; $k++)
						{
							push @x1, $k;
							push @y1, $m; 
							if ($pre_dir == 1) #antiparallel
							{
								$m--;  
							}
							else #parallel
							{
								$m++; 
							}
						}

						$m = $st2; 
						for ($k = $st1; $k <= $en1; $k++)
						{
							push @x2, $k;
							push @y2, $m; 
							if ($pre_dir == 1) #antiparallel
							{
								$m--;  
							}
							else #parallel
							{
								$m++; 
							}
						}

					}
					#check if there is any same pair and compute aligment distance
					$ali_dist = -1; 
					for ($k = 0; $k < @x1; $k++)
					{
						for ($m = 0; $m < @x2; $m++)
						{
							if ($x1[$k] == $x2[$m] && $y1[$k] == $y2[$m])
							{
								$ali_dist = 0; 
							}
						}
					}
					if ($ali_dist == -1)
					{
						$ali_dist = $y1[0] - $y2[0]; 
					}
					else
					{
						$corr_ali_num++; 
					}
					if ($pre_dir == 0)
					{
						push @brig_dist, $ali_dist;
					}
					elsif ($pre_dir == 1)
					{
						push @anti_dist, $ali_dist;
					}
					else
					{
						push @para_dist, $ali_dist; 
					}
				}
			}
			
		}
	}

	######################Evaluate alignments for all strand pairs assuming direction is known
	($left, $ali_pre_true) = split(/: /, $ali_pre_true);
	@ali_pre_true = split(/, /, $ali_pre_true); 
	for ($i = 0; $i < @ali_pre_true; $i++)
	{
		$rec = $ali_pre_true[$i]; 
		#vars: 1:id1,2:pos11,3:pos12,4:id2,5:pos21,6:pos22,7:direction,8:start1,9:end1,10:start2,11:end2 
		if ($rec !~ /^\[\((\d+),(\d+),(\d+)\),\((\d+),(\d+),(\d+)\),.+,<(\d+),(\d+),(\d+),(\d+),(\d+)>\]$/)
		{
			print "$rec\n";
			die "format error: $ali_pre_true, $name\n"; 
		}
		$pre_id1 = $1; $pre_id2 = $4; $pre_dir = $7; 
		$start1 = $8; $end1 = $9;  
		$start2 = $10; #$end2 = $11; 
		#print "test position:$start1,$end1,$start2,$end2\n";
		#<STDIN>;
		#check assumptions
		if ($start1 > $end1)
		{
			die "the start position of strand 1 should be less than end position of strand1.\n"; 
		}
		if ($pre_id1 > $pre_id2)
		{
			die "assumption violation: $name"; 
		}
		
		for ($j = 0; $j < @ali_true; $j++)
		{
			$rec = $ali_true[$j]; 
			#vars: 1:id1,2:pos11,3:pos12,4:id2,5:pos21,6:pos22,7:direction,8:start1,9:end1,10:start2,11:end2 
			$rec =~ /^\[\((\d+),(\d+),(\d+)\),\((\d+),(\d+),(\d+)\),.+,<(\d+),(\d+),(\d+),(\d+),(\d+)>\]$/;
			if ($pre_id1 == $1 && $pre_id2 == $4)
			{
					
				if ($pre_dir != $7)
				{
					die "for true pairs, direction should be same,$name"; 
				}

				#compute the alignment distance
				$st1 = $8; $en1 = $9;  
				$st2 = $10; #$en2 = $11; 
				if ($st1 > $en1)
				{
					die "start position is greater than end position:$name "; 
				}
				#generate pairs for both alginments
				@x1 = (); @y1 = ();
				@x2 = (); @y2 = (); 
				if ($pre_dir == 0)
				{
					push @x1, $start1;
					push @y1, $start2; 
					push @x2, $st1; 
					push @y2, $st2; 
				}
				else
				{
					$m = $start2; 
					for ($k = $start1; $k <= $end1; $k++)
					{
						push @x1, $k;
						push @y1, $m; 
						if ($pre_dir == 1) #antiparallel
						{
							$m--;  
						}
						else #parallel
						{
							$m++; 
						}
					}

					$m = $st2; 
					for ($k = $st1; $k <= $en1; $k++)
					{
						push @x2, $k;
						push @y2, $m; 
						if ($pre_dir == 1) #antiparallel
						{
							$m--;  
						}
						else #parallel
						{
							$m++; 
						}
					}

				}
				#check if there is any same pair and compute aligment distance
				$ali_dist = -1; 
				for ($k = 0; $k < @x1; $k++)
				{
					for ($m = 0; $m < @x2; $m++)
					{
						if ($x1[$k] == $x2[$m] && $y1[$k] == $y2[$m])
						{
							$ali_dist = 0; 
						}
					}
				}
				if ($ali_dist == -1)
				{
					$ali_dist = $y1[0] - $y2[0]; 
				}
				else
				{
					$acorr_ali_num++; 
				}
				if ($pre_dir == 0)
				{
					push @abrig_dist, $ali_dist;
				}
				elsif ($pre_dir == 1)
				{
					push @aanti_dist, $ali_dist;
				}
				else
				{
					push @apara_dist, $ali_dist; 
				}
				
			}
			
		}
	}
	##############End of evaluation of all alignments of all true strand pairs######################

	#########################Evaluation of beta-sheet prediction####################################

	#construct true sheets (including isolated bridge or not????) 
	@tsheets = ();

	#include all pairs (including beta-bridge)
	#@true_pairs = @ali_true;

	#filter out isolated bridge
	foreach $rec (@ali_true)
	{
		#vars: 1:id1,2:pos11,3:pos12,4:id2,5:pos21,6:pos22,7:direction,8:start1,9:end1,10:start2,11:end2 
		$rec =~ /^\[\((\d+),(\d+),(\d+)\),\((\d+),(\d+),(\d+)\),.+,<(\d+),(\d+),(\d+),(\d+),(\d+)>\]$/;
		if ( $2 != $3 && $5 != $6)
		{
			push @true_pairs, $rec; 
		}
	}

	while(@true_pairs)
	{
		@sheet = ();
		$seed = shift @true_pairs; 
	#	print "seed: $seed\n"; 
		push @sheet, $seed; 
		while (@true_pairs)
		{
			#check if any pair is connected with the sheet	
			$found = 0;
			@temp_pairs = (); 
			foreach $rec1(@true_pairs)
			{
				#vars: 1:id1,2:pos11,3:pos12,4:id2,5:pos21,6:pos22,7:direction,8:start1,9:end1,10:start2,11:end2 
				$rec1 =~ /^\[\((\d+),(\d+),(\d+)\),\((\d+),(\d+),(\d+)\),.+,<(\d+),(\d+),(\d+),(\d+),(\d+)>\]$/;
				$id1 = $1; $id2 = $4;
				$flag = 0; 
				foreach $rec2 (@sheet)
				{
					$rec2 =~ /^\[\((\d+),(\d+),(\d+)\),\((\d+),(\d+),(\d+)\),.+,<(\d+),(\d+),(\d+),(\d+),(\d+)>\]$/;
					if ($id1 == $1 || $id1 == $4 || $id2 == $1 || $id2 == $4)
					{
						$found = 1; 
						$flag = 1; 
					}
				}
				if ($flag == 1)
				{
					#print "rec chosen: $rec1\n"; 
					push @sheet, $rec1; 
				}
				else
				{
					#print "not chosen: $rec1\n"; 
					push @temp_pairs, $rec1; 
				}
			}

			if ($found == 0 && @sheet > 0)
			{
				#put one sheet into a list
				#print "push ...., @sheet\n"; 
				push @tsheets, join("*",@sheet); 
				last; 
			}
			@true_pairs = @temp_pairs; 
		}
		if (@true_pairs == 0 && @sheet > 0)
		{
			#print "push ...., @sheet\n"; 
			push @tsheets, join("*",@sheet); 
		}
	}

	#print out the true sheets
	$tsheet_num = @tsheets;
	#print "***true sheet number: $tsheet_num\n"; 
	#for($r = 0; $r < @tsheets; $r++)
	#{
	#	print "$tsheets[$r]"; 
	#	print "\n"; 
	#}

	#############construct predicted sheets#######################
	@psheets = ();

	#including isolated bridge or not????) 
	#@pre_pairs = @ali_pre_pair; 

	#filter out isolated bridge
	foreach $rec (@ali_pre_pair)
	{
		#vars: 1:id1,2:pos11,3:pos12,4:id2,5:pos21,6:pos22,7:direction,8:start1,9:end1,10:start2,11:end2 
		$rec =~ /^\[\((\d+),(\d+),(\d+)\),\((\d+),(\d+),(\d+)\),.+,<(\d+),(\d+),(\d+),(\d+),(\d+)>\]$/;
		if ( $2 != $3 && $5 != $6)
		{
			push @pre_pairs, $rec; 
		}
	}

	while(@pre_pairs)
	{
		@sheet = ();
		$seed = shift @pre_pairs; 
	#	print "seed: $seed\n"; 
		push @sheet, $seed; 
		while (@pre_pairs)
		{
			#check if any pair is connected with the sheet	
			$found = 0;
			@temp_pairs = (); 
			foreach $rec1(@pre_pairs)
			{
				#vars: 1:id1,2:pos11,3:pos12,4:id2,5:pos21,6:pos22,7:direction,8:start1,9:end1,10:start2,11:end2 
				$rec1 =~ /^\[\((\d+),(\d+),(\d+)\),\((\d+),(\d+),(\d+)\),.+,<(\d+),(\d+),(\d+),(\d+),(\d+)>\]$/;
				$id1 = $1; $id2 = $4;
				$flag = 0; 
				foreach $rec2 (@sheet)
				{
					$rec2 =~ /^\[\((\d+),(\d+),(\d+)\),\((\d+),(\d+),(\d+)\),.+,<(\d+),(\d+),(\d+),(\d+),(\d+)>\]$/;
					if ($id1 == $1 || $id1 == $4 || $id2 == $1 || $id2 == $4)
					{
						$found = 1; 
						$flag = 1; 
					}
				}
				if ($flag == 1)
				{
					#print "rec chosen: $rec1\n"; 
					push @sheet, $rec1; 
				}
				else
				{
					#print "not chosen: $rec1\n"; 
					push @temp_pairs, $rec1; 
				}
			}

			if ($found == 0 && @sheet > 0)
			{
				#put one sheet into a list
				#print "push ...., @sheet\n"; 
				push @psheets, join("*",@sheet); 
				last; 
			}
			@pre_pairs = @temp_pairs; 
		}
		if (@pre_pairs == 0 && @sheet > 0)
		{
			#print "push ...., @sheet\n"; 
			push @psheets, join("*",@sheet); 
		}
	}

	#print out the true sheets
	$psheet_num = @psheets;
	#print "****predicted sheet number: $psheet_num\n"; 
	#for($r = 0; $r < @psheets; $r++)
	#{
	#	print "$psheets[$r]"; 
	#	print "\n"; 
	#}
	push @true_sheet_num,$tsheet_num;
	push @pre_sheet_num, $psheet_num; 
}



print OUTPUT "\n#predicted pair num VS true pair num per chain\n";
print OUTPUT "pre_num <- c(", join(",",@pre_pair_num), ")\n";
print OUTPUT "true_num <- c(", join(",", @true_pair_num), ")\n"; 

print "pair precision: ", $corr_total / $pre_total, "\n"; 
print "pair recall: ", $corr_total / $true_total, "\n"; 

print "naive pair precision: ", $naive_corr_total / $naive_pre_total, "\n";
print "naive pair recall: ", $naive_corr_total / $true_total, "\n"; 

print "accuracy of strand pairing( (TP+TN) /(TP+TN+FP+FN) (: ", ($tp+$tn) / ($tp+$tn+$fp+$fn), "\n"; 


#check some invariance
if ($true_total != $t_brig + $t_anti + $t_para)
{
	die "total number of true pair is not consistent.\n"; 
}

print "pairing direction accuracy:", ($c_brig + $c_anti + $c_para) / $corr_total, "\n"; 
print "proportion of bridge, antiparallel, parallel among all correct: ", $c_brig / $corr_total, ",", $c_anti / $corr_total, ",", $c_para / $corr_total, "\n"; 
$rel_all = $c_anti + $c_para + $c_brig;  
print "relative proportion of bridge, antiparallel, parallel: ", $c_brig / $rel_all, ",", $c_anti / $rel_all, ",", $c_para / $rel_all, "\n"; 
print "relative proportion of antiparallel and parallel: ", ($c_anti)/($c_anti+$c_para), ",", ($c_para)/($c_anti+$c_para),"\n";

print OUTPUT "alignment distance for correctly predicted bridge, antiparallel and parallel:\n"; 
print OUTPUT "cbrig.dist <- c(", join(",",@brig_dist), ")\n";
print OUTPUT "canti.dist <- c(", join(",", @anti_dist), ")\n"; 
print OUTPUT "cpara.dist <- c(", join(",", @para_dist), ")\n"; 
$size1 = @brig_dist;
$size2 = @anti_dist;
$size3 = @para_dist;  
$size = $size1 + $size2 + $size3; 
print "alignment accuracy for correctly predicted pairs and direction:", $corr_ali_num / $size, "\n"; 

print OUTPUT "alignment distance for all true bridge, antiparallel and parallel:\n"; 
print OUTPUT "abrig.dist <- c(", join(",",@abrig_dist), ")\n";
print OUTPUT "aanti.dist <- c(", join(",", @aanti_dist), ")\n"; 
print OUTPUT "apara.dist <- c(", join(",", @apara_dist), ")\n"; 
$size1 = @abrig_dist;
$size2 = @aanti_dist;
$size3 = @apara_dist;  
$size = $size1 + $size2 + $size3; 
print "alignment accuracy for all true pairs and assuming direction is known:", $acorr_ali_num / $size, "\n"; 

#print out the sheet number
print OUTPUT "tsheet.num <-c(", join(",", @true_sheet_num), ")\n";
print OUTPUT "psheet.num <-c(", join(",", @pre_sheet_num), ")\n"; 
close OUTPUT; 











