#!/usr/bin/perl -w

###########################################################################################################
#Check the chains in 9-line datasets and output following information in a table
#output format 1: 
# Name	Length	Strand_Length	Strand_Percentage	H-bond num	Percentage_with_bonds  Consistency_NOtes
#Input: 9-line dataset
#output: statistics table
#Author: Jianlin Cheng
#Date: 9/1/2004
#Addition: output the number of amino acid in strand and number of h-bonds, which is used to do regression to estimate the number of h-bonds from number of strand. 9/14/2004 
###########################################################################################################
if (@ARGV != 2)
{
	die "need 2 parameters: input dataset, output statistics file\n"; 
}

$input = shift @ARGV;
$stat = shift @ARGV;

open(INPUT, "$input") || die "can't open input file\n";
open(STAT, ">$stat");
@content = <INPUT>;
close INPUT; 

# Name	Length	Strand_Length	Strand_Percentage	H-bond num	Percentage_with_bonds  Consistency_Notes
print STAT "name\tlen\ts_len\ts_percent\th_bond_num\th_bond_percent\tcon_notes\n"; 

#linear distance distribution of h-bond 
@bond_dist = ();
for ($i = 0; $i < 500; $i++)
{
	$bond_dist[$i] = 0; 
}

@s_num_vec = ();
@h_num_vec = (); 

while(@content)
{
	$name = shift @content; 
	chomp $name; 
	$len = shift @content;
	chomp $len; 
	$seq = shift @content; 
	chomp $seq; 
	$ss = shift @content;
	chomp $ss; 
	$bs1 = shift @content; 
	chomp $bs1; 
	$bs2 = shift @content; 
	chomp $bs2; 
	$sa = shift @content;
	chomp $sa; 
	$xyz = shift @content; 
	chomp $xyz; 
	shift @content; 

	@sec = split(/\s+/, $ss); 
	@hb1 = split(/\s+/, $bs1);
	@hb2 = split(/\s+/, $bs2);
	if ($len != @sec || $len != @hb1 || $len != @hb2)
	{
		die "$name, length doesn't match with length of ss, bs1, bs2\n"; 
	}
	$s_num = 0;
	$h_bond_num = 0; 
	$a_num = 0; 
	$no_num = 0; 
	$note = ""; 
	$error = 0; 
	for ($i = 0; $i < $len; $i++)
	{
		if ($sec[$i] eq "E" || $sec[$i] eq "B")
		{
			$s_num++; 
			if ($hb1[$i] > 0)
			{
				$h_bond_num++; 
			}
			if ($hb2[$i] > 0)
			{
				$h_bond_num++; 
			}
			$index = $i + 1; 
			$bp1 = $hb1[$i];
			$bp2 = $hb2[$i];
			if ($bp1 > 0 || $bp2 > 0)
			{
				$a_num++; 
			}

			#consistency checking
			if ($bp1 <= 0 && $bp2 <= 0)
			{
				#$note .= "$index no h-bond, "; 	
				$no_num++; 
			}
			if ($bp1 < 0 || $bp1 > $len)
			{
				$note .= "$bp1 of $index out of bounds, "; 
				$error = 1; 
			}
			if ($bp2 < 0 || $bp2 > $len)
			{
				$note .= "$bp2 of $index out of bounds, "; 
				$error = 1; 
			}
			if ($bp1 == $bp2 && $bp1 > 0)
			{
				$note .= "$index: bp1($bp1)=bp2($bp2), "; 
				$error = 1; 
			}
			#cross checking
			if ($bp1 > 0 && $bp1 <= $len)
			{
				if ($index != $hb1[$bp1-1] && $index != $hb2[$bp1-1])
				{
					$note .= "$index: no $bp1 -> $index, "; 
					$error = 1; 
				}
			}
			if ($bp2 > 0 && $bp2 <= $len)
			{
				if ($index != $hb2[$bp2-1] && $index != $hb1[$bp2-1] )
				{
					$note .= "$index: no $bp2 -> $index, "; 
					$error = 1; 
				}
			}
			if ($error != 1)
			{
				if ($bp1 > 0)
				{
					$separation = abs($bp1 - $index); 
					$bond_dist[$separation - 1]++; 
				}
				if ($bp2 > 0)
				{
					$separation = abs($bp2 - $index); 
					$bond_dist[$separation - 1]++; 
				}
			}
		}
	}
	if ($no_num > 0)
	{
		$note .= "$no_num aa have no h-bonds"; 
	}
	if ($note eq "")
	{
		$note = "ok"; 
	}
	$percent1 = $s_num / $len; 
	$percent1 = substr($percent1, 0, 5); 
	$percent2 = 0;
	if ($s_num > 0)
	{
		$percent2 = $a_num / $s_num; 
		$percent2 = substr($percent2, 0, 5); 
	}

	print STAT "$name\t$len\t$s_num\t", $percent1, "\t\t", $h_bond_num, "\t\t", $percent2, "\t", "$note\n"; 
	push @s_num_vec, $s_num;
	push @h_num_vec, $h_bond_num; 
}

#print out the distance distribution of h-bond
print STAT "\nH-bond distance distribution:\n"; 
for ($i = 0; $i < 500; $i++)
{
	print STAT $i+1 , "	", $bond_dist[$i], "\n"; 
}

print STAT "\n****************************************************\n"; 
print STAT "\nTable of AA num in beta-strand and H-bond num\n"; 
print STAT "\nAANum    BondNum\n"; 

for ($i = 0; $i < @s_num_vec; $i++)
{
	print STAT "$s_num_vec[$i]       $h_num_vec[$i]\n"; 
}


close STAT; 





