#!/usr/bin/perl -w
##########################################################################
#generate beta-residue, strand, sheet statistics including: 
# total number of residues, total number of beta residues, total number of strands, total number of strand pair, total number of partners
#strand length, the distance between pair strands, distance between pair residue
#beta-sheet nubmer? (not count, may do it in C++ code)
#
# Modifided from beta_stat.pl: add: count the number of parters for each strand
#and the length of itself and its parter

#Addition: count the percentage of strands with beta-buldge

#Author: Jianlin Cheng
#Date: Dec. 08, 2004
##########################################################################

if (@ARGV != 2)
{
	die "need two parameters: input data set file(9-line with title), output file.\n"; 
}

$input_file = shift @ARGV;
$output_file = shift @ARGV;

open(INPUT, "$input_file") || die "can't open input file.\n";
open(OUTPUT, ">$output_file") || die "can't create output file.\n";

@content = <INPUT>;
close INPUT; 
shift @content;

$tot_aa = 0;
$tot_res = 0;
$tot_strand = 0;
$buldge_strand_pair = 0; 
$buldge1 = 0;
$buldge2 = 0; 
$buldge_max = 0; 
$tot_partners = 0;
$tot_strand_pair = 0; 

@strand_dist = ();
@res_dist = (); 

@strand_num = ();
@strand_pair_num = ();
@res_num = ();
@res_part_num = (); 
@strand_length = (); 

#strand pair statistics, for each strand: (partner_num, length, partner1_len, partner2_len,...)
#format: two dimension array: row (strand index, start from 0), col1(length),col2(partner num),col3(length of partner1)...)
@pair_stat = (); 

$strand_idx = 0;
while(@content)
{
	$name = shift @content;
	$length = shift @content;
	chomp $length; 
	$seq = shift @content;
	chomp $seq; 
	@seq = split(/\s+/, $seq); 
	$ss = shift @content;
	chomp $ss; 
	@ss = split(/\s+/, $ss); 
	$bp1 = shift @content;
	chomp $bp1; 
	@bp1 = split(/\s+/, $bp1); 
	$bp2 = shift @content;
	chomp $bp2; 
	@bp2 = split(/\s+/, $bp2); 
	shift @content;
	shift @content;
	shift @content; 

	#error check
	if ($length != @seq || $length != @ss || $length != @bp1 || $length != @bp2 )
	{
		die "length not match.\n"; 
	}
	$tot_aa += $length; 

	$seq_res_num = 0; 
	$seq_part_num = 0; 

	$in_strand = 0; 
	@strand_start = ();
	@strand_end = ();
	for ($i = 0; $i < $length; $i++)
	{
		$sec = $ss[$i]; 

		#count partner number and distance
		if ($bp1[$i] > 0 && ($sec eq "E" || $sec eq "B") )
		{
			$tot_partners++; 
			$seq_part_num++; 
			push @res_dist, abs($bp1[$i] - $i - 1); 
		}
		if ($bp2[$i] > 0 && ($sec eq "E" || $sec eq "B"))
		{
			$tot_partners++; 
			$seq_part_num++; 
			push @res_dist, abs($bp2[$i] - $i - 1); 
		}

		#generate strand
		if ( $sec eq "E" || $sec eq "B")
		{
			$tot_res++; 
			$seq_res_num++; 
			if ($in_strand == 0)
			{
				push @strand_start, $i; 
				$in_strand = 1; 
			}

			if ($i == $length - 1) #at the end, just in case, shouldn't happen. 
			{
				push @strand_end, $i; 
			}
		}
		else
		{
			if ($in_strand == 1)
			{
				push @strand_end, $i - 1; 
				$in_strand = 0; 
			}
		}
	}
	push @res_num, $seq_res_num; 
	#number of beta-residue pairs are counted twice
	push @res_part_num, $seq_part_num / 2; 
	$st_num = @strand_start; 
	if ($st_num != @strand_end)
	{
		die "size of strand_start doesn't equal size of strand_end: $name"; 
	}
	push @strand_num, $st_num; 
	for ($i = 0; $i < @strand_start; $i++)
	{
		#generate strand length
		push @strand_length, $strand_end[$i] - $strand_start[$i] + 1; 
	}

	#cout strand pair number and distance
	$size = @strand_start;
	$tot_strand += $size; 
	$st_bind_num = 0; 
	for ($i = 0; $i < $size; $i++)
	{
		$a_start = $strand_start[$i];
		$a_end = $strand_end[$i]; 
		for ($j = $i+1; $j < $size; $j++)
		{
			$b_start = $strand_start[$j]; 
			$b_end = $strand_end[$j]; 
			#check if two strands are paired (partner index start from 1, so add 1)
			$range1 = $b_start + 1; 
			$range2 = $b_end + 1; 
			$paired = 0; 
			for ($k = $a_start; $k <= $a_end; $k++)
			{
				if ( ($range1 <= $bp1[$k] && $bp1[$k] <= $range2) || ($range1 <= $bp2[$k] && $bp2[$k] <= $range2) )
				{
					$paired = 1; 
				}
				
			}
			if ($paired == 1)
			{
				$tot_strand_pair++; 
				$st_bind_num++; 
				#calculate the distance between two strands
				$sdist = $strand_start[$j] - $strand_end[$i]; 
				push @strand_dist, $sdist; 

				#compute overlapping
				$start_aa = -1; 
				$start_bb = -1;
				$end_aa = -1;
				$end_bb = -1; 
				
				for ($k = $a_start; $k <= $a_end; $k++)
				{
					if ( ($range1 <= $bp1[$k] && $bp1[$k] <= $range2) )
					{
						if ($start_aa == -1)
						{
							$start_aa = $k; 
							$start_bb = $bp1[$k]; 
						}
						$end_aa = $k; 
						$end_bb = $bp1[$k]; 
					}
					elsif ($range1 <= $bp2[$k] && $bp2[$k] <= $range2 )
					{
						if ($start_aa == -1)
						{
							$start_aa = $k; 
							$start_bb = $bp2[$k]; 
						}
						$end_aa = $k; 
						$end_bb = $bp2[$k]; 
					}
				
				}
		
				#count number of buldge strand pair
				if ( abs($end_aa -$start_aa) != abs($end_bb - $start_bb) )
				{
					$buldge_strand_pair++; 
				}
				if ( abs(abs($end_aa -$start_aa) - abs($end_bb - $start_bb)) > 1 )
				{
					$buldge2++; 
				}
				if ( abs(abs($end_aa -$start_aa) - abs($end_bb - $start_bb)) > $buldge_max )
				{
					$buldge_max = abs(abs($end_aa -$start_aa) - abs($end_bb - $start_bb)); 
				}
			}
		}
	}
	push @strand_pair_num, $st_bind_num; 

	#count the partner number of beta-strands
	for ($i = 0; $i < $size; $i++)
	{
		$a_start = $strand_start[$i];
		$a_end = $strand_end[$i]; 
		$pair_stat[$strand_idx][0] = $a_end - $a_start + 1; 
		#count number of partners
		$partner_num = 0; 
		@partner_len = (); 	
		for ($j = 0; $j < $size; $j++)
		{
			$b_start = $strand_start[$j]; 
			$b_end = $strand_end[$j]; 
			#check if two strands are paired (partner index start from 1, so add 1)
			$range1 = $b_start + 1; 
			$range2 = $b_end + 1; 
			$paired = 0; 
			for ($k = $a_start; $k <= $a_end; $k++)
			{
				if ( ($range1 <= $bp1[$k] && $bp1[$k] <= $range2) || ($range1 <= $bp2[$k] && $bp2[$k] <= $range2) )
				{
					$paired = 1; 
				}
				
			}
			if ($paired == 1)
			{
				$partner_num++; 
				push @partner_len, $b_end - $b_start + 1; 
			}
		}
		$pair_stat[$strand_idx][1] = $partner_num;
		for ($m = 0; $m < $partner_num; $m++)
		{
			$pair_stat[$strand_idx][2+$m] = $partner_len[$m];
		}
		$strand_idx++; 
	}
}

$tot_partners /= 2; 

#output statistics


print OUTPUT "tot_aa = $tot_aa\n";
print OUTPUT "tot_beta_res = $tot_res\n";
print OUTPUT "tot_strand_num = $tot_strand\n";
print OUTPUT "tot_beta_pair = $tot_partners\n";
print OUTPUT "tot_strand_pair = $tot_strand_pair\n";
print OUTPUT "strand_pair_dist = c(", join(",", @strand_dist), ")\n";
print OUTPUT "beta_res_dist = c(", join(",", @res_dist), ")\n"; 
print OUTPUT "strand_num = c(", join(",", @strand_num), ")\n";
print OUTPUT "strand_pair_num = c(", join(",", @strand_pair_num), ")\n";
print OUTPUT "beta_res_num = c(", join(",", @res_num), ")\n";
print OUTPUT "res_partner_num = c(", join(",", @res_part_num), ")\n";
print OUTPUT "strand_length = c(", join(",", @strand_length), ")\n"; 

#output the strand pairing statistics
print OUTPUT "Strand pairing stats:\n"; 

#summary stats for parter num: 0,1,2,3,4+
for ($i = 0; $i <= 4; $i++)
{
	#count
	$psum[$i][0] = 0; 
	#min
	$psum[$i][1] = 1000; 
	#ave
	$psum[$i][2] = 0; 
	#max
	$psum[$i][3] = 0; 
}

for ($i = 0; $i < $strand_idx; $i++)
{
	$len = $pair_stat[$i][0];
	$num = $pair_stat[$i][1]; 
	print OUTPUT "$len, $num";

	$idx = $num; 
	if ($idx > 4)
	{
		$idx = 4; 
	}
	$psum[$idx][0]++; 
	if ($len < $psum[$idx][1])
	{
		$psum[$idx][1] = $len; 
	}
	$psum[$idx][2] += $len;
	if ($len > $psum[$idx][3])
	{
		$psum[$idx][3] = $len; 
	}

	for ($j = 0; $j < $num; $j++)
	{
		print OUTPUT ", $pair_stat[$i][2+$j]"; 
	}
	print OUTPUT "\n"; 
}
for ($i = 0; $i <= 4; $i++)
{
	$psum[$i][2] /= $psum[$i][0]; 
}
print OUTPUT "\nsummary of pair num 0,1,2,3,4+ (num,min,ave,max):\n"; 
for ($i = 0; $i <= 4; $i++)
{
	print OUTPUT $i," ",  $psum[$i][0]," ",  $psum[$i][1]," ", $psum[$i][2]," ", $psum[$i][3], "\n"; 
}

close OUTPUT; 

print "\n statistics of beta-buldge:\n";
print "total pair: $tot_strand_pair, buldge_pair: $buldge_strand_pair, percentage: ", $buldge_strand_pair / $tot_strand_pair, "\n"; 
print "buldge with 2 aa: ", $buldge2 / $tot_strand_pair, "\n"; 
print "max buldge: ", $buldge_max, "\n"; 
