#!/usr/bin/perl -w

###########################################################################################################
#Check the chains in 9-line datasets and output following information in a table
#output format 1: 
# Name	Length	Strand_Length	Strand_Percentage	H-bond num	Percentage_with_bonds  Consistency_Notes
#output 2: selected chains (9-line format)
#Input: 9-line dataset
#output: statistics table, selected dataset
#Author: Jianlin Cheng
#Date: 9/1/2004
###########################################################################################################
# Command line handling: exactly eight positional arguments are required, in
# the order given by the usage message. Anything else aborts the run.
unless (@ARGV == 8)
{
	die "need 8 parameters: input dataset, output statistics file, output dataset, min strand num(10), max strand num(150, or 100), min strand percentage(0.1), min h-bond percentage(0.8, 0.85, 0.9), maximum sequnce length(250, 600).\n"; 
}

# Take all eight values off @ARGV in one step; splice removes them from the
# array, so @ARGV is left empty afterwards.
($input, $stat, $output,
	$min_strand_num, $max_strand_num,
	$min_strand_per, $min_hbond_per,
	$max_seq_length) = splice(@ARGV, 0, 8);

# Open all three files before any work is done so that a bad path is reported
# immediately, with the path and the system error in the message.
# The three-argument form of open takes each file name literally: characters
# such as ">" or "|" in a name can no longer change the open mode.
# The handle names STAT and OUTPUT are used by the code further down, so they
# are kept unchanged.
open(INPUT, "<", $input) || die "can't open input file $input: $!\n";
open(STAT, ">", $stat) || die "can't open statistics file $stat: $!\n";
open(OUTPUT, ">", $output) || die "can't open output dataset $output: $!\n";

# The whole dataset is read into memory; the record loop consumes @content.
@content = <INPUT>;
close INPUT; 

# Header row of the statistics table. Columns: name, length, strand length,
# strand percentage, h-bond count, percentage with bonds, consistency notes.
print STAT "name\tlen\ts_len\ts_percent\th_bond_num\th_bond_percent\tcon_notes\n"; 

# Analyse the dataset record by record.
#
# A record is eight data lines followed by one separator line:
#   name, length, sequence, secondary structure (one label per residue),
#   two partner lines (one entry per residue), and two further lines
#   ($sa, $xyz) that are only copied through to the output dataset.
#
# For every record one row is written to STAT. The record is also written to
# OUTPUT when it passes all thresholds given on the command line and no
# consistency error was found. A record whose declared length disagrees with
# the number of entries on the ss/bs1/bs2 lines aborts the run.
$count = 0; 
while(@content)
{
	# Fewer than eight lines left: either stray blank lines after the last
	# record (harmless, stop here) or a truncated record (fatal). Without
	# this check the leftover lines were parsed as a record with undefined
	# fields and the run ended in "Illegal division by zero".
	if (@content < 8)
	{
		last unless grep { /\S/ } @content;
		die "incomplete record at end of input: ", scalar(@content), " line(s) left, 8 needed\n";
	}

	# The eight data lines of the record, in file order.
	my ($name, $len, $seq, $ss, $bs1, $bs2, $sa, $xyz) = splice(@content, 0, 8);
	chomp($name, $len, $seq, $ss, $bs1, $bs2, $sa, $xyz);
	shift @content;	# separator line; may be missing after the last record

	my @sec = split(/\s+/, $ss); 
	my @hb1 = split(/\s+/, $bs1);
	my @hb2 = split(/\s+/, $bs2);
	if ($len != @sec || $len != @hb1 || $len != @hb2)
	{
		die "$name, length doesn't match with length of ss, bs1, bs2\n"; 
	}

	my $s_num = 0;		# residues labelled E or B (the s_len column)
	my $h_bond_num = 0;	# partner entries > 0, counted over both partner lines
	my $a_num = 0;		# E/B residues with at least one partner
	my $no_num = 0;		# E/B residues with no partner at all
	my $note = "";		# accumulated consistency messages
	my $error = 0;		# set by any consistency failure; blocks selection
	for (my $i = 0; $i < $len; $i++)
	{
		# Only residues labelled E or B are examined.
		next unless ($sec[$i] eq "E" || $sec[$i] eq "B");

		$s_num++; 
		my $index = $i + 1;	# partner entries are 1-based residue numbers
		my $bp1 = $hb1[$i];
		my $bp2 = $hb2[$i];
		if ($bp1 > 0)
		{
			$h_bond_num++; 
		}
		if ($bp2 > 0)
		{
			$h_bond_num++; 
		}
		if ($bp1 > 0 || $bp2 > 0)
		{
			$a_num++; 
		}

		#consistency checking
		if ($bp1 <= 0 && $bp2 <= 0)
		{
			# Not an error on its own; only counted and summarised below.
			$no_num++; 
		}
		if ($bp1 < 0 || $bp1 > $len)
		{
			$note .= "$bp1 of $index out of bounds, "; 
			$error = 1; 
		}
		if ($bp2 < 0 || $bp2 > $len)
		{
			$note .= "$bp2 of $index out of bounds, "; 
			$error = 1; 
		}
		if ($bp1 == $bp2 && $bp1 > 0)
		{
			$note .= "$index: bp1($bp1)=bp2($bp2), "; 
			$error = 1; 
		}

		#cross checking: a partner must list this residue in one of its
		#own two partner entries
		if ($bp1 > 0 && $bp1 <= $len)
		{
			if ($index != $hb1[$bp1-1] && $index != $hb2[$bp1-1])
			{
				$note .= "$index: no $bp1 -> $index, "; 
				$error = 1; 
			}
		}
		if ($bp2 > 0 && $bp2 <= $len)
		{
			if ($index != $hb2[$bp2-1] && $index != $hb1[$bp2-1] )
			{
				$note .= "$index: no $bp2 -> $index, "; 
				$error = 1; 
			}
		}
	}
	if ($no_num > 0)
	{
		$note .= "$no_num aa have no h-bonds"; 
	}
	if ($note eq "")
	{
		$note = "ok"; 
	}

	# Both ratios are cut to five characters for the table and for the
	# threshold tests. A length of 0 yields ratio 0 instead of a crash.
	my $percent1 = ($len > 0) ? _truncate_ratio($s_num / $len) : 0;
	my $percent2 = ($s_num > 0) ? _truncate_ratio($a_num / $s_num) : 0;
	
	print STAT "$name\t$len\t$s_num\t", $percent1, "\t\t", $h_bond_num, "\t\t", $percent2, "\t", "$note\n"; 
	if ($s_num >= $min_strand_num && $s_num <= $max_strand_num && $percent1 >= $min_strand_per && $percent2 >= $min_hbond_per && $error == 0 && $len <= $max_seq_length)
	{
		$count++; 
		print OUTPUT "$name\n$len\n$seq\n$ss\n$bs1\n$bs2\n$sa\n$xyz\n\n"; 
	}

}
# Buffered write errors (for example a full disk) only surface at close.
close(STAT) || die "can't close statistics file: $!\n";
close(OUTPUT) || die "can't close output dataset: $!\n";
print "number of selected sequences: $count\n"; 

# Cut a ratio between 0 and 1 to its first five characters (e.g. "0.123").
# Perl writes values below 0.0001 in exponent form ("4.28e-05"); cutting that
# text would give "4.28", so such values are reported as "0.000" instead.
sub _truncate_ratio
{
	my ($ratio) = @_;
	my $text = "$ratio";
	return "0.000" if ($text =~ /e-/i);
	return substr($text, 0, 5);
}





