#!/usr/bin/perl -w
###############################################################################
# Name %n  : create-features.pl
###############################################################################

sub getlib {
  my $GLOBAL_PATH='/home/casp11/casp12/tools/dncon1.1/';
  my $libdir = $GLOBAL_PATH.'libs/perl';
  return $libdir;
}

use lib getlib();
use ProteinUtils qw(estimate_joint_entropy calc_entropy atchley_factor scaled_log_odds_n_inf scld_residue_residue_contacts scld_lu_contact_potential scld_levitt_contact_potential scld_braun_contact_potential scaled_ordered_mean pssm_counts);
use MyMathUtils qw(calc_cosine calc_pearson_r);
use strict;

use constant MIN_SEP => "12";
use constant MAX_SEP => "120";


if (@ARGV ne 6 ) {
  print "Usage: $0 <fasta> <pssm> <ss_sa> <dist> <threshold> <window_size(odd number)>\n";
  print "   Ex: $0 1but.fasta 1but.pssm 1but.ss_sa 1but.dist 8 15\n";
  exit;
}

my $fasta_fname = $ARGV[0];
my $pssm_fname = $ARGV[1];
my $ss_sa_fname = $ARGV[2];
my $dist_fname = $ARGV[3];
my $thresh = $ARGV[4];
my $window_size = $ARGV[5];

my $id = $fasta_fname;
$id =~ s/\..*//;
$id =~ s/^.*\///;

open FASTA, "<" . $fasta_fname || die "Couldn't open fasta file\n";
my @lines = <FASTA>;
chomp(@lines);
close FASTA;

shift @lines;
my $seq = join('', @lines);
$seq =~ s/ //g;
my @seq = split(//, $seq);

# Calculate scalars and acthley factors
my $aas = "ACDEFGHIKLMNPQRSTVWY";
my @scalar = ();
my @offset = ();
my @res = split(//, $aas);

for(my $i = 1; $i <= 5; $i++) {
  my $min = 15;
  my $max = - 15;
  for(my $j = 0; $j < @res; $j++) {

    if(atchley_factor($res[$j],$i) < $min) {
      $min = atchley_factor($res[$j], $i);
    }
    if(atchley_factor($res[$j],$i) > $max) {
      $max = atchley_factor($res[$j], $i);
    }

  }

  $scalar[$i] = (1/($max-$min));
  $offset[$i] = ($min/($min-$max));

}

my @factors = ();
for (my $i = 0; $i < @seq; $i++) {
  $factors[$i] = [];
  unshift(@{$factors[$i]}, 0);
}

# Get all factors, once
for (my $i = 0; $i < @seq; $i++) {
   for(my $j = 1; $j <= 5; $j++) {
    $factors[$i][$j] = atchley_factor($seq[$i], $j) * $scalar[$j] + $offset[$j];
   }
}

my $seq_len = length($seq);

# Determine composition of protein by amino acid type
my @comp_counts = ();
for(my $i = 0; $i < length($aas); $i++) {
  $comp_counts[$i] = 0;
}
for(my $i = 0; $i < @seq; $i++) {
  $comp_counts[index($aas, $seq[$i])]++;
}
for(my $i = 0; $i < length($aas); $i++) {
  $comp_counts[$i] = $comp_counts[$i]/$seq_len;
}

open PSSM, "<" . $pssm_fname || die "Couldn't open pssm file\n";
@lines = <PSSM>;
chomp(@lines);
close PSSM;

my ($odds_ref, $inf_ref) = scaled_log_odds_n_inf(@lines);
my @odds = @{$odds_ref};
my @inf = @{$inf_ref};
my @joint_entropy = estimate_joint_entropy(@lines);
my @pssm_counts = pssm_counts(@lines);

my @pssm_sums = ();
for(my $i = 0; $i < @pssm_counts; $i++) {
  my $sum = 0; 
  for(my $j = 0; $j < @{$pssm_counts[$i]}; $j++) {
    $sum += $pssm_counts[$i][$j];
  }
  $pssm_sums[$i] = $sum;
}

unshift(@odds, "");
unshift(@inf, "");
unshift(@joint_entropy, "");
unshift(@pssm_counts, "");
unshift(@pssm_sums, "");

open SS_SA, "<" . $ss_sa_fname || die "Couldn't open ss_sa file\n";
my @ss_sa=<SS_SA>;
chomp @ss_sa;
my @ss = split(//, $ss_sa[2]);
my @sa = split(//, $ss_sa[3]);
close SS_SA;

my $beta_count = 0; my $alpha_count = 0;
foreach my $ss_t (@ss) {
  if($ss_t eq 'E') {
    $beta_count++;
  } 
  if($ss_t eq 'H') {
    $alpha_count++;
  }
}

my $exposed_count = 0;
foreach my $sa_t (@sa) {
  if($sa_t eq 'b') {
    $exposed_count++;
  } 
}

unshift(@factors,"");
unshift(@seq,"");
unshift(@ss,"");
unshift(@sa,"");

open DIST, "<" . $dist_fname || die "Couldn't open distance file\n";
@lines = <DIST>;
chomp(@lines);
close DIST;

my @distances = ();
for(my $i = 0; $i <= $seq_len; $i++) {
  $distances[$i] = ();
  for(my $j = 0; $j <= $seq_len; $j++) {
    $distances[$i][$j] = 100;
  }
}

foreach my $line_t (@lines) {
  my ($i, $j, $distance) = split(/ /, $line_t);
  $distances[$i][$j] = $distance;
  $distances[$j][$i] = $distance;
}

for(my $i = 1 + int($window_size/2); $i < $seq_len - int($window_size/2) - MIN_SEP - 2; $i++) { 
  for(my $j = $i + MIN_SEP; $j < $i + MAX_SEP && $j < $seq_len-int($window_size/2); $j++) {

    my $class = "0";
    if($distances[$i][$j] < $thresh) {
      $class = "1";
    }

    print "# $class, $id $i <-> $j, $i $j, " . ($j - $i) . ", $seq_len\n";
  
    my @feature = ();
    # Add features for win 1
    for(my $k = -1 * int($window_size/2); $k < int($window_size/2); $k++) {
 
      for(my $factor = 1; $factor <= 5; $factor++) {
        push @feature, sprintf("%.2f", $factors[$i+$k][$factor]);
      }

      if ($ss[$i+$k]  eq "H") {
        push @feature, 1;
        push @feature, 0;       
        push @feature, 0;       
      } elsif ($ss[$i+$k] eq "E") {
        push @feature, 0;
        push @feature, 1;       
        push @feature, 0;       
      } else {
        push @feature, 0;
        push @feature, 0;       
        push @feature, 1;       
      }

      if ($sa[$i+$k] eq "e") {
        push @feature, 1;
        push @feature, 0;       
      } else {
        push @feature, 0;
        push @feature, 1;       
      }

      # PSSM features
      push @feature, $inf[$i+$k];
      for(my $l = 0; $l < @{$odds[$i+$k]}; $l++) {
        push @feature, $odds[$i+$k][$l];
      }

    }
    # Add features for win 2
    for(my $k = -1 * int($window_size/2); $k < int($window_size/2); $k++) {

      for(my $factor = 1; $factor <= 5; $factor++) {
        push @feature, sprintf("%.2f", $factors[$j+$k][$factor]);
      }

      if ($ss[$j+$k]  eq "H") {
        push @feature, 1;
        push @feature, 0;       
        push @feature, 0;       
      } elsif ($ss[$j+$k] eq "E") {
        push @feature, 0;
        push @feature, 1;       
        push @feature, 0;       
      } else {
        push @feature, 0;
        push @feature, 0;       
        push @feature, 1;       
      }

      if ($sa[$j+$k] eq "e") {
        push @feature, 1;
        push @feature, 0;       
      } else {
        push @feature, 0;
        push @feature, 1;       
      }

      # PSSM features
      push @feature, $inf[$j+$k];
      for(my $l = 0; $l < @{$odds[$j+$k]}; $l++) {
        push @feature, $odds[$j+$k][$l];
      }

    }

    # Add features for small window at midpoint
    for(my $k = -2; $k <3; $k++) {
      for(my $factor = 1; $factor <= 5; $factor++) {
        push @feature, sprintf("%.2f", $factors[int(($i+$j)/2)+$k][$factor]);
      }

      if ($ss[int(($i+$j)/2)+$k]  eq "H") {
        push @feature, 1;
        push @feature, 0;       
        push @feature, 0;       
      } elsif ($ss[int(($i+$j)/2)+$k] eq "E") {
        push @feature, 0;
        push @feature, 1;       
        push @feature, 0;       
      } else {
        push @feature, 0;
        push @feature, 0;       
        push @feature, 1;       
      }

      if ($sa[int(($i+$j)/2)+$k] eq "e") {
        push @feature, 1;
        push @feature, 0;       
      } else {
        push @feature, 0;
        push @feature, 1;       
      }

      # PSSM features
      push @feature, $inf[int(($i+$j)/2)+$k];
      for(my $l = 0; $l < @{$odds[int(($i+$j)/2)+$k]}; $l++) {
        push @feature, $odds[int(($i+$j)/2)+$k][$l];
      }

    }

    # Pair wise features
    push @feature, (sprintf "%.2f", scld_residue_residue_contacts($seq[$i], $seq[$j]));
    push @feature, (sprintf "%.2f", scld_lu_contact_potential($seq[$i], $seq[$j]));
    push @feature, (sprintf "%.2f", scld_levitt_contact_potential($seq[$i], $seq[$j]));
    push @feature, (sprintf "%.2f", scld_braun_contact_potential($seq[$i], $seq[$j]));
    push @feature, (sprintf "%.2f", $joint_entropy[$i][$j]);

    push @feature, (sprintf "%.2f", (calc_pearson_r($pssm_counts[$i],$pssm_counts[$j])+1)/2.0);

    # Calculate the cosine between angles but since some counts in pssm are missing, check
    # and add two feature switch to use the cosine value or not
    if($pssm_sums[$i] > 0 && $pssm_sums[$j] > 0) {
      push @feature, 0;
      push @feature, 1;
      push @feature, (sprintf "%.2f", (calc_cosine($pssm_counts[$i], $pssm_counts[$j])));
    } else {
      push @feature, 1;
      push @feature, 0;
      push @feature, 0;
    
    }

    # Calculate order weights
    for(my $order = 0; $order < 4; $order++) {
      push @feature, (sprintf "%.2f", scaled_ordered_mean(\@ss, $i, $j, $order, "H"));
      push @feature, (sprintf "%.2f", scaled_ordered_mean(\@ss, $i, $j, $order, "E"));
      push @feature, (sprintf "%.2f", scaled_ordered_mean(\@ss, $i, $j, $order, "C"));
      push @feature, (sprintf "%.2f", scaled_ordered_mean(\@sa, $i, $j, $order, "e"));
      push @feature, (sprintf "%.2f", scaled_ordered_mean(\@sa, $i, $j, $order, "b"));
    }


    # Global features
    push @feature, (sprintf "%.2f", (($j - $i)/$seq_len));
    push @feature, (sprintf "%.2f", $exposed_count/$seq_len);
    push @feature, (sprintf "%.2f", $alpha_count/$seq_len);
    push @feature, (sprintf "%.2f", $beta_count/$seq_len);  
    
    my @bins = ();
    for(my $i = 0; $i < 11; $i++) {
      $bins[$i] = 0;
    }
    if(($j-$i) <= 12) {
       $bins[0] = 1;
    } elsif (($j-$i) <= 18) {
       $bins[1] = 1;
    } elsif (($j-$i) <= 26) {
       $bins[2] = 1;
    } elsif (($j-$i) <= 38) {
       $bins[3] = 1;
    } elsif (($j-$i) <= 50) {
       $bins[4] = 1;
    } elsif (($j-$i) <= 62) {
       $bins[5] = 1;
    } elsif (($j-$i) <= 74) {
       $bins[6] = 1;
    } elsif (($j-$i) <= 86) {
       $bins[7] = 1;
    } elsif (($j-$i) <= 98) {
       $bins[8] = 1;
    } elsif (($j-$i) <= 110) {
       $bins[9] = 1;
    } else  {
       $bins[10] = 1;
    } 
    push @feature, @bins;

    if($seq_len < 75) { 
      push @feature, 1; push @feature, 0; push @feature, 0; push @feature, 0;
    } elsif ($seq_len < 150) {
      push @feature, 0; push @feature, 1; push @feature, 0; push @feature, 0;
    } elsif ($seq_len < 225) {
      push @feature, 0; push @feature, 0; push @feature, 1; push @feature, 0;
    } else {
      push @feature, 0; push @feature, 0; push @feature, 0; push @feature, 1;
    }

    for(my $i = 0; $i < length($aas); $i++) {
      push @feature, (sprintf("%.2f", $comp_counts[$i]));
    }

    print "$class";
    # Spit out features
    for(my $k = 0; $k < @feature; $k++) {
      if($feature[$k] > 1) {
        print " 1";
      } else {
        print " " . $feature[$k];
      }
    }
    print "\n";

  }
}


  

