#!/usr/bin/perl -w
###############################################################################
# Name %n  : create-short-target-features.pl
###############################################################################
sub getlib {
  my $GLOBAL_PATH='/home/casp11/casp12/tools/dncon1.1/';
  my $libdir = $GLOBAL_PATH.'libs/perl';
  return $libdir;
}

use lib getlib();

use CMindexer qw(create_res_idx_map);
use ProteinUtils qw(calc_entropy atchley_factor scaled_log_odds_n_inf scld_residue_residue_contacts scld_lu_contact_potential scld_levitt_contact_potential scld_braun_contact_potential);
use strict;

use constant MIN_SEP => "6";
use constant MAX_SEP => "12";

if (@ARGV ne 8 ) {
  print "Usage: $0 <fasta> <pssm> <ss_sa> <dist> <threshold> <feat.txt> <targets.txt> <window_size>\n";
  print "   Ex: $0 1but.fasta 1but.pssm 1but.ss_sa 1but.dist 8 feat.txt targets.txt 16\n";
  exit;
}

my $fasta_fname = $ARGV[0];
my $pssm_fname = $ARGV[1];
my $ss_sa_fname = $ARGV[2];
my $dist_fname = $ARGV[3];
my $thresh = $ARGV[4];
my $feat_fname = $ARGV[5];
my $targ_fname = $ARGV[6];
my $window_size = $ARGV[7];

my $id = $fasta_fname;
$id =~ s/\..*//;
$id =~ s/^.*\///;

open FASTA, "<" . $fasta_fname || die "Couldn't open fasta file\n";
my @lines = <FASTA>;
chomp(@lines);
close FASTA;

shift @lines;
my $seq = join('', @lines);
$seq =~ s/ //g;
my @seq = split(//, $seq);

# Calculate scalars and acthley factors
my $aas = "ACDEFGHIKLMNPQRSTVWY";
my @scalar = ();
my @offset = ();
my @res = split(//, $aas);

for(my $i = 1; $i <= 5; $i++) {
  my $min = 15;
  my $max = - 15;
  for(my $j = 0; $j < @res; $j++) {

    if(atchley_factor($res[$j],$i) < $min) {
      $min = atchley_factor($res[$j], $i);
    }
    if(atchley_factor($res[$j],$i) > $max) {
      $max = atchley_factor($res[$j], $i);
    }

  }

  $scalar[$i] = (1/($max-$min));
  $offset[$i] = ($min/($min-$max));

}

my @factors = ();
for (my $i = 0; $i < @seq; $i++) {
  $factors[$i] = [];
  unshift(@{$factors[$i]}, 0);
}

# Get all factors, once
for (my $i = 0; $i < @seq; $i++) {
   for(my $j = 1; $j <= 5; $j++) {
    $factors[$i][$j] = atchley_factor($seq[$i], $j) * $scalar[$j] + $offset[$j];
   }
}

my $seq_len = length($seq);

# Determine composition of protein by amino acid type
my @comp_counts = ();
for(my $i = 0; $i < length($aas); $i++) {
  $comp_counts[$i] = 0;
}
for(my $i = 0; $i < @seq; $i++) {
  $comp_counts[index($aas, $seq[$i])]++;
}
for(my $i = 0; $i < length($aas); $i++) {
  $comp_counts[$i] = $comp_counts[$i]/$seq_len;
}

open PSSM, "<" . $pssm_fname || die "Couldn't open pssm file\n";
@lines = <PSSM>;
chomp(@lines);
close PSSM;

my ($odds_ref, $inf_ref) = scaled_log_odds_n_inf(@lines);
my @odds = @{$odds_ref};
my @inf = @{$inf_ref};

unshift(@odds, "");
unshift(@inf, "");

open SS_SA, "<" . $ss_sa_fname || die "Couldn't open ss_sa file\n";
my @ss_sa=<SS_SA>;
chomp @ss_sa;
my @ss = split(//, $ss_sa[2]);
my @sa = split(//, $ss_sa[3]);
close SS_SA;

my $beta_count = 0; my $alpha_count = 0;
foreach my $ss_t (@ss) {
  if($ss_t eq 'E') {
    $beta_count++;
  } 
  if($ss_t eq 'H') {
    $alpha_count++;
  }
}

my $exposed_count = 0;
foreach my $sa_t (@sa) {
  if($sa_t eq 'b') {
    $exposed_count++;
  } 
}

unshift(@factors,"");
unshift(@seq,"");
unshift(@ss,"");
unshift(@sa,"");

open DIST, "<" . $dist_fname || die "Couldn't open distance file\n";
@lines = <DIST>;
chomp(@lines);
close DIST;

my @distances = ();
for(my $i = 0; $i <= $seq_len; $i++) {
  $distances[$i] = ();
  for(my $j = 0; $j <= $seq_len; $j++) {
    $distances[$i][$j] = 100;
  }
}

foreach my $line_t (@lines) {
  my ($i, $j, $distance) = split(/ /, $line_t);
  $distances[$i][$j] = $distance;
  $distances[$j][$i] = $distance;
}


my @map = create_res_idx_map(int(($window_size - MAX_SEP)/2), $window_size, (MAX_SEP - MIN_SEP), MIN_SEP);

open FEAT, ">" . $feat_fname;
open TARG, ">" . $targ_fname;

for(my $i = 1; $i < $seq_len - $window_size -1; $i++) {
  # Encode cm target
  my $subcm_enc = '';

  for(my $j = 0; $j < @map; $j++) {
    my $idx_1 = ($i + $map[$j]->{'offset_1'});
    my $idx_2 = ($i + $map[$j]->{'offset_2'}); 

    if($distances[$idx_1][$idx_2] < $thresh) {
      $subcm_enc .= "1 ";
    } else {
      $subcm_enc .= "0 ";
    }
  }

  $subcm_enc =~ s/\s*$//;
  print TARG "# $i, res_idx_map params: | " . (int(($window_size - MAX_SEP)/2)) . ", " . $window_size . ", " . (MAX_SEP - MIN_SEP) . ", " . MIN_SEP . "\n";
  print TARG $subcm_enc . "\n";

  my @feature = ();
  # Add features for win 1
  for(my $k = 0; $k < int($window_size); $k++) {
 
     for(my $factor = 1; $factor <= 5; $factor++) {
       push @feature, sprintf("%.2f", $factors[$i+$k][$factor]);
     }

    if ($ss[$i+$k]  eq "H") {
      push @feature, 1;
      push @feature, 0;       
      push @feature, 0;       
    } elsif ($ss[$i+$k] eq "E") {
      push @feature, 0;
      push @feature, 1;       
      push @feature, 0;       
    } else {
      push @feature, 0;
      push @feature, 0;       
      push @feature, 1;       
    }

    if ($sa[$i+$k] eq "e") {
      push @feature, 1;
      push @feature, 0;       
    } else {
      push @feature, 0;
      push @feature, 1;       
    }

    # PSSM features
    push @feature, $inf[$i+$k];
    for(my $l = 0; $l < @{$odds[$i+$k]}; $l++) {
      push @feature, $odds[$i+$k][$l];
    }

  }


  if($seq_len < 75) { 
    push @feature, 1; push @feature, 0; push @feature, 0; push @feature, 0;
  } elsif ($seq_len < 150) {
    push @feature, 0; push @feature, 1; push @feature, 0; push @feature, 0;
  } elsif ($seq_len < 225) {
    push @feature, 0; push @feature, 0; push @feature, 1; push @feature, 0;
  } else {
    push @feature, 0; push @feature, 0; push @feature, 0; push @feature, 1;
  }

  for(my $i = 0; $i < length($aas); $i++) {
    push @feature, (sprintf("%.2f", $comp_counts[$i]));
  }

  # Global features
  push @feature, (sprintf "%.2f", (int(($i+$window_size)/2)/$seq_len));
  push @feature, (sprintf "%.2f", $exposed_count/$seq_len);
  push @feature, (sprintf "%.2f", $alpha_count/$seq_len);
  push @feature, (sprintf "%.2f", $beta_count/$seq_len);  
 
  # Spit out features
  for(my $k = 0; $k < @feature; $k++) {
    print FEAT " " . $feature[$k];
  }
  print FEAT "\n";

}

close TARG;
close FEAT;





