#!/usr/bin/perl
use strict;
use Env;

##########################################################################################
#                                                                                        #
#  Project     :  SSpro8                                                                 #
#  Release     :  5.2                                                                    #
#                                                                                        #
#  Script      :  abinitio_predictions.pl                                                #
#  Arguments   :  input_fasta dataset_ids brnn_outputs pred_classes pred_prob            #
#  Description :  rewrites 1D-BRNN output files in standard file formats for users       #
#                                                                                        #
#  Author(s)   :  Christophe Magnan (cmagnan@ics.uci.edu)                                #
#  Copyright   :  Institute for Genomics and Bioinformatics                              #
#                 University of California, Irvine                                       #
#                                                                                        #
#  Modified    :  2015/07/02                                                             #
#                                                                                        #
##########################################################################################

# Log Messages
my $script_name="abinitio_predictions.pl"; my $m="[$script_name]";

# Script Inputs
my $usage="input_fasta dataset_ids brnn_outputs pred_classes pred_prob";
if($#ARGV!=4){ print "Usage : ./$script_name $usage\n"; exit 1; }
my $input_fasta=$ARGV[0]; my $dataset_ids=$ARGV[1]; my $brnn_output=$ARGV[2];
my $pred_classes=$ARGV[3]; my $pred_prob=$ARGV[4]; my $num_classes=8;
my %sspro8_classes=("H"=>"0","G"=>"1","I"=>"2","E"=>"3","B"=>"4","T"=>"5","S"=>"6",
"C"=>"7","0"=>"H","1"=>"G","2"=>"I","3"=>"E","4"=>"B","5"=>"T","6"=>"S","7"=>"C");

# Checking Script Inputs
if(! -f "$input_fasta"){ print "$m input fasta file not found.\n"; exit 1; }
if(! -f "$dataset_ids"){ print "$m dataset identifiers not found.\n"; exit 1; }
if(! -f "$brnn_output"){ print "$m 1D-BRNN predictions not found.\n"; exit 1; }

# Retrieving Protein Sequences
my $num_ent=0; my @ent; my @ent_seq; my @ent_len; my %index; open(IN,"$input_fasta");
my $header=<IN>; chomp($header); while(my $l=<IN>){ my $id=substr($header,1);
if($header!~/^>/){ print "$m input not in fasta file format.\n"; exit 1; }
if($l=~/^>/){ print "$m empty sequence found.\n"; exit 1; } chomp($l); my $seq=$l;
my $done=0; while($done==0){ if($l=<IN>){ chomp($l); if($l=~/^>/){ $done=1; $header=$l; }
else{ $seq.="$l"; } } else{ $done=1; } } $seq=~s/\s//sg; my $len=length($seq);
if($index{"$id"} ne ""){ print "$m duplicate ids...\n"; exit 1; } $index{"$id"}=$num_ent;
$num_ent++; push(@ent,"$id"); push(@ent_seq,"$seq"); push(@ent_len,"$len"); } close(IN);

# Retrieving Dataset IDs
my %selected; my $num_sel=0; my @list; open(IN,"$dataset_ids"); while(my $l=<IN>){
chomp($l); if($selected{"$l"} ne ""){ print "$m duplicate entries.\n"; exit 1; }
if($index{"$l"} eq ""){ print "$m unknown entry $l\n"; exit 1; }
$num_sel++; $selected{"$l"}="1"; push(@list,"$l"); } close(IN);

# Rewriting 1D-BRNN Predictions
open(IN,"$brnn_output"); open(OUTCL,">$pred_classes"); open(OUTPB,">$pred_prob");
for(my $e=0;$e<$num_sel;$e++){ my $id=$list[$e]; my $pos=$index{"$id"}; my $len=$ent_len[$pos];
my @SEQ=split("",$ent_seq[$pos]); my $head=<IN>; chomp($head); if($head ne "$len"){
print "$m mismatch length seq/pred\n"; exit 1; } print OUTCL ">$id\n"; print OUTPB ">$id\n";
for(my $p=0;$p<$len;$p++){ my $l=<IN>; chomp($l); my @d=split(" ",$l); if($#d!=($num_classes+4)){
print "$m inconsistent BRNN outputs.\n"; exit 1; } my $amino=$SEQ[$p]; my $predcl=$d[3];
if($sspro8_classes{"$predcl"} eq ""){ print "$m inconsistent BRNN outputs.\n"; exit 1; }
my $class=$sspro8_classes{"$predcl"}; print OUTCL "$class"; print OUTPB "$amino $class $predcl";
for(my $i=0;$i<$num_classes;$i++){ print OUTPB " ".sprintf("%.6f",$d[5+$i]); } print OUTPB "\n"; }
print OUTCL "\n"; } close(IN); close(OUTCL); close(OUTPB); `chmod 770 $pred_classes $pred_prob`;

