#!/usr/bin/perl
use strict;
use Env;

##########################################################################################
#                                                                                        #
#  Project     :  ACCpro20                                                               #
#  Release     :  5.2                                                                    #
#                                                                                        #
#  Script      :  abinitio_predictions.pl                                                #
#  Arguments   :  input_fasta dataset_ids brnn_outputs pred_classes pred_prob            #
#  Description :  rewrites 1D-BRNN output files in standard file formats for users       #
#                                                                                        #
#  Author(s)   :  Christophe Magnan (cmagnan@ics.uci.edu)                                #
#  Copyright   :  Institute for Genomics and Bioinformatics                              #
#                 University of California, Irvine                                       #
#                                                                                        #
#  Modified    :  2015/07/02                                                             #
#                                                                                        #
##########################################################################################

# Log Messages
my $script_name="abinitio_predictions.pl"; my $m="[$script_name]";

# Script Inputs
my $usage="input_fasta dataset_ids brnn_outputs pred_classes pred_prob";
if($#ARGV!=4){ print "Usage : ./$script_name $usage\n"; exit 1; }
my $input_fasta=$ARGV[0]; my $dataset_ids=$ARGV[1]; my $brnn_output=$ARGV[2];
my $pred_classes=$ARGV[3]; my $pred_prob=$ARGV[4]; my $num_classes=20;

# Checking Script Inputs
if(! -f "$input_fasta"){ print "$m input fasta file not found.\n"; exit 1; }
if(! -f "$dataset_ids"){ print "$m dataset identifiers not found.\n"; exit 1; }
if(! -f "$brnn_output"){ print "$m 1D-BRNN predictions not found.\n"; exit 1; }

# Retrieving Protein Sequences
my $num_ent=0; my @ent; my @ent_seq; my @ent_len; my %index; open(IN,"$input_fasta");
my $header=<IN>; chomp($header); while(my $l=<IN>){ my $id=substr($header,1);
if($header!~/^>/){ print "$m input not in fasta file format.\n"; exit 1; }
if($l=~/^>/){ print "$m empty sequence found.\n"; exit 1; } chomp($l); my $seq=$l;
my $done=0; while($done==0){ if($l=<IN>){ chomp($l); if($l=~/^>/){ $done=1; $header=$l; }
else{ $seq.="$l"; } } else{ $done=1; } } $seq=~s/\s//sg; my $len=length($seq);
if($index{"$id"} ne ""){ print "$m duplicate ids...\n"; exit 1; } $index{"$id"}=$num_ent;
$num_ent++; push(@ent,"$id"); push(@ent_seq,"$seq"); push(@ent_len,"$len"); } close(IN);

# Retrieving Dataset IDs
my %selected; my $num_sel=0; my @list; open(IN,"$dataset_ids"); while(my $l=<IN>){
chomp($l); if($selected{"$l"} ne ""){ print "$m duplicate entries.\n"; exit 1; }
if($index{"$l"} eq ""){ print "$m unknown entry $l\n"; exit 1; }
$num_sel++; $selected{"$l"}="1"; push(@list,"$l"); } close(IN);

# Rewriting 1D-BRNN Predictions
open(IN,"$brnn_output"); open(OUTCL,">$pred_classes"); open(OUTPB,">$pred_prob");
for(my $e=0;$e<$num_sel;$e++){ my $id=$list[$e]; my $pos=$index{"$id"}; my $len=$ent_len[$pos];
my @SEQ=split("",$ent_seq[$pos]); my $head=<IN>; chomp($head); if($head ne "$len"){
print "$m mismatch length seq/pred\n"; exit 1; } print OUTCL ">$id\n"; print OUTPB ">$id\n";
for(my $p=0;$p<$len;$p++){ my $l=<IN>; chomp($l); my @d=split(" ",$l); if($#d!=($num_classes+4)){
print "$m inconsistent BRNN outputs.\n"; exit 1; } my $amino=$SEQ[$p]; my $sum=0; my $predcl=0;
my $class=0; for(my $cl=0;$cl<$num_classes;$cl++){ $sum+=$d[5+$cl]; if($sum > 0.5){ $predcl=$cl;
$class=$cl*5; $cl=$num_classes; } } print OUTCL "$class "; print OUTPB "$amino $class $predcl";
for(my $i=0;$i<$num_classes;$i++){ print OUTPB " ".sprintf("%.6f",$d[5+$i]); } print OUTPB "\n"; }
print OUTCL "\n"; } close(IN); close(OUTCL); close(OUTPB); `chmod 770 $pred_classes $pred_prob`;

