#!/usr/bin/perl
use strict;
use Env;

##########################################################################################
#                                                                                        #
#  Project     :  ACCpro                                                                 #
#  Release     :  5.2                                                                    #
#                                                                                        #
#  Script      :  ACCpro_predictions.pl                                                  #
#  Arguments   :  input_type output_type input_file output_file num_threads              #
#  Description :  ACCpro root script to predict protein solvent accessibility            #
#                                                                                        #
#  Author(s)   :  Christophe Magnan (cmagnan@ics.uci.edu)                                #
#  Copyright   :  Institute for Genomics and Bioinformatics                              #
#                 University of California, Irvine                                       #
#                                                                                        #
#  Modified    :  2015/07/02                                                             #
#                                                                                        #
##########################################################################################

# Log Messages
my $script_name="ACCpro_predictions.pl"; my $m="[$script_name]";

# Script Inputs
my $usage="input_type output_type input_file output_file num_threads";
if($#ARGV!=4){ print "Usage : ./$script_name $usage\n"; exit 1; }
my $input_type=$ARGV[0]; my $output_type=$ARGV[1];
my $input_file=$ARGV[2]; my $output_file=$ARGV[3]; my $threads=$ARGV[4];

# Checking Script Inputs
if(($input_type ne "seq")&&($input_type ne "pro")){
print "$m unknown input file type : $input_type.\n"; exit 1; }
if(($output_type ne "ab")&&($output_type ne "hom")){
print "$m unknown output file type : $output_type.\n"; exit 1; }
if(! -f "$input_file"){ print "$m input file not found.\n"; exit 1; }
if(($threads!~/^[0-9]+$/)||(!(($threads>0)&&($threads<=256)))){
print "$m invalid number of threads : $threads.\n"; exit 1; }

# Project Folders & Data
my $models_list=$ENV{"ACCPRO_MODELS_LIST"}; my $tmp_folder=$ENV{"ACCPRO_TMP_DIR"};
if(! -f "$models_list"){ print "$m prediction models not found.\n"; exit 1; }
if(! -d "$tmp_folder"){ print "$m temporary work folder not found.\n"; exit 1; }

# Project Library Scripts
my $profil2scratch=$ENV{"ACCPRO_PROFIL_2_SCRATCH"};
my $brnn_inputs=$ENV{"ACCPRO_SCRATCH_2_BRNN"};
my $get_abinitio=$ENV{"ACCPRO_GET_ABINITIO"};
if(! -f "$profil2scratch"){ print "$m library scripts not found.\n"; exit 1; }
if(! -f "$brnn_inputs"){ print "$m library scripts not found.\n"; exit 1; }
if(! -f "$get_abinitio"){ print "$m library scripts not found.\n"; exit 1; }

# Project Dependencies
my $profilpro=$ENV{"PROFILpro_GET_PROFILES"};
my $homolpro=$ENV{"HOMOLpro_ADD_HOMOLOGY"};
my $predictmulti=$ENV{"BRNN1D_PREDICT_MULTI"};
if(! -f "$profilpro"){ print "$m PROFILpro scripts not found.\n"; exit 1; }
if(! -f "$homolpro"){ print "$m HOMOLpro scripts not found.\n"; exit 1; }
if(! -f "$predictmulti"){ print "$m 1D-BRNN binaries not found.\n"; exit 1; }

# Default File Extensions
my $suff_ids=$ENV{"ACCPRO_DATASET_IDS"};
my $suff_fa=$ENV{"ACCPRO_DATASET_FA"};
my $suff_dssp=$ENV{"ACCPRO_DATASET_DSSP"};
my $suff_pro=$ENV{"ACCPRO_DATASET_PROF"};
my $suff_in=$ENV{"ACCPRO_DATASET_INPUTS"};
my $suff_out=$ENV{"ACCPRO_DATASET_OUTPUTS"};

# Welcome Message
print "\n###############################\n"; print "#                             #\n";
print "#  ACCpro release 5.2 (2015)  #\n"; print "#                             #\n";
print "###############################\n\n";

# Temporary Work Folder & Files
my $job_id=`date '+%Y%m%d-%H%M%S'`; chomp($job_id); $job_id.="-".substr(rand(),-12);
my $work_dir="$tmp_folder/$job_id"; `mkdir $work_dir`; if(! -d "$work_dir"){
print "$m cannot create temporary workspace.\n"; exit 1; } my $tmp_pref="$work_dir/accpro";
my $tmp_ids="$tmp_pref.$suff_ids"; my $tmp_fa="$tmp_pref.$suff_fa";
my $tmp_dssp="$tmp_pref.$suff_dssp"; my $tmp_pro="$tmp_pref.$suff_pro";
my $tmp_in="$tmp_pref.$suff_in"; my $tmp_out="$tmp_pref.$suff_out";
my $tmp_ab="$tmp_pref.ab"; my $tmp_pred="$tmp_pref.pred";

# Input File Type = seq : Generating Sequence Profiles
my $num_entries=0; my @ENT; my %ENT_prov; my %ENT_seq; my %ENT_len;
if($input_type eq "seq"){ open(IDS,">$tmp_ids"); open(FA,">$tmp_fa");
open(DSSP,">$tmp_dssp"); open(IN,"$input_file"); my $header=<IN>; chomp($header);
while(my $l=<IN>){ if($header!~/^>/){ print "$m input file not in fasta file format.\n";
exit 1; } my $prov=substr($header,1); chomp($l); my $seq=$l; my $done=0;
while($done==0){ if($l=<IN>){ chomp($l); if($l=~/^>/){ $header=$l; $done=1; }
else{ $seq.="$l"; } } else{ $done=1; } } $seq=~s/\s//sg; $seq=uc($seq);
if($seq!~/^[A-Z]+$/){ print "$m incorrect protein sequence :\n\n$seq\n\n"; exit 1; }
my $len=length($seq); my @DSSPCL; for(my $i=0;$i<$len;$i++){ push(@DSSPCL,"e"); }
my $dssp=join("",@DSSPCL); my $id="p$num_entries"; push(@ENT,"$id");
$ENT_prov{"$id"}="$prov"; $ENT_seq{"$id"}="$seq"; $ENT_len{"$id"}=$len;
print IDS "$id\n"; print FA ">$id\n$seq\n"; print DSSP ">$id\n$dssp\n";
$num_entries++; } close(IN); close(DSSP); close(FA); close(IDS);
print "$m $num_entries protein sequence(s) found\n$m generating sequence profiles...\n";
`$profilpro $tmp_fa $tmp_pro 3 $threads; chmod 770 $tmp_ids $tmp_fa $tmp_dssp $tmp_pro`;
if(! -f "$tmp_pro"){ print "$m failed generating sequence profiles...\n"; exit 1; } }

# Input File Type = pro : Checking Profiles & Extracting Sequences
elsif($input_type eq "pro"){ open(IDS,">$tmp_ids"); open(FA,">$tmp_fa");
open(DSSP,">$tmp_dssp"); open(PRO,">$tmp_pro"); open(IN,"$input_file");
my $header=<IN>; chomp($header); while(my $l=<IN>){ if($header!~/^>/){
print "$m incorrect input file format.\n"; exit 1; } my $prov=substr($header,1);
chomp($l); my @SEQ; my @PROF; my @DSSPCL; my $len=0; my @d=split(" ",$l);
if($#d!=20){ print "$m incorrect input file format.\n"; exit 1; } push(@SEQ,"$d[0]");
push(@PROF,"$l"); push(@DSSPCL,"e"); $len++; my $done=0; while($done==0){ if($l=<IN>){
chomp($l); if($l=~/^>/){ $header=$l; $done=1; } else{ my @T=split(" ",$l); if($#T!=20){
print "$m incorrect input file format.\n"; exit 1; } push(@SEQ,"$T[0]"); push(@PROF,"$l");
push(@DSSPCL,"e"); $len++; } } else{ $done=1; } } my $seq=join("",@SEQ); $seq=~s/\s//sg;
$seq=uc($seq); if($seq!~/^[A-Z]+$/){ print "$m incorrect protein sequence :\n\n$seq\n\n";
exit 1; } if(length($seq)!=$len){ print "$m incorrect input file format.\n"; exit 1; }
my $dssp=join("",@DSSPCL); my $id="p$num_entries"; push(@ENT,"$id"); $ENT_prov{"$id"}="$prov";
$ENT_seq{"$id"}="$seq"; $ENT_len{"$id"}=$len; print IDS "$id\n"; print FA ">$id\n$seq\n";
print DSSP ">$id\n$dssp\n"; print PRO ">$id\n".join("\n",@PROF)."\n"; $num_entries++; }
print "$m input : $num_entries sequence profile(s) found\n"; close(IN); close(DSSP);
close(FA); close(IDS); close(PRO); `chmod 770 $tmp_ids $tmp_fa $tmp_dssp $tmp_pro`; }

# Generating ab-initio Predictions from Sequence Profiles
print "$m preparing input file for BRNNs...\n";
`$profil2scratch $tmp_fa $tmp_dssp $tmp_pro $tmp_pro.tmp`;
if(! -f "$tmp_pro.tmp"){ print "$m failed to prepare 1D-BRNN dataset\n"; exit 1; }
`$brnn_inputs $tmp_ids $tmp_pro.tmp $tmp_in; chmod 770 $tmp_in; rm -f $tmp_pro.tmp`;
if(! -f "$tmp_in"){ print "$m failed to prepare 1D-BRNN dataset\n"; exit 1; }
print "$m propagating profiles in BRNNs...\n";
`$predictmulti $tmp_in $models_list $tmp_out; chmod 770 $tmp_out`;
if(! -f "$tmp_out"){ print "$m failed to apply prediction models\n"; exit 1; }
print "$m extracting ab-initio predictions...\n";
`$get_abinitio $tmp_fa $tmp_ids $tmp_out $tmp_ab $tmp_pred; chmod 770 $tmp_ab $tmp_pred`;
if(! -f "$tmp_ab"){ print "$m inconsistent predictions, cannot proceed\n"; exit 1; }

# Running Homology Analysis IF Requested
if($output_type eq "hom"){ print "$m performing homology analysis...\n";
`rm -f $tmp_pred`; `$homolpro $tmp_fa acc $tmp_ab $tmp_pred $threads`;
if(! -f "$tmp_pred"){ print "$m failed to process ab-initio predictions\n"; exit 1; }
`chmod 770 $tmp_pred`; } else{ `rm -f $tmp_pred; mv $tmp_ab $tmp_pred`; }

# Writing Output File & Removing Temporary Files
print "$m writing predictions in output file...\n"; open(IN,"$tmp_pred");
open(OUT,">$output_file") || die "$m cannot write output file $output_file...\n";
for(my $i=0;$i<$num_entries;$i++){ my $id=$ENT[$i]; my $prov=$ENT_prov{"$id"};
my $len=$ENT_len{"$id"}; my $header=<IN>; chomp($header);
if(($header!~/^>/)||(substr($header,1) ne "$id")){
print "$m cannot retrieve predictions!\n"; exit 1; } my $prediction=<IN>;
chomp($prediction); if((length($prediction)!=$len)||($prediction!~/^[-e]+$/)){
print "$m cannot retrieve predictions!\n"; exit 1; }
print OUT ">$prov\n$prediction\n"; } close(IN); close(OUT); `chmod 770 $output_file`;
`rm -rf $work_dir`; print "$m done! job successfully completed\n\n";

