#!/usr/bin/perl
use strict;
use Env;

##########################################################################################
#                                                                                        #
#  Project     :  SCRATCH-1D                                                             #
#  Release     :  1.1                                                                    #
#                                                                                        #
#  Script      :  SCRATCH-1D_predictions.pl                                              #
#  Arguments   :  output_type input_fasta output_prefix num_threads                      #
#  Description :  root script of the SCRATCH-1D suite of predictors                      #
#                                                                                        #
#  Author(s)   :  Christophe Magnan (cmagnan@ics.uci.edu)                                #
#  Copyright   :  Institute for Genomics and Bioinformatics                              #
#                 University of California, Irvine                                       #
#                                                                                        #
#  Modified    :  2015/07/02                                                             #
#                                                                                        #
##########################################################################################

# Log Messages
my $script_name="SCRATCH-1D_predictions.pl"; my $m="[$script_name]";

# Script Inputs
my $usage="output_type input_fasta output_prefix num_threads";
if($#ARGV!=3){ print "Usage : ./$script_name $usage\n"; exit 1; }
my $output_type=$ARGV[0]; my $input_fasta=$ARGV[1];
my $output_prefix=$ARGV[2]; my $threads=$ARGV[3];

# Checking Script Inputs
if(($output_type ne "ab")&&($output_type ne "hom")){
print "$m unknown output file type : $output_type.\n"; exit 1; }
if(! -f "$input_fasta"){ print "$m input fasta file not found.\n"; exit 1; }
if(($threads!~/^[0-9]+$/)||(!(($threads>0)&&($threads<=256)))){
print "$m invalid number of threads : $threads.\n"; exit 1; }

# Project Dependencies
my $profilpro=$ENV{"PROFILpro_GET_PROFILES"};
my $num_rounds=$ENV{"PROFILpro_BLASTPGP_ROUNDS"};
my $sspro=$ENV{"SSpro_RUN_PREDICTOR"};
my $sspro8=$ENV{"SSpro8_RUN_PREDICTOR"};
my $accpro=$ENV{"ACCpro_RUN_PREDICTOR"};
my $accpro20=$ENV{"ACCpro20_RUN_PREDICTOR"};
my $homolpro=$ENV{"HOMOLpro_ADD_HOMOLOGY"};
my $predictors=$ENV{"HOMOLpro_PREDICTORS"};
if(! -f "$profilpro"){ print "$m PROFILpro scripts not found.\n"; exit 1; }
if(! -f "$sspro"){ print "$m SSpro scripts not found.\n"; exit 1; }
if(! -f "$sspro8"){ print "$m SSpro8 scripts not found.\n"; exit 1; }
if(! -f "$accpro"){ print "$m ACCpro scripts not found.\n"; exit 1; }
if(! -f "$accpro20"){ print "$m ACCpro20 scripts not found.\n"; exit 1; }
if(! -f "$homolpro"){ print "$m HOMOLpro scripts not found.\n"; exit 1; }

# Default File Extensions
my $suff_fa=$ENV{"SCRATCH1D_DATASET_FA"};
my $suff_pro=$ENV{"SCRATCH1D_DATASET_PRO"};
my $suff_ss=$ENV{"SCRATCH1D_DATASET_SS"};
my $suff_ss8=$ENV{"SCRATCH1D_DATASET_SS8"};
my $suff_acc=$ENV{"SCRATCH1D_DATASET_ACC"};
my $suff_acc20=$ENV{"SCRATCH1D_DATASET_ACC20"};

# Welcome Message
print "\n###################################\n#                       ";
print "          #\n#  SCRATCH-1D release 1.1 (2015)  #\n#            ";
print "                     #\n###################################\n\n";

# Temporary Work Folder & Files
my $tmp_folder=$ENV{"SCRATCH1D_TMP_DIR"};
if(! -d "$tmp_folder"){ print "$m temporary work folder not found.\n"; exit 1; }
my $job_id=`date '+%Y%m%d-%H%M%S'`; chomp($job_id); $job_id.="-".substr(rand(),-12);
my $work_dir="$tmp_folder/$job_id"; `mkdir $work_dir`; my $tmp_pref="$work_dir/dataset";
if(! -d "$work_dir"){ print "$m cannot create temporary workspace.\n"; exit 1; }
my $tmp_fa="$tmp_pref.$suff_fa"; my $tmp_pro="$tmp_pref.$suff_pro";
my $tmp_ss="$tmp_pref.$suff_ss"; my $tmp_ss8="$tmp_pref.$suff_ss8";
my $tmp_acc="$tmp_pref.$suff_acc"; my $tmp_acc20="$tmp_pref.$suff_acc20";

# Loading Input Protein Sequences
my $num_entries=0; my @ENT; my %ENT_prov; my %ENT_seq; my %ENT_len;
open(IN,"$input_fasta"); open(FA,">$tmp_fa"); my $header=<IN>;
chomp($header); while(my $l=<IN>){ if($header!~/^>/){
print "$m input file not in fasta file format.\n"; exit 1; }
my $prov=substr($header,1); chomp($l); my $seq=$l; my $done=0;
while($done==0){ if($l=<IN>){ chomp($l); if($l=~/^>/){ $header=$l; $done=1; }
else{ $seq.="$l"; } } else{ $done=1; } } $seq=~s/\s//sg; $seq=uc($seq);
$seq=~s/B/X/g; $seq=~s/J/X/g; $seq=~s/O/X/g; $seq=~s/U/X/g; $seq=~s/Z/X/g;
if($seq!~/^[ACDEFGHIKLMNPQRSTVWXY]+$/){ print "$m incorrect protein sequence :\n\n$seq\n\n";
exit 1; } my $len=length($seq); my $id="p$num_entries"; if($len<=10000){
push(@ENT,"$id"); $ENT_prov{"$id"}="$prov"; $ENT_seq{"$id"}="$seq";
$ENT_len{"$id"}=$len; print FA ">$id\n$seq\n"; $num_entries++; } }
close(IN); close(FA); print "$m $num_entries protein sequence(s) found\n";

# Generating Sequence Profiles
print "$m generating sequence profiles...\n";
`$profilpro $tmp_fa $tmp_pro $num_rounds $threads`;
`chmod 770 $tmp_fa $tmp_pro`; if(! -f "$tmp_pro"){
print "$m failed generating sequence profiles...\n"; exit 1; }

# Checking Sequence Profiles
open(PRO,"$tmp_pro"); for(my $i=0;$i<$num_entries;$i++){
my $id=$ENT[$i]; my $len=$ENT_len{"$id"}; my $l=<PRO>; chomp($l);
if($l!~">$id"){ print "$m failed generating sequence profiles...\n";
exit 1; } for(my $j=0;$j<$len;$j++){ $l=<PRO>; } } close(PRO);

# Running Predictors on Separate Threads
print "$m running SCRATCH-1D predictors...\n"; my @launchers;
push(@launchers,"$sspro $tmp_pro $tmp_ss 1 > /dev/null 2> /dev/null");
push(@launchers,"$sspro8 $tmp_pro $tmp_ss8 1 > /dev/null 2> /dev/null");
push(@launchers,"$accpro $tmp_pro $tmp_acc 1 > /dev/null 2> /dev/null");
push(@launchers,"$accpro20 $tmp_pro $tmp_acc20 1 > /dev/null 2> /dev/null");
my $num_batchs=$threads; if($num_batchs>4){ $num_batchs=4; } my @batch_sh;
my @batch_pred; my @batch_flag; my $currb=0; foreach my $com (@launchers){
$batch_pred[$currb].="$com\n"; $currb++; if($currb==$num_batchs){ $currb=0; } }
for(my $i=0;$i<$num_batchs;$i++){ my $t_bash="$work_dir/thread$i.sh";
my $t_flag="$work_dir/thread$i.finished"; my $t_pred=$batch_pred[$i];
push(@batch_sh,"$t_bash"); push(@batch_flag,"$t_flag"); open(SH,">$t_bash");
print SH "#!/bin/bash\n\n$t_pred"; print SH "echo 'done' > $t_flag\n";
close(SH); `chmod 770 $t_bash`; system("$t_bash &"); }

# Waiting for Predictors to Complete
my $num_completed=0; while($num_completed!=$num_batchs){ $num_completed=0;
for(my $i=0;$i<$num_batchs;$i++){ my $flag=$batch_flag[$i]; if(-f "$flag"){
$num_completed++; } } if($num_completed!=$num_batchs){ sleep(5); } }
for(my $i=0;$i<$num_batchs;$i++){ my $t_bash=$batch_sh[$i];
my $t_flag=$batch_flag[$i]; `rm -f $t_bash $t_flag`; } `rm -f $tmp_pro`;
if(! -f "$tmp_ss"){ print "$m cannot find SSpro predictions.\n"; exit 1; }
if(! -f "$tmp_ss8"){ print "$m cannot find SSpro8 predictions.\n"; exit 1; }
if(! -f "$tmp_acc"){ print "$m cannot find ACCpro predictions.\n"; exit 1; }
if(! -f "$tmp_acc20"){ print "$m cannot find ACCpro20 predictions.\n"; exit 1; }

# Running Homology Analysis IF Requested
if($output_type eq "hom"){ print "$m running homology analysis...\n";
`mv $tmp_ss $tmp_ss.ab; mv $tmp_ss8 $tmp_ss8.ab`;
`mv $tmp_acc $tmp_acc.ab; mv $tmp_acc20 $tmp_acc20.ab`;
my $inputs="$tmp_ss.ab,$tmp_ss8.ab,$tmp_acc.ab,$tmp_acc20.ab";
my $outputs="$tmp_ss,$tmp_ss8,$tmp_acc,$tmp_acc20";
`$homolpro $tmp_fa $predictors $inputs $outputs $threads`;
if((! -f "$tmp_ss")||(! -f "$tmp_ss8")){ print "$m HOMOLpro analysis failed.\n"; exit 1; }
if((! -f "$tmp_acc")||(! -f "$tmp_acc20")){ print "$m HOMOLpro analysis failed.\n"; exit 1; } }

# Writing Output File - SSpro
print "$m writing SSpro predictions...\n"; open(IN,"$tmp_ss");
open(OUT,">$output_prefix.$suff_ss") || die "$m cannot write output files...\n";
for(my $i=0;$i<$num_entries;$i++){ my $id=$ENT[$i]; my $prov=$ENT_prov{"$id"};
my $len=$ENT_len{"$id"}; my $head=<IN>; chomp($head); if($head ne ">$id"){
print "$m inconsistent SSpro predictions\n"; exit 1; } my $prediction=<IN>;
chomp($prediction); if((length($prediction)!=$len)||($prediction!~/^[ECH]+$/)){
print "$m inconsistent SSpro predictions\n"; exit 1; } print OUT ">$prov\n$prediction\n"; }
close(IN); close(OUT); `chmod 770 $output_prefix.$suff_ss`;

# Writing Output File - SSpro8
print "$m writing SSpro8 predictions...\n"; open(IN,"$tmp_ss8");
open(OUT,">$output_prefix.$suff_ss8") || die "$m cannot write output files...\n";
for(my $i=0;$i<$num_entries;$i++){ my $id=$ENT[$i]; my $prov=$ENT_prov{"$id"};
my $len=$ENT_len{"$id"}; my $head=<IN>; chomp($head); if($head ne ">$id"){
print "$m inconsistent SSpro8 predictions\n"; exit 1; } my $prediction=<IN>;
chomp($prediction); if((length($prediction)!=$len)||($prediction!~/^[EBCISTHG]+$/)){
print "$m inconsistent SSpro8 predictions\n"; exit 1; } print OUT ">$prov\n$prediction\n"; }
close(IN); close(OUT); `chmod 770 $output_prefix.$suff_ss8`;

# Writing Output File - ACCpro
print "$m writing ACCpro predictions...\n"; open(IN,"$tmp_acc");
open(OUT,">$output_prefix.$suff_acc") || die "$m cannot write output files...\n";
for(my $i=0;$i<$num_entries;$i++){ my $id=$ENT[$i]; my $prov=$ENT_prov{"$id"};
my $len=$ENT_len{"$id"}; my $head=<IN>; chomp($head); if($head ne ">$id"){
print "$m inconsistent ACCpro predictions\n"; exit 1; } my $prediction=<IN>;
chomp($prediction); if((length($prediction)!=$len)||($prediction!~/^[-e]+$/)){
print "$m inconsistent ACCpro predictions\n"; exit 1; } print OUT ">$prov\n$prediction\n"; }
close(IN); close(OUT); `chmod 770 $output_prefix.$suff_acc`;

# Writing Output File - ACCpro20
print "$m writing ACCpro20 predictions...\n"; open(IN,"$tmp_acc20");
open(OUT,">$output_prefix.$suff_acc20") || die "$m cannot write output files...\n";
for(my $i=0;$i<$num_entries;$i++){ my $id=$ENT[$i]; my $prov=$ENT_prov{"$id"};
my $len=$ENT_len{"$id"}; my $head=<IN>; chomp($head); if($head ne ">$id"){
print "$m inconsistent ACCpro20 predictions\n"; exit 1; } my $prediction=<IN>;
chomp($prediction); my @d=split(" ",$prediction); if(($#d+1)!=$len){
print "$m inconsistent ACCpro20 predictions\n"; exit 1; } print OUT ">$prov\n$prediction\n"; }
close(IN); close(OUT); `chmod 770 $output_prefix.$suff_acc20`;

# Finalizing Process
`rm -rf $work_dir`; print "$m job successfully completed!\n\n";

