#!/usr/bin/perl
use strict;
use Env;

##########################################################################################
#                                                                                        #
#  Project     :  PROFILpro                                                              #
#  Release     :  1.1                                                                    #
#                                                                                        #
#  Script      :  generate_profiles.pl                                                   #
#  Arguments   :  database_dir blastdb rounds input_fasta output_file threads            #
#  Description :  Generates sequence profiles based on the requested database            #
#                                                                                        #
#  Author(s)   :  Christophe Magnan (cmagnan@ics.uci.edu)                                #
#  Copyright   :  Institute for Genomics and Bioinformatics                              #
#                 University of California, Irvine                                       #
#                                                                                        #
#  Modified    :  2015/07/02                                                             #
#                                                                                        #
##########################################################################################

# Log Messages
my $script_name="generate_profiles.pl"; my $m="[$script_name]";

# Script Inputs
my $usage="database_dir blastdb rounds input_fasta output_file threads";
if($#ARGV!=5){ print "Usage : ./$script_name $usage\n"; exit 1; }
my $db_dir=$ARGV[0]; my $blastdb=$ARGV[1]; my $rounds=$ARGV[2];
my $fasta=$ARGV[3]; my $output=$ARGV[4]; my $threads=$ARGV[5];

# Checking Script Inputs
if(! -d "$db_dir"){ print "$m database folder not found.\n"; exit 1; }
if(!(($rounds>=2)&&($rounds<=4))){ print "$m invalid number of rounds.\n"; exit 1; }
if(! -f "$fasta"){ print "$m input fasta file not found.\n"; exit 1; }
if($threads!~/^[0-9]+$/){ print "$m invalid number of threads.\n"; exit 1; }
if(!(($threads>0)&&($threads<=256))){ print "$m invalid number of threads.\n"; exit 1; }

# Retrieving Protein Sequences
my $num_entries=0; my @entries; my %provided; my %sequence; my %seqlen;
open(IN,"$fasta"); my $header=<IN>; chomp($header); while(my $l=<IN>){ if($header!~/^>/){
print "$m input file not in fasta file format.\n"; exit 1; } my $prov=substr($header,1);
chomp($l); my $seq=$l; my $done=0; while($done==0){ if($l=<IN>){ chomp($l); if($l=~/^>/){
$done=1; $header=$l; } else{ $seq.="$l"; } } else{ $done=1; } } $seq=~s/\s//sg;
$seq=uc($seq); $seq=~s/B/X/g; $seq=~s/J/X/g; $seq=~s/O/X/g; $seq=~s/U/X/g; $seq=~s/Z/X/g;
if($seq!~/^[ACDEFGHIKLMNPQRSTVWXY]+$/){ print "\n$m invalid sequence found:\n\n$seq\n\n";
exit 1; } my $len=length($seq); my $id="p$num_entries"; if($len<=10000){ push(@entries,"$id");
$provided{"$id"}="$prov"; $sequence{"$id"}="$seq"; $seqlen{"$id"}="$len"; $num_entries++; } }
close(IN); if($num_entries==0){ print "$m input fasta file is empty\n"; exit 1; }

# Creating Batchs For Threading
my $num_batchs=$threads; if($num_entries<$num_batchs){ $num_batchs=$num_entries; }
my @batchs; my $n=0; for(my $i=0;$i<$num_entries;$i++){ my $id=$entries[$i];
$batchs[$n].="$id,"; $n++; if($n==$num_batchs){ $n=0; } }

# Preparing Temporary Workspace
my $lib_dir=$ENV{"PROFILpro_LIB_DIR"}; my $tmp_dir=$ENV{"PROFILpro_TMP_DIR"};
if(! -d "$lib_dir"){ print "$m PROFILpro lib folder not found.\n"; exit 1; }
if(! -d "$tmp_dir"){ print "$m PROFILpro tmp folder not found.\n"; exit 1; }
my $job_id=`date '+%Y%m%d-%H%M%S'`; chomp($job_id); $job_id.="-".substr(rand(),-12);
my $work_dir="$tmp_dir/$job_id"; `mkdir $work_dir`;
if(! -d "$work_dir"){ print "$m cannot create temporary workspace.\n"; exit 1; }

# Writing Batchs of Sequences
my @batchs_input; my @batchs_output; my @batchs_flags;
for(my $i=0;$i<$num_batchs;$i++){ $batchs_input[$i]="thread$i.fa";
$batchs_output[$i]="thread$i.out"; $batchs_flags[$i]="thread$i.finished";
my @list=split(",",$batchs[$i]); open(OUT,">$work_dir/$batchs_input[$i]");
foreach my $e (@list){ if($e ne ""){ print OUT ">$e\n".$sequence{"$e"}."\n"; } }
close(OUT); `chmod 770 $work_dir/$batchs_input[$i]`; }

# Starting Jobs In Background
my $process_batch="$lib_dir/process_full_batch.pl"; if(! -f "$process_batch"){
print "$m cannot find PROFILpro lib scripts.\n"; exit 1; } for(my $i=0;$i<$num_batchs;$i++){
my $cmd="$process_batch $db_dir $blastdb $rounds $work_dir $batchs_input[$i] ";
$cmd.="$batchs_output[$i] $batchs_flags[$i] thread_$i"; system("$cmd &"); }

# Waiting For Jobs To Complete
my $num_finished=0; while($num_finished!=$num_batchs){ sleep(60); $num_finished=0;
for(my $i=0;$i<$num_batchs;$i++){ if(-f "$work_dir/$batchs_flags[$i]"){
$num_finished++; } } print "$m $num_finished threads have completed\n"; }

# Retrieving Sequence Profiles
my %finished; my %profile; for(my $i=0;$i<$num_batchs;$i++){
my $error="$m thread #$i failed to extract profiles."; if(-z "$work_dir/$batchs_flags[$i]"){
`rm -rf $work_dir/$batchs_flags[$i] $work_dir/$batchs_input[$i]`;
if(! -f "$work_dir/$batchs_output[$i]"){ print "$error 1\n"; exit 1; }
open(IN,"$work_dir/$batchs_output[$i]"); while(my $l=<IN>){ if($l!~/^>/){ print "$error 2\n";
exit 1; } chomp($l); my $id=substr($l,1); if($sequence{"$id"} eq ""){ print "$error 3\n"; exit 1; }
my @SEQ=split("",$sequence{"$id"}); my $len=$seqlen{"$id"}; my @PROFIL; for(my $j=0;$j<$len;$j++){
if($l=<IN>){ chomp($l); my @d=split(" ",$l); if($#d!=20){ print "$error 4\n"; exit 1; }
if($d[0]!=$SEQ[$j]){ print "$error 5\n"; exit 1; } push(@PROFIL,"$l"); } else{ print "$error 6\n";
exit 1; } } $profile{"$id"}=join("\n",@PROFIL); $finished{"$id"}=1; } close(IN);
`rm -rf $work_dir/$batchs_output[$i]`; } else{ print "$error 7\n"; exit 1; } }

# Writing Sequence Profiles
for(my $i=0;$i<$num_entries;$i++){ if($finished{"$entries[$i]"}!=1){
print "$m unprocessed entries!\n"; exit 1; } } print "$m writing profiles in output...\n\n";
open(OUT,">$output"); for(my $i=0;$i<$num_entries;$i++){ my $id=$entries[$i];
my $prov=$provided{"$id"}; my $profile=$profile{"$id"}; print OUT ">$prov\n$profile\n"; }
close(OUT); `chmod 770 $output; rm -rf $work_dir`;

