#!/bin/bash # Written by Karolis Uziela in 2015 # ------------------------------- Reading in arguments ------------------------------- # SCRIPT_NAME=`basename $0` PROQ3_DIR=`dirname $0` source $PROQ3_DIR/paths.sh # Read in paths function usage { echo " NOTE: It is easier and recommended to run ProQ3/ProQ3D using run_proq3.sh script. If you are running this script manually, make sure you know what you are doing. Usage: $SCRIPT_NAME [parameters] -m/--model [pdb-model] PDB model file to be evaluated -d/--deep [yes/no] default=no Should we use Deep Learning (Theano) instead of SVM? Runs ProQ3D -r/--repack [yes/no] default=yes Should we perform the side chain repacking step? -t/--target_length [Number_of_residues] default='model_length' The global score is calculated as sum_of_local/target_length -k/--keep_files [yes/no] default=no Should we keep the repacked models, svm input files and psiblast files? -l/--local_output [output_file_local] default=[pdb-model].proq3.local The output file for local predictions -g/--global_output [output_file_global] default=[pdb-model].proq3.global The output file for global predictions -o/--rosetta_log [rosetta_log_file] default=[pdb-model].rosetta.log The output file with Rosetta logs --debug_mode [yes/no] default=no Debug mode keeps all temporary files " exit 1 } if [[ $# -lt 2 ]] ; then echo "ERROR: too few suplied arguments" usage fi while [[ $# > 1 ]] do key="$1" case $key in -m|--model) MODEL="$2" shift # past argument ;; -d|--deep) DEEP="$2" shift # past argument ;; -r|--repack) REPACK="$2" shift # past argument ;; -t|--target_length) TARGET_LENGTH="$2" shift # past argument ;; -k|--keep) KEEP="$2" shift # past argument ;; -g|--global_output) GLOBAL_OUT="$2" shift # past argument ;; -l|--local_output) LOCAL_OUT="$2" shift # past argument ;; -o|--rosetta_log) ROSETTA_LOG="$2" shift # past argument ;; -d|--debug_mode) DEBUG="$2" shift # past argument ;; *) echo "ERROR: Unknown argument: $1" usage ;; esac shift # past argument or value done if [[ -n $1 ]]; then echo "ERROR: Argument without any supplied value: $1" usage fi if [[ $MODEL == "" ]] ; then echo "ERROR: -m/--model parameter is not set" usage elif [[ ! -f $MODEL ]] ; then echo "ERROR: model file $MODEL does not exist" usage elif [[ ${MODEL: -4} != ".pdb" ]] ; then echo "WARNING: input file does not have .pdb extension. Creating a copy of input file with .pdb extension..." cp $MODEL $MODEL.pdb for ext in psi mtx ss2 acc ; do if [[ -f $MODEL.$ext && ! -f $MODEL.pdb.$ext ]] ; then cp $MODEL.$ext $MODEL.pdb.$ext fi done MODEL=$MODEL.pdb fi if [[ $TARGET_LENGTH != "" ]] ; then re='^[0-9]+$' if ! [[ $TARGET_LENGTH =~ $re ]] ; then echo "ERROR: -t/--target_length must be a number. You supplied: $TARGET_LENGTH" usage fi fi if [[ $GLOBAL_OUT == "" ]] ; then GLOBAL_OUT=$MODEL.proq3.global fi if [[ $LOCAL_OUT == "" ]] ; then LOCAL_OUT=$MODEL.proq3.local fi if [[ -f $GLOBAL_OUT ]] ; then echo "ERROR: The output file $GLOBAL_OUT already exists. Remove the file before (re)running ProQ3." exit 1 fi if [[ -f $LOCAL_OUT ]] ; then echo "ERROR: The output file $LOCAL_OUT already exists. Remove the file before (re)running ProQ3." exit 1 fi if [[ $DEEP == "no" ]] || [[ $DEEP == "No" ]] || [[ $DEEP == "NO" ]] || [[ $DEEP == "" ]] ; then DEEP="no" elif [[ $DEEP == "yes" ]] || [[ $DEEP == "Yes" ]] || [[ $DEEP == "YES" ]] ; then DEEP="yes" if [[ "$DEEP_INSTALLED" == "no" ]] ; then echo "ERROR: You are trying to run the deep learning version of predictor (ProQ3D), but you don't have python with keras and numpy packages installed" exit 1 fi else echo "ERROR: -d/--deep $DEEP value is not recognized (should be yes or no)" usage fi if [[ $REPACK == "yes" ]] || [[ $REPACK == "Yes" ]] || [[ $REPACK == "YES" ]] || [[ $REPACK == "" ]] ; then R=".repacked" M=".minimized" elif [[ $REPACK == "no" ]] || [[ $REPACK == "No" ]] || [[ $REPACK == "NO" ]] ; then R="" M="" else echo "ERROR: -r/--reppack $REPACK value is not recognized" usage fi if [[ $ROSETTA_LOG == "" ]] ; then MODEL_PATH=`readlink -f $MODEL` ROSETTA_LOG="${MODEL_PATH}.rosetta.log" fi if [[ $KEEP == "no" ]] || [[ $KEEP == "No" ]] || [[ $KEEP == "NO" ]] || [[ $KEEP == "" ]] ; then KEEP="no" elif [[ $KEEP == "yes" ]] || [[ $KEEP == "Yes" ]] || [[ $KEEP == "YES" ]] ; then KEEP="yes" else echo "ERROR: -k/--keep $KEEP value is not recognized (should be yes or no)" usage fi if [[ $DEBUG == "no" ]] || [[ $DEBUG == "No" ]] || [[ $DEBUG == "NO" ]] || [[ $DEBUG == "" ]] ; then DEBUG="no" elif [[ $DEBUG == "yes" ]] || [[ $DEBUG == "Yes" ]] || [[ $DEBUG == "YES" ]] ; then DEBUG="yes" else echo "ERROR: --debug_mode $DEBUG value is not recognized (should be yes or no)" usage fi if [[ ! -f $MODEL.ss2 || ! -f $MODEL.acc || ! -f $MODEL.psi || ! -f $MODEL.mtx ]] ; then echo "ERROR: .ss2 .acc .psi or .mtx file is missing. Run /bin/run_all_external.pl first. Check README for details." exit 1 fi # Filter out HETATM records hetatm=`grep "HETATM" $MODEL | wc -l` if [[ "$hetatm" != "0" ]] ; then grep -v "HETATM" $MODEL > ${MODEL}.no_hetatm.pdb for ext in psi mtx ss2 acc ; do cp $MODEL.$ext ${MODEL}.no_hetatm.pdb.$ext done MODEL=${MODEL}.no_hetatm.pdb fi if [[ $R == ".repacked" ]] ; then if [ ! -f $MODEL$R.ss2 ] ; then cp $MODEL.ss2 $MODEL$R.ss2 cp $MODEL.acc $MODEL$R.acc cp $MODEL.psi $MODEL$R.psi cp $MODEL.mtx $MODEL$R.mtx fi fi # -------------------------------- Main script --------------------------------------- # MODEL_DIR=`dirname $MODEL` CURRENT_DIR=`pwd` BASE=`basename $MODEL .pdb` REPACK_ITERATIONS=1 if [[ ! -f $MODEL$R ]] ; then echo "------------------- Repacking the side chains -----------------" $ROSETTA_BIN/relax.linuxgccrelease -database $ROSETTA_DB -in:file:fullatom -out:file:silent_struct_type binary -nstruct $REPACK_ITERATIONS -relax:script $PROQ3_DIR/ProQ3_scripts/repack_min.script -in:file:s $MODEL -out:file:silent $MODEL$R.silent -relax:constrain_relax_to_start_coords -ignore_zero_occupancy F >>$ROSETTA_LOG if [[ -f $MODEL$R.silent ]] ; then to_extract=`grep ^SCORE $MODEL$R.silent | tail -n +2 | sort -n -k2 | head -n 1 | awk '{print $(NF)}'` $ROSETTA_BIN/extract_pdbs.linuxgccrelease -database $ROSETTA_DB -in:file:silent $MODEL$R.silent -in::file::tags $to_extract -out::prefix $MODEL_DIR/ -ignore_zero_occupancy F >>$ROSETTA_LOG mv $MODEL_DIR/$to_extract.pdb $MODEL$R if [[ $DEBUG == "no" ]] ; then rm $MODEL$R.silent fi else echo "WARNING: Repack failed. Using the original model instead of repacked." cp $MODEL $MODEL$R fi fi if [[ ! -f $MODEL$R.features.proq2.$$.temp ]] ; then echo "------------------- Extracting ProQ2 features -------------------" $ROSETTA_BIN/score.linuxgccrelease -database $ROSETTA_DB -in:file:fullatom -ProQ:basename $MODEL$R -in:file:s $MODEL$R -score:weights ProQ2 -ProQ:output_feature_vector -ignore_zero_occupancy F > $MODEL$R.features.$$.temp sed '/^Outfile:/q' $MODEL$R.features.$$.temp > $MODEL$R.features.$$.temp2 grep ^SVM $MODEL$R.features.$$.temp2 > $MODEL$R.features.proq2.$$.temp if [[ $DEBUG == "no" ]] ; then rm $MODEL$R.features.$$.temp $MODEL$R.features.$$.temp2 fi fi if [[ ! -f $MODEL$R$M ]] ; then echo "------------------- Minimizing energies -------------------" cd $MODEL_DIR /bin/ls $BASE.pdb$R > $BASE.pdb$R.list $ROSETTA_BIN/minimize_with_cst.linuxgccrelease -in:file:l $BASE.pdb$R.list -in:file:fullatom -ddg::out_pdb_prefix minimized_${BASE}_$$ -database $ROSETTA_DB -ddg:min_cst -ignore_zero_occupancy F >>$ROSETTA_LOG # Different Rosetta releases name output files differently. Better be careful and add a long prefix to be sure mv minimized_${BASE}_$$* $BASE.pdb$R$M if [[ $DEBUG == "no" ]] ; then rm $BASE.pdb$R.list fi cd $CURRENT_DIR fi if [[ ! -f $MODEL$R$M.score.$$.temp ]] ; then echo "------------------- Calculating Rosetta energies -------------------" $ROSETTA_BIN/per_residue_energies.linuxgccrelease -in:file:s $MODEL$R$M -database $ROSETTA_DB -out:file:silent $MODEL$R$M.score.$$.temp -score:weights $PROQ3_DIR/weights/talaris2013.wts -ignore_zero_occupancy F >>$ROSETTA_LOG $ROSETTA_BIN/per_residue_energies.linuxgccrelease -in:file:s $MODEL$R$M -database $ROSETTA_DB -out:file:silent $MODEL$R$M.score.$$.temp2 -score:weights $PROQ3_DIR/weights/some_centroid.wts -in:file:residue_type_set centroid -ignore_zero_occupancy F >>$ROSETTA_LOG $ROSETTA_BIN/score.linuxgccrelease -in:file:s $MODEL$R$M -database $ROSETTA_DB -out:file:silent $MODEL$R$M.score.$$.temp3 -score:weights $PROQ3_DIR/weights/global_centroid.wts -in:file:residue_type_set centroid -ignore_zero_occupancy F >>$ROSETTA_LOG fi if [[ ! -f $MODEL$R.proq3.svm ]] ; then echo "------------------- Generating SVM input -------------------" $R_SCRIPT $PROQ3_DIR/ProQ3_scripts/extract-per-residue-features.R $MODEL$R$M.score.$$.temp $MODEL$R$M.features.highres.$$.temp $R_SCRIPT $PROQ3_DIR/ProQ3_scripts/extract-per-residue-centroid-features.R $MODEL$R$M.score.$$.temp2 $MODEL$R$M.features.lowres.$$.temp feat_len=`cat $MODEL$R$M.features.highres.$$.temp | wc -l` cat $MODEL$R$M.score.$$.temp3 | grep "^SCORE:" > $MODEL$R$M.score.$$.temp4 $R_SCRIPT $PROQ3_DIR/ProQ3_scripts/extract-per-residue-centroid-features-global-normalized.R $MODEL$R$M.score.$$.temp4 $MODEL$R$M.features.lowres_global.$$.temp $feat_len $PROQ3_DIR/ProQ3_scripts/svm_to_txt $MODEL$R.features.proq2.$$.temp $MODEL$R.features.proq2.$$.temp2 cut -f 3-176 -d " " $MODEL$R.features.proq2.$$.temp2 > $MODEL$R.features.proq2.$$.temp3 cut -f 1 -d " " $MODEL$R.features.proq2.$$.temp3 > $MODEL$R.features.target.$$.temp cut -f 152-154,155-157,171-172 -d " " $MODEL$R.features.proq2.$$.temp3 > $MODEL$R.features.rsa_ss.$$.temp paste -d " " $MODEL$R.features.target.$$.temp $MODEL$R.features.proq2.$$.temp3 $MODEL$R$M.features.highres.$$.temp $MODEL$R$M.features.lowres.$$.temp $MODEL$R$M.features.lowres_global.$$.temp > $MODEL$R.proq3 paste -d " " $MODEL$R.features.target.$$.temp $MODEL$R.features.proq2.$$.temp3 > $MODEL$R.proq2 paste -d " " $MODEL$R.features.target.$$.temp $MODEL$R.features.rsa_ss.$$.temp $MODEL$R$M.features.highres.$$.temp > $MODEL$R.highres paste -d " " $MODEL$R.features.target.$$.temp $MODEL$R.features.rsa_ss.$$.temp $MODEL$R$M.features.lowres.$$.temp $MODEL$R$M.features.lowres_global.$$.temp > $MODEL$R.lowres for txt_file in $MODEL$R.proq3 $MODEL$R.proq2 $MODEL$R.highres $MODEL$R.lowres ; do $PROQ3_DIR/ProQ3_scripts/txt_to_svm $txt_file $txt_file.svm done if [[ $DEBUG == "no" ]] ; then rm $MODEL$R$M.features.highres.$$.temp $MODEL$R$M.features.lowres.$$.temp $MODEL$R$M.features.lowres_global.$$.temp $MODEL$R.features.proq2.$$.temp $MODEL$R.features.proq2.$$.temp2 $MODEL$R.features.proq2.$$.temp3 $MODEL$R.features.target.$$.temp $MODEL$R$M.score.$$.temp $MODEL$R$M.score.$$.temp2 $MODEL$R$M.score.$$.temp3 $MODEL$R$M.score.$$.temp4 $MODEL$R.features.rsa_ss.$$.temp fi fi if [[ ! -f $GLOBAL_OUT ]] ; then echo "------------------- Making predictions -------------------" if [[ "$DEEP" == "no" ]] ; then $PROQ3_DIR/apps/svm_light/svm_classify $MODEL$R.proq2.svm $PROQ3_DIR/svm_models/ProQ2$R.model $MODEL$R.proq2.pred.$$.temp $PROQ3_DIR/apps/svm_light/svm_classify $MODEL$R.lowres.svm $PROQ3_DIR/svm_models/lowres$R.model $MODEL$R.lowres.pred.$$.temp $PROQ3_DIR/apps/svm_light/svm_classify $MODEL$R.highres.svm $PROQ3_DIR/svm_models/highres$R.model $MODEL$R.highres.pred.$$.temp $PROQ3_DIR/apps/svm_light/svm_classify $MODEL$R.proq3.svm $PROQ3_DIR/svm_models/ProQ3$R.model $MODEL$R.proq3.pred.$$.temp echo "ProQ2 ProQRosCen ProQRosFA ProQ3" > $LOCAL_OUT echo "ProQ2 ProQRosCen ProQRosFA ProQ3" > $GLOBAL_OUT elif [[ "$DEEP" == "yes" ]] ; then KERAS_BACKEND=theano $PYTHON_BIN $PROQ3_DIR/ProQ3_scripts/predict_theano.py $MODEL$R.proq2 $MODEL$R.proq2.pred.$$.temp $PROQ3_DIR/theano_models/casp9-and-casp10${R}.proq2_shuffled_ONLY_L2_ADADELTA_model_500_1e-11_model.json $PROQ3_DIR/theano_models/casp9-and-casp10${R}.proq2_shuffled_ONLY_L2_ADADELTA_model_500_1e-11_weights.h5 KERAS_BACKEND=theano $PYTHON_BIN $PROQ3_DIR/ProQ3_scripts/predict_theano.py $MODEL$R.lowres $MODEL$R.lowres.pred.$$.temp $PROQ3_DIR/theano_models/casp9-and-casp10${R}.lowres_shuffled_ONLY_L2_ADADELTA_model_500_1e-11_model.json $PROQ3_DIR/theano_models/casp9-and-casp10${R}.lowres_shuffled_ONLY_L2_ADADELTA_model_500_1e-11_weights.h5 KERAS_BACKEND=theano $PYTHON_BIN $PROQ3_DIR/ProQ3_scripts/predict_theano.py $MODEL$R.highres $MODEL$R.highres.pred.$$.temp $PROQ3_DIR/theano_models/casp9-and-casp10${R}.highres_shuffled_ONLY_L2_ADADELTA_model_500_1e-11_model.json $PROQ3_DIR/theano_models/casp9-and-casp10${R}.highres_shuffled_ONLY_L2_ADADELTA_model_500_1e-11_weights.h5 KERAS_BACKEND=theano $PYTHON_BIN $PROQ3_DIR/ProQ3_scripts/predict_theano.py $MODEL$R.proq3 $MODEL$R.proq3.pred.$$.temp $PROQ3_DIR/theano_models/casp9-and-casp10${R}.proq3_shuffled_ONLY_L2_ADADELTA_model_500_1e-11_model.json $PROQ3_DIR/theano_models/casp9-and-casp10${R}.proq3_shuffled_ONLY_L2_ADADELTA_model_500_1e-11_weights.h5 echo "ProQ2D ProQRosCenD ProQRosFAD ProQ3D" > $LOCAL_OUT echo "ProQ2D ProQRosCenD ProQRosFAD ProQ3D" > $GLOBAL_OUT else echo "ERROR: invalid --deep parameter" fi paste -d " " $MODEL$R.proq2.pred.$$.temp $MODEL$R.lowres.pred.$$.temp $MODEL$R.highres.pred.$$.temp $MODEL$R.proq3.pred.$$.temp >> $LOCAL_OUT if [[ $TARGET_LENGTH == "" ]] ; then TARGET_LENGTH=`cat $MODEL$R.proq2.pred.$$.temp | wc -l` fi tail -n +2 $LOCAL_OUT | awk -v my_length="$TARGET_LENGTH" '{ sum1 += $1 ; sum2 += $2 ; sum3 += $3 ; sum4 += $4 } END { if (my_length > 0) print sum1 / my_length, sum2 / my_length, sum3 / my_length, sum4 / my_length }' >> $GLOBAL_OUT if [[ $DEBUG == "no" ]] ; then rm $MODEL$R.proq2.pred.$$.temp $MODEL$R.lowres.pred.$$.temp $MODEL$R.highres.pred.$$.temp $MODEL$R.proq3.pred.$$.temp $MODEL$R.proq3 $MODEL$R.proq2 $MODEL$R.highres $MODEL$R.lowres fi fi if [[ "$hetatm" != "0" ]] ; then rm ${MODEL} # remove .pdb.no_hetatm.pdb fi if [[ $KEEP == "no" ]] && [[ $DEBUG == "no" ]] ; then if [[ $R == ".repacked" ]] ; then rm $MODEL$R $MODEL$R$M $MODEL$R.ss2 $MODEL$R.acc $MODEL$R.psi $MODEL$R.mtx fi rm $MODEL$R.proq3.svm $MODEL$R.proq2.svm $MODEL$R.highres.svm $MODEL$R.lowres.svm $MODEL.ss2 $MODEL.acc $MODEL.psi $MODEL.mtx $MODEL.rosetta.log elif [[ "$hetatm" != "0" ]] ; then for i in ${MODEL}* ; do newname=`echo $i | sed -e 's/.no_hetatm.pdb//'` # move .pdb.no_hetatm.pdb.repacked to .pdb.repacked and so on mv $i $newname done fi