#!/usr/bin/perl -w
#
# ddf
#
#use lib '/home/bjornw/Research/git/source/perl/';
#use lib '/afs/pdc.kth.se/home/a/arnee/MODULES/perl5/lib/site_perl/5.6.0/i386-linux/';
#BEGIN {
#    push(@INC,'/users/bjornw/lib/bioperl/');
#    push(@INC,'/work/bjornw/lib/bioperl/');
#    push(@INC,'/Users/bjorn/Research/lib/bioperl/');
#    push(@INC,'/sw/lib/perl5/');
#}
#print join("\n",@INC);
#use Bio::Ext::Align;
use Bio::Pdb;
use Bio::AlignIO;
use Bio::SimpleAlign;
#use Bio::Tools::pSW;
use Bio::LocatableSeq;
use Bio::Seq;
use File::Temp qw/ tempfile /;

# get_Bfactor($input_pdb)
#
# Scans the ATOM records of a PDB file and returns a hash that maps a
# residue id to the B-factor column of the first CA atom seen for that
# residue.  The residue id is characters 21..25 of the record with all
# whitespace removed; the B-factor is characters 60..65, also with
# whitespace removed.
#
# Returns an empty hash (after a warning) when the file cannot be opened.
sub get_Bfactor
{
    my $input_pdb=shift;
    my %b=();
    my $old_resnum="undef";

    open(my $in,'<',$input_pdb) or do
    {
        warn "Cannot open $input_pdb: $!\n";
        return(%b);
    };

    while(my $line=<$in>)
    {
        chomp($line);
        next unless($line=~/^ATOM/);

        my $atomtype=substr($line,13,3);
        my $resnum=substr($line,21,5);
        $resnum=~s/\s+//g;

        # Only the first CA of each residue is recorded; later CA records
        # with the same residue id (e.g. other alternate locations) are
        # skipped.
        if($atomtype=~/CA/ && $old_resnum ne $resnum)
        {
            my $b=substr($line,60,6);
            $b=~s/\s+//g;
            $b{$resnum}=$b;
            $old_resnum=$resnum;
        }
    }
    close($in);

    return(%b);
}

# get_residues_with_CA($pdbfile)
#
# Returns the ATOM records of $pdbfile as one string (one record per line),
# keeping only residues whose collected record text contains "CA".
# A residue is identified by characters 22..26 (residue number plus
# insertion code).  Only records whose alternate-location column
# (character 16) is "A" or blank are kept, and that column is blanked in
# the output.
#
# Dies when the file cannot be opened.
sub get_residues_with_CA
{
    my $pdbfile=shift;
    my $old_resname="undef";
    my $residue="";      # records collected for the residue being read
    my $new_pdb="";      # accumulated output

    open(my $pdb,'<',$pdbfile) || die "Cannot open $pdbfile\n";
    while(my $line=<$pdb>)
    {
        chomp($line);
        next unless($line=~/^ATOM/);

        my $alt_loc=substr($line,16,1);
        my $resno=substr($line,22,4);
        my $insertion_code=substr($line,26,1);
        my $resname="$resno$insertion_code";

        # A change of residue id closes the previous residue: flush it
        # if it had a CA, then start collecting afresh.
        if($resname ne $old_resname && $old_resname ne "undef")
        {
            $new_pdb.=$residue if($residue=~/CA/);
            $residue="";
        }

        if($alt_loc eq "A" || $alt_loc eq " ")
        {
            substr($line,16,1)=" ";
            $residue.=$line."\n";
        }
        $old_resname=$resname;
    }
    close($pdb);

    # Flush the last residue of the file.
    $new_pdb.=$residue if($residue=~/CA/);

    return $new_pdb;
}

sub align # Takes two strings removes all dashes and returns the alignment.
{
    # Aligns two sequences globally with the EMBOSS program "needle"
    # (gap open 1, gap extend 0.5) and returns the two aligned strings.
    # Dashes and all whitespace are removed from the inputs first.
    # Both returned strings are empty when needle produced no parsable
    # output.
    my ($seq1,$seq2)=@_;

    foreach my $seq ($seq1,$seq2)    # aliases the two lexicals
    {
        $seq=~s/-//g;
        $seq=~s/\s+//g;
    }

    # Prefer the platform-specific install location when it exists,
    # otherwise rely on "needle" being found on PATH.
    my %needle_for=('linux'  => "/usr/bin/needle",
                    'darwin' => "/opt/local/bin/needle");
    my $needle="needle";
    if(defined($needle_for{$^O}) && -e $needle_for{$^O})
    {
        $needle=$needle_for{$^O};
    }

    my ($fh1,$file1)=tempfile("/tmp/seq.XXXXXXXX");
    my ($fh2,$file2)=tempfile("/tmp/seq.XXXXXXXX");
    my ($fh3,$file3)=tempfile("/tmp/ali.XXXXXXXX");
    close($fh3);                     # needle writes this file itself
    print $fh1 ">seq1\n$seq1\n";
    close($fh1);
    print $fh2 ">seq2\n$seq2\n";
    close($fh2);

    # The file names come from tempfile(), so they are safe to pass
    # through the shell that performs the redirection.
    system("$needle -aseq $file1 -bseq $file2 -gapopen 1 -gapextend 0.5 -outfile $file3 > /dev/null 2>&1");

    my ($ali_return1,$ali_return2)=parse_needle_output($file3);
    unlink($file1,$file2,$file3);

    return ($ali_return1,$ali_return2);
}

# parse_needle_output($file)
#
# Extracts the two aligned sequences from a needle output file.
# Lines starting with "#" are skipped.  Every other line that matches the
# sequence-row pattern (13-character name field, 7-character position
# field ending in a digit, one space, then a run of word characters and
# dashes) contributes its sequence part alternately to the first and to
# the second sequence.
#
# Returns ($seq1,$seq2).  Prints a hint to STDERR when nothing was parsed,
# which is also what happens when the file cannot be opened.
sub parse_needle_output
{
    my $file=shift;
    my $seq1="";
    my $seq2="";
    my $isFirst=1;    # true when the next sequence row belongs to seq1

    if(open(my $fh,'<',$file))
    {
        while(my $line=<$fh>)
        {
            next if($line=~/^#/);
            next unless($line=~/^(.{13})(.{6}\d)\ ([\w\-]+)/);
            my $seq=$3;
            if($isFirst)
            {
                $seq1.=$seq;
                $isFirst=0;
            }
            else
            {
                $seq2.=$seq;
                $isFirst=1;
            }
        }
        close($fh);
    }

    if(length($seq1) == 0)
    {
        print STDERR "needle from the EMBOSS package needs to be installed.\n";
    }
    return($seq1,$seq2);
}

# Older Smith-Waterman based variant of align(); requires the
# Bio::Tools::pSW extension and a BLOSUM62 matrix at a fixed path.
sub align_legacy # Takes two strings removes all dashes and returns the alignment.
{
    my ($seq1,$seq2)=@_;

    foreach my $seq ($seq1,$seq2)    # aliases the two lexicals
    {
        $seq=~s/-//g;
        $seq=~s/\s+//g;
    }

    # The module is not loaded at file level, so load it on demand.
    require Bio::Tools::pSW;
    my $factory=Bio::Tools::pSW->new('-matrix' => '/Users/bjorn/Research/lib/bioperl-ext/Bio/Ext/Align/blosum62.bla',
                                     '-gap' => 1,
                                     '-ext' => 0);
    my $seq_obj1=Bio::Seq->new(-moltype => 'protein', -seq => $seq1, -id => "seq1");
    my $seq_obj2=Bio::Seq->new(-moltype => 'protein', -seq => $seq2, -id => "seq2");
    my $aln=$factory->pairwise_alignment($seq_obj1,$seq_obj2);

    # fix_alignment() pads the local alignment back out to the full
    # sequences.
    my ($ali_return1,$ali_return2)=fix_alignment($aln,$seq_obj1,$seq_obj2);
    return ($ali_return1,$ali_return2);
}

sub align_id # Takes two strings removes all dashes and returns the alignment.
{
    # Smith-Waterman alignment of two sequences with the matrix file
    # "indent.bla" (gap 1, extension 0).  Dashes and newlines are removed
    # from the inputs first.  Requires the Bio::Tools::pSW extension.
    my ($seq1,$seq2)=@_;

    foreach my $seq ($seq1,$seq2)    # aliases the two lexicals
    {
        $seq=~s/-//g;
        $seq=~s/\n//g;
    }

    # Nothing in this block expands a leading "~" before the path reaches
    # the library, so resolve the home directory here.
    my $matrix='~/lib/bioperl/Bio/Ext/Align/indent.bla';
    if(defined($ENV{HOME}))
    {
        $matrix=~s/^~/$ENV{HOME}/;
    }

    # The module is not loaded at file level, so load it on demand.
    require Bio::Tools::pSW;
    my $factory=Bio::Tools::pSW->new('-matrix' => $matrix,
                                     '-gap' => 1,
                                     '-ext' => 0);
    my $seq_obj1=Bio::Seq->new(-moltype => 'protein', -seq => $seq1, -id => "seq1");
    my $seq_obj2=Bio::Seq->new(-moltype => 'protein', -seq => $seq2, -id => "seq2");
    my $aln=$factory->pairwise_alignment($seq_obj1,$seq_obj2);

    my ($ali_return1,$ali_return2)=fix_alignment($aln,$seq_obj1,$seq_obj2);
    return ($ali_return1,$ali_return2);
}

sub alignseq # Takes two strings removes all dashes and returns the alignment.
{ my ($seq1,$seq2)=@_; $seq1=remove_dashes($seq1); $seq2=remove_dashes($seq2); my ($ali_return1,$ali_return2); my $factory=new Bio::Tools::pSW('-matrix' => '/Users/bjorn/Research/lib/bioperl-ext/Bio/Ext/Align/blosum62.bla','-gap' => 12,'-ext' => 4); my $seq_obj1=Bio::Seq->new(-moltype => 'protein', -seq => $seq1, -id => "seq1"); my $seq_obj2=Bio::Seq->new(-moltype => 'protein', -seq => $seq2, -id => "seq2"); my $aln = $factory->pairwise_alignment($seq_obj1,$seq_obj2); #$factory->align_and_show($seq_obj1,$seq_obj2,*STDOUT); ($ali_return1,$ali_return2)=fix_alignment($aln,$seq_obj1,$seq_obj2); #$seq_obj1->DESTROY; #$seq_obj2->DESTROY; #$factory->DESTROY; #delete($seq_obj1); #delete($seq_obj2); #delete($factory); return ($ali_return1,$ali_return2); } sub remove_ends { #$ali2 has to be the pdbfile. my ($ali1,$ali2)=@_; my $start=0; my $end=0; my $find_start=1; my $find_end=0; my @ali1 = split('',$ali1); my @ali2 = split('',$ali2); my $ali_return1=$ali1; my $ali_return2=$ali2; my @seq_2_ali=(); for(my $i=0;$ieach_seq()) { #print $seq->seq()."\n"; $start[$i]=$seq->start(); $end[$i]=$seq->end(); $ali_seqs[$i]=$seq->seq(); #print $seq."\n"; #print $i."\n"; #print $ali_seqs[$i]."\n"; #print "$start[$i] $end[$i]\n"; $i++; } #print $start[0]."\n".$start[1]."\n"; # Reformat alignment so that it contain all resides and -. 
# print "$ali_seqs[0]\n$ali_seqs[1]\n"; # exit; # Fix the begining if($start[0]!=1) { $ali_seq1.=$seq1->subseq(1,$start[0]-1); $ali_seq2.=_dashes($start[0]-1); } if($start[1]!=1) { $ali_seq1.=_dashes($start[1]-1); $ali_seq2.=$seq2->subseq(1,$start[1]-1); } # Add the alignment $ali_seq1.=$ali_seqs[0]; $ali_seq2.=$ali_seqs[1]; # Fix alignment end; # print "$end[0] ".$seq1->length()."\n"; if($end[0]<$seq1->length()) { my $len=$seq1->length(); $ali_seq1.=$seq1->subseq($end[0]+1,$len); $ali_seq2.=_dashes($len-$end[0]); } #print "$end[1] ".$seq2->length()."\n"; if($end[1]<$seq2->length()) { my $len=$seq2->length(); $ali_seq1.=_dashes($len-$end[1]); $ali_seq2.=$seq2->subseq($end[1]+1,$len); } return ($ali_seq1,$ali_seq2); } sub _stars { my $number=shift; my $str=""; for(my $i=0;$i<$number;$i++) { $str.="*"; } return $str; } sub _dashes { my $number=shift; my $str=""; for(my $i=0;$i<$number;$i++) { $str.="-"; } return $str; } sub merge_multi_ali { # print "In merger\n"; #my (@target_ali_list,@template_ali_list)=@_; my $temp=scalar @_; #The first half of the vector is target my @target_ali_list=@_[0..$temp/2-1]; #the second is template. my @template_ali_list=@_[$temp/2..$temp-1]; #print @target_ali_list; #print "\n"; #print @template_ali_list; #print "\n"; my $target_return=""; my @template_return=(); my $res_count=0; my @res_found=(); my $number_of_seq=scalar @target_ali_list; my @index_vec=(); my $total_ali_length=0; my @temp_str=(); # Sort the target sequences so the one with the most number of dashes in the begin is first. # Remember to put them in the same original order again, when done!!! for(my $i=0;$i<$number_of_seq;$i++) { #print "$template_ali_list[$i]\n\n$target_ali_list[$i]\n\n"; $temp_str[$i]=count_begin_dashes($target_ali_list[$i])." ".$i." ".$template_ali_list[$i]." ".$target_ali_list[$i]." "; } @temp_str=sort numerically_str @temp_str; my @old_index=(); my $trash=""; #Initialize the vectors. And split the sequences. 
for(my $i=0;$i<$number_of_seq;$i++) { ($trash,$old_index[$i],$template_ali_list[$i],$target_ali_list[$i])=split(/\s+/,$temp_str[$i]); push(@index_vec,0); push(@res_found,0); #push(@target_return,""); push(@template_return,""); $total_ali_length+=length $target_ali_list[$i];# if(length $target_ali_list[$i]>$longest_ali); #print $target_ali_list[$i]."\n\n".$template_ali_list[$i]."\n\n"; $target_ali_list[$i]=[split('',$target_ali_list[$i])]; $template_ali_list[$i]=[split('',$template_ali_list[$i])]; } #exit; if($number_of_seq>0) { while(sum(@index_vec)<$total_ali_length) { #print $total_ali_length."\n"; #print sum(@index_vec)."\n"; #print sum(@res_found)."\n"; for(my $i=0;$i<$number_of_seq;$i++) { my $summa=sum(@res_found); my $summa2=sum(@index_vec); #print "@res_found $number_of_seq $summa $summa2 $total_ali_length $index_vec[$i] $target_ali_list[$i][$index_vec[$i]] $template_ali_list[$i][$index_vec[$i]]\n"; if(sum(@res_found) == $number_of_seq) #all @index_vec points a target reside align or not does not matter print 'em all.. { if(defined($target_ali_list[0][$index_vec[0]])) { $target_return.=$target_ali_list[0][$index_vec[0]]; } else { $target_return.="-"; } #print $target_return."\n"; for(my $j=0;$j<$number_of_seq;$j++) { if(defined($template_ali_list[$j][$index_vec[$j]])) { $template_return[$j].=$template_ali_list[$j][$index_vec[$j]]; } else { $template_return[$j].="-"; } $res_found[$j]=0; $index_vec[$j]++; } } elsif(defined($target_ali_list[$i][$index_vec[$i]]) && defined($template_ali_list[$i][$index_vec[$i]]) && $target_ali_list[$i][$index_vec[$i]] eq "-" && $template_ali_list[$i][$index_vec[$i]] ne "-") # unaligned template res. Add that res # to return and "-" to all others which are not unaligned. 
{ for(my $j=0;$j<$number_of_seq;$j++) { if(defined($template_ali_list[$j][$index_vec[$j]]) && $i==$j) { $template_return[$j].=$template_ali_list[$j][$index_vec[$j]]; } elsif(defined($template_ali_list[$j][$index_vec[$j]]) && defined($target_ali_list[$j][$index_vec[$j]]) && $target_ali_list[$j][$index_vec[$j]] eq "-") #if unaligned res { $template_return[$j].=$template_ali_list[$j][$index_vec[$j]]; $index_vec[$j]++; } else { $template_return[$j].="-"; } } $target_return.="-"; $index_vec[$i]++; } elsif(defined($target_ali_list[$i][$index_vec[$i]]) && $target_ali_list[$i][$index_vec[$i]] ne "-") #target residue { $res_found[$i]=1; } } } } # Put the sequences back in order they came in. my @template_return2=(); for(my $i=0;$i$longest_ali); print $target_ali_list[$i]."\n\n".$template_ali_list[$i]."\n\n"; $target_ali_list[$i]=[split('',$target_ali_list[$i])]; $template_ali_list[$i]=[split('',$template_ali_list[$i])]; } #exit; if($number_of_seq>0) { while(sum(@index_vec)<$total_ali_length) { #print $total_ali_length."\n"; #print sum(@index_vec)."\n"; #print sum(@res_found)."\n"; for(my $i=0;$i<$number_of_seq;$i++) { my $summa=sum(@res_found); my $summa2=sum(@index_vec); #print "@res_found $number_of_seq $summa $summa2 $total_ali_length $index_vec[$i] $target_ali_list[$i][$index_vec[$i]] $template_ali_list[$i][$index_vec[$i]]\n"; if(sum(@res_found) == $number_of_seq) #all @index_vec points a target reside align or not does not matter print 'em all.. 
{ if(defined($target_ali_list[0][$index_vec[0]])) { $target_return.=$target_ali_list[0][$index_vec[0]]; } else { $target_return.="-"; } # print $target_return."\n"; for(my $j=0;$j<$number_of_seq;$j++) { if(defined($template_ali_list[$j][$index_vec[$j]])) { $template_return[$j].=$template_ali_list[$j][$index_vec[$j]]; } else { $template_return[$j].="-"; } $res_found[$j]=0; $index_vec[$j]++; } } elsif(defined($target_ali_list[$i][$index_vec[$i]]) && defined($template_ali_list[$i][$index_vec[$i]]) && $target_ali_list[$i][$index_vec[$i]] eq "-" && $template_ali_list[$i][$index_vec[$i]] ne "-") # unaligned template res. Add that res # to return and "-" to all others which are not unaligned. { for(my $j=0;$j<$number_of_seq;$j++) { if(defined($template_ali_list[$j][$index_vec[$j]]) && $i==$j) { $template_return[$j].=$template_ali_list[$j][$index_vec[$j]]; } elsif(defined($template_ali_list[$j][$index_vec[$j]]) && defined($target_ali_list[$j][$index_vec[$j]]) && $target_ali_list[$j][$index_vec[$j]] eq "-") #if unaligned res { $template_return[$j].=$template_ali_list[$j][$index_vec[$j]]; $index_vec[$j]++; } else { $template_return[$j].="-"; } } $target_return.="-"; $index_vec[$i]++; } elsif(defined($target_ali_list[$i][$index_vec[$i]]) && $target_ali_list[$i][$index_vec[$i]] ne "-") #target residue { $res_found[$i]=1; } } } } # Put the sequences back in order they came in. 
my @template_return2=(); for(my $i=0;$i0) # { for(my $j=0;$j=$ranges[$i] && ${$x[$j]}[$k]<$ranges[$i+1]); #print "${$x[$j]}[$k] $ranges[$i+1] i=$i j=$j k=$k\n";# if(not(defined($ranges[$i+1]))); } } #print "\n" if($j==0); } #exit; } if(defined($frac) && $frac==1) { $len=scalar @x; #print $len,"\n"; for(my $i=0;$i$a; } sub numerically_inc { $a<=>$b } sub sum { my @vec=@_; my $sum=0; foreach my $term(@vec) { #print $term,"\n"; $sum+=$term; } return $sum; } sub max { my @vec=@_; my $max=$vec[0]; foreach my $term(@vec) { if($term>$max) { $max=$term; } # print $term,"\n"; } return $max; } sub min { my @vec=@_; my $min=$vec[0]; foreach my $term(@vec) { if($term<$min) { $min=$term; } #print $term,"\n"; } return $min; } sub shift_score { my @vec=@_; my $sum=0; foreach my $term(@vec) { #print $term,"\n"; if($term ne "?") { $sum+=(1/(1+abs($term))); } } return $sum; } sub count_begin_dashes { my $str=shift; my @temp=split('',$str); my $counter=0; for(my $i=0;$i$temp_a[0]; } #####my $special_restraints=generate_top_code($ss_pred,$target_ali_new,$target_seq); sub generate_top_code { #Assumes that the $ali_seq is a subset of $target_seq; my ($pred,$ali_seq,$target_seq)=@_; my($ali_seq2,$target_seq2)=align($ali_seq,$target_seq); #print "$ali_seq2\n\n$target_seq2\n\n$pred\n\n"; #$ali_seq2.='-'; my @ali_seq=split('',$ali_seq2); my ($begin,$end,$start,$stop)=(0,0,0,0); my $get_stop=0; #my @target_seq=split('',$target_seq2); for(my $i=0;$i$start-1) { if($last_pred eq "H") #print alpha restraints { $end=$j-1; $alpha.=" MAKE_RESTRAINTS RESTRAINT_TYPE = 'ALPHA', RESIDUE_IDS = '$begin' '$end'\n"; } elsif($last_pred eq "E") #print strand restraints { $end=$j-1; $strand.=" MAKE_RESTRAINTS RESTRAINT_TYPE = 'STRAND',RESIDUE_IDS = '$begin' '$end'\n"; } } if($pred[$i] eq "H" && $last_pred ne "H") #new helix { $begin=$j; } elsif($pred[$i] eq "E" && $last_pred ne "E") #new strand { $begin=$j; } $last_pred=$pred[$i]; } $j--; if($last_pred eq "H" && $begin>$end) #print alpha restraints, second 
check means that a new begin must have been added since last time.. { $end=$j-1; $alpha.=" MAKE_RESTRAINTS RESTRAINT_TYPE = 'ALPHA', RESIDUE_IDS = '$begin' '$end'\n"; } elsif($last_pred eq "E" && $begin>$end) #print strand restraints { $end=$j-1; $strand.=" MAKE_RESTRAINTS RESTRAINT_TYPE = 'STRAND',RESIDUE_IDS = '$begin' '$end'\n"; } my $return_str="SUBROUTINE ROUTINE = 'special_restraints'\n SET ADD_RESTRAINTS = on\n$alpha$strand"; $return_str.=" RETURN\nEND_SUBROUTINE\n"; #print $return_str; return $return_str; } sub get_start_number { my ($seq1,$seq2)=@_; my ($target_seq,$model_seq)=align($seq1,$seq2); #align_loc($seq1,$seq2); my @res=split(//,$model_seq); my $start=1; for(my $i=0;$i) { my $line=$_; if($line=~/^ATOM/) { my $chain=substr($line,21,1); my $resno=substr($line, 22, 4); if($oldresno ne $resno) { $number++; if($old_chain ne $chain) { $number=$in_number; # $new_file.="TER\n"; } $temp=sprintf("%4d",$number); } substr($line,6,5)=sprintf("%5d",$atomcount); $atomcount++; substr($line, 22, 4)=$temp; substr($line, 26, 1)=" "; $oldresno=$resno; $old_chain=$chain; } $new_file.=$line; } return $new_file; } sub read_in_psipred { my $file=shift; #my $target_seq=shift; my $seq=""; my $ss_pred=""; open(FILE,"$file");# or�die "Cannot open $file.\n"; while() { if(/^Pred:\s[HCE]+/) { my @temp=split(/\s+/); $ss_pred.=$temp[1] if(scalar @temp >=2); } elsif(/^ AA:/) { my @temp=split(/\s+/); $seq.=$temp[2] if(defined($temp[2])); } } close(FILE); #print length($seq)," ",length($ss_pred),"\n"; #print $file,"\n"; if(length($seq)==0 && length($ss_pred)==0) { #print $file,"\n"; open(FILE,"$file");# or�die "Cannot open $file.\n"; while() { if(/\s+\d+/) { chomp; @temp=split(/\s+/); $ss_pred.=$temp[3]; $seq.=$temp[2]; } } } close(FILE); #print "$seq\n$ss_pred\n"; #exit; #generate_top_code($ss_pred,10,110); return ($seq,$ss_pred); } sub get_ss { my ($seq,$seq_pred,$pred)=@_; my $return_ss=""; #my ($seq_pred,$pred)=read_in_psipred($psifile); my 
($ali_real,$ali_pred)=align($seq,$seq_pred);
    my @ali_real=split(//,$ali_real);
    my @pred=split(//,$pred);

    # Spread the prediction over the aligned predicted sequence: a gap
    # stays a gap, every residue consumes the next prediction character.
    my $next=0;
    my $ss_pred_ali=join('',map { $_ eq '-' ? '-' : $pred[$next++] } split(//,$ali_pred));
    my @ss_pred_ali=split(//,$ss_pred_ali);

    # Keep only the columns where the real sequence has a residue.
    foreach my $pos (0..$#ali_real)
    {
        $return_ss.=$ss_pred_ali[$pos] if($ali_real[$pos] ne '-');
    }
    return $return_ss;
}

# get_rsa($seq,$seq_pred,$pred)
#
# List counterpart of get_ss(): $pred is a reference to an array holding
# one value per residue of $seq_pred.  The two sequences are aligned with
# align() and the values are mapped onto the residues of $seq.
# Returns one entry per residue of $seq; the entry is "-" where the
# residue is aligned to a gap in $seq_pred.
sub get_rsa
{
    my ($seq,$seq_pred,$pred)=@_;
    my @pred=@{$pred};

    my ($ali_real,$ali_pred)=align($seq,$seq_pred);
    my @ali_real=split(//,$ali_real);
    my @ali_pred=split(//,$ali_pred);

    # One entry per alignment column of the predicted sequence.
    my $next=0;
    my @pred_ali=map { $_ eq '-' ? '-' : $pred[$next++] } @ali_pred;

    # Keep only the columns where the real sequence has a residue.
    my @return_rsa=map { $pred_ali[$_] } grep { $ali_real[$_] ne '-' } 0..$#ali_real;
    return @return_rsa;
}

# _read_stride_records($file)
#
# Shared reader for the two read_in_stride* functions.  Concatenates
# characters 10..59 of every "SEQ" line and of every "STR" line and stops
# at the first line starting with "LOC".  Blanks are removed from the
# sequence and the structure string is cut to the sequence length.
# Returns ($seq,$ss).  Dies when the file cannot be opened.
sub _read_stride_records
{
    my $file=shift;
    my $seq="";
    my $ss="";

    open(my $fh,'<',$file) || die "Cannot open $file. (bjornlib)\n";
    while(my $line=<$fh>)
    {
        chomp($line);
        $seq.=substr($line,10,50) if($line=~/^SEQ/);
        $ss.=substr($line,10,50) if($line=~/^STR/);
        last if($line=~/^LOC/);
    }
    close($fh);

    # The last SEQ line is padded with blanks; a blank in the STR line is
    # a real state, so trim the structure string by length instead.
    $seq=~s/ //g;
    $ss=substr($ss,0,length($seq));
    return($seq,$ss);
}

# read_in_stride($file)
#
# Returns ($seq,$ss) for a STRIDE output file, with the structure reduced
# to three states: H, G and I become "H", E stays "E", everything else
# becomes "C".
sub read_in_stride
{
    my $file=shift;
    my ($seq,$ss)=_read_stride_records($file);

    my $return_ss=$ss;
    $return_ss=~tr/GI/H/;     # all helix types -> H
    $return_ss=~tr/HE/C/c;    # whatever is not H or E -> C
    return($seq,$return_ss);
}

# read_in_stride_ext($file)
#
# Returns ($seq,$ss) for a STRIDE output file, keeping the STRIDE state
# letters except that blank and "T" are written as "C".
sub read_in_stride_ext
{
    my $file=shift;
    my ($seq,$ss)=_read_stride_records($file);

    $ss=~tr/ T/CC/;
    return($seq,$ss);
}

# _fetch_pdb_archive($url,$file,$label)
#
# Helper for get_pdb(): runs "wget $url" up to three times until
# "$file.Z" exists in the current directory, then unpacks it with
# "gunzip -f".  When $label is defined, "Try N: $label" is printed
# before every attempt.  The commands are run without a shell.
sub _fetch_pdb_archive
{
    my ($url,$file,$label)=@_;
    my $count=0;

    while(!-e "$file.Z" && $count<3)
    {
        print "Try $count: $label\n" if(defined($label));
        system('wget',$url);
        $count++;
    }
    system('gunzip','-f',"$file.Z") if(-e "$file.Z");
    return;
}

# _scop_class_of($sid,$cla_file)
#
# Helper for get_pdb(): returns the fourth whitespace-separated field of
# the first line of $cla_file that contains $sid, or "" when there is no
# such line or the file cannot be read.  The id is matched literally.
sub _scop_class_of
{
    my ($sid,$cla_file)=@_;
    my $class="";

    open(my $cla,'<',$cla_file) or return $class;
    while(my $line=<$cla>)
    {
        next unless(index($line,$sid)>=0);
        my @field=split(' ',$line);
        $class=$field[3] if(defined($field[3]));
        last;
    }
    close($cla);
    return $class;
}

# get_pdb($pdbcode)
#
# Locates the structure file for $pdbcode (case-insensitive) and returns
# its path, or 0 when nothing was found.
#
# Codes shorter than six characters are treated as PDB ids and searched
# for as "pdbXXXX.ent": first in the current directory, then in the local
# mirror directories, and finally by downloading the compressed entry
# from the RCSB FTP areas (current, models, obsolete) into the current
# directory.
#
# Longer codes are treated as SCOP domain ids.  A seven-character id is
# completed with its classification, looked up in the SCOP 1.63
# dir.cla file; the file is then searched for in the SCOP 1.63 and 1.57
# pdb directories.
sub get_pdb
{
    my $pdbcode=shift;
    $pdbcode=lc($pdbcode);

    my $PDBURL="ftp://ftp.rcsb.org/pub/pdb/data/structures/all/pdb/";
    my $OBSOLETE_PDBURL="ftp://ftp.rcsb.org/pub/pdb/data/structures/obsolete/pdb/";
    my $MODEL_PDBURL="ftp://ftp.rcsb.org/pub/pdb/data/structures/models/current/pdb/";
    my $PDBDIR="/afs/pdc.kth.se/projects/sbc/mirror/mirrors/pdb/";
    my $PDBDIR_OBS="/afs/pdc.kth.se/projects/sbc/mirror/mirrors/pdb/obsolete/";
    my $PDBDIR_MODELS="/afs/pdc.kth.se/projects/sbc/mirror/mirrors/pdb/models/current/pdb/";
    my $PDBDIR_MODELS_OBS="/afs/pdc.kth.se/projects/sbc/mirror/mirrors/pdb/models/obsolete/pdb/";
    my $SCOPDIR="/afs/pdc.kth.se/home/a/arnee/structpredict/SCOP/scop-1.63/pdb/";
    my $SCOPDIR2="/afs/pdc.kth.se/home/a/arnee/structpredict/SCOP/scop-1.57/pdb/";
    my $SCOP_CLA="/afs/pdc.kth.se/home/a/arnee/structpredict/SCOP/scop-1.63/data/dir.cla.scop.txt_1.63";

    if(length($pdbcode)<6)
    {
        my $file="pdb$pdbcode.ent";
        my $subdir=lc(substr($pdbcode,1,2));

        return $file if(-e $file);

        # First hit among the mirror directories wins.
        my $return_file="";
        foreach my $dir ($PDBDIR,$PDBDIR_OBS,$PDBDIR_MODELS,$PDBDIR_MODELS_OBS)
        {
            if(-e "$dir$subdir/$file")
            {
                $return_file="$dir$subdir/$file";
                last;
            }
        }

        # No mirror copy: try the three FTP areas in turn, stopping as
        # soon as the unpacked file is present.
        if(!-e "$return_file")
        {
            my $pwd=`pwd`;
            print $pwd."\n";
            print "Trying ordinary URL $file.Z\n";
            _fetch_pdb_archive("$PDBURL$file.Z",$file,"wget $PDBURL$file.Z");
        }
        if(!-e "$file" && !-e "$return_file")
        {
            _fetch_pdb_archive("$MODEL_PDBURL$subdir/$file.Z",$file,"$MODEL_PDBURL$subdir/$file.Z");
        }
        if(!-e "$file" && !-e "$return_file")
        {
            _fetch_pdb_archive("$OBSOLETE_PDBURL$subdir/$file.Z",$file,undef);
        }

        return $file if(-e "$file");
        return $return_file if(-e "$return_file");
        return 0;
    }

    # SCOP file
    if(length($pdbcode)==7)
    {
        $pdbcode.=".".(_scop_class_of($pdbcode,$SCOP_CLA));
    }
    my $subdir=substr($pdbcode,8,1);    # first letter of the classification
    my $file=$SCOPDIR.$subdir."/$pdbcode.pdb";
    my $file2=$SCOPDIR2.$subdir."/$pdbcode.pdb";

    return $file if(-e $file);
    return $file2 if(-e $file2);
    return 0;
}

# seq_ok($seq1,$seq2)
#
# Returns 1 when both sequences still have at least two characters after
# removing every "-", "A", "C", "G" and "T", and 0 otherwise.
sub seq_ok
{
    my $seq1=shift;
    my $seq2=shift;

    $seq1=~tr/-ACGT//d;
    $seq2=~tr/-ACGT//d;

    return 0 if(length($seq1)<2 || length($seq2)<2);
    return 1;
}

sub id
{
    my ($seq1,$seq2)=@_;
    my ($ali1,$ali2)=alignseq($seq1,$seq2);
    my $shortest_length=length($seq1);
$shortest_length=length($seq2) if($shortest_length>length($seq2)); #print "$ali1\n$ali2\n"; my @ali1=split(//,$ali1); my @ali2=split(//,$ali2); my $id_residues=0; for(my $i=0;$ilength($seq2)); #print "$ali1\n$ali2\n"; my @ali1=split(//,$ali1); my @ali2=split(//,$ali2); my $id_residues=0; for(my $i=0;$ilength($seq2)); #print "$ali1\n$ali2\n"; my @ali1=split(//,$ali1); my @ali2=split(//,$ali2); my $id_residues=0; my $len1=0; my $len2=0; for(my $i=0;$i; my $out=join('',@out); return $out; } sub aa321CA { my $file=shift; my $seq=""; my $old_resnum="whatever"; open(PDB,"$file"); while() { if(/^ATOM/) { my $atomno=substr($_, 7, 4); my $atomtype=substr($_, 13, 3); my $resnum=substr($_,21,6); $resnum=~s/\s+//g; #print "$resnum $old_resnum $atomtype\n"; if($atomtype=~/CA/ && $old_resnum ne $resnum) { $res=substr($_,17, 3); $seq.=aa321($res); # print $table{$res}; $old_resnum=$resnum; } } last if(/^ENDMDL/); } close(PDB); # print "\n"; return $seq; } sub aa321ANY { my $file=shift; my $seq=""; my $old_resnum="whatever"; open(PDB,"$file"); while() { if(/^ATOM/) { my $atomno=substr($_, 7, 4); my $atomtype=substr($_, 13, 3); my $resnum=substr($_,21,6); $resnum=~s/\s+//g; #print "$resnum $old_resnum $atomtype\n"; if($old_resnum ne $resnum) { $res=substr($_,17, 3); $seq.=aa321($res); # print $table{$res}; $old_resnum=$resnum; } } last if(/^ENDMDL/); } close(PDB); # print "\n"; return $seq; } sub aa321_resnum { my $file=shift; my $seq=""; my $old_resnum="whatever"; my @resnum=(); open(PDB,"$file"); while() { if(/^ATOM/) { my $atomno=substr($_, 7, 4); my $atomtype=substr($_, 13, 3); my $resnum=substr($_,22,5); # $resnum=~s/\s+//g; #print "$resnum $old_resnum $atomtype\n"; if($atomtype=~/CA/ && $old_resnum ne $resnum) { $res=substr($_,17, 3); $seq.=aa321($res); # print $table{$res}; push(@resnum,$resnum); $old_resnum=$resnum; } } last if(/^ENDMDL/); } close(PDB); # print "\n"; return ($seq,\@resnum); } sub parse_TMscore { my $file=shift; my %data=(); open(TM,$file); while() { 
if(/^RMSD/)
        {
            # Sixth whitespace-separated token of the RMSD line.
            my @t=split(/\s+/);
            $data{'rmsd'}=$t[5];
        }
        if(/^TM-score/)
        {
            my @t=split(/\s+/);
            $data{'TM'}=$t[2];
        }
        if(/^MaxSub-score/)
        {
            my @t=split(/\s+/);
            $data{'MX'}=$t[1];
        }
        if(/^GDT-TS/)
        {
            my @t=split(/\s+/);
            $data{'GDT'}=$t[1];
        }
        if(/^GDT-HA/)
        {
            my @t=split(/\s+/);
            $data{'GDT_HA'}=$t[1];
        }
        # NOTE(review): a line starting with "Length=" overwrites the
        # GDT_HA entry with the captured number; a separate key was
        # presumably intended - confirm before changing.
        if(/^Length=\s+(\d+)/)
        {
            $data{'GDT_HA'}=$1;
        }
    }
    close(TM);
    return(%data);
}

# aa321_resnumANY($file)
#
# Reads the ATOM records of a PDB file up to the first ENDMDL line and
# returns ($seq,\@resnum): the one-letter sequence and, in the same
# order, characters 22..26 of the record (residue number plus insertion
# code, whitespace kept) for every residue.  A residue is counted at the
# first atom that carries a new residue id, whatever the atom is.
# Returns an empty sequence and list (after a warning) when the file
# cannot be opened.
sub aa321_resnumANY
{
    my $file=shift;
    my $seq="";
    my $old_resnum="whatever";
    my @resnum=();

    open(my $pdb,'<',$file) or do
    {
        warn "Cannot open $file: $!\n";
        return ($seq,\@resnum);
    };
    while(my $line=<$pdb>)
    {
        last if($line=~/^ENDMDL/);
        next unless($line=~/^ATOM/);

        my $resnum=substr($line,22,5);
        if($old_resnum ne $resnum)
        {
            $seq.=aa321(substr($line,17,3));
            push(@resnum,$resnum);
            $old_resnum=$resnum;
        }
    }
    close($pdb);

    return ($seq,\@resnum);
}

# aa321CA_legacy($file)
#
# Returns the one-letter sequence of a PDB file by translating the
# residue name of every ATOM record whose atom-name field (characters
# 13..15) contains "CA", up to the first ENDMDL line.  Consecutive CA
# records of one residue are not merged.
# Returns "" (after a warning) when the file cannot be opened.
sub aa321CA_legacy
{
    my $file=shift;
    my $seq="";

    open(my $pdb,'<',$file) or do
    {
        warn "Cannot open $file: $!\n";
        return $seq;
    };
    while(my $line=<$pdb>)
    {
        last if($line=~/^ENDMDL/);
        next unless($line=~/^ATOM/);

        my $atomtype=substr($line,13,3);
        $seq.=aa321(substr($line,17,3)) if($atomtype=~/CA/);
    }
    close($pdb);

    return $seq;
}

# aa321($aa)
#
# Translates a three-letter residue name into its one-letter code.
# Besides the twenty standard residues the table covers ASX, GLX, XXX
# (written as "A") and the modified residues MSE, FME, PCA, 5HP, SAC
# and CCS.  Returns 0 for a name that is not in the table.
sub aa321
{
    my $aa=shift;
    my %aa321=('ALA' => 'A', 'ARG' => 'R', 'ASN' => 'N', 'ASP' => 'D',
               'CYS' => 'C', 'GLN' => 'Q', 'GLU' => 'E', 'GLY' => 'G',
               'HIS' => 'H', 'ILE' => 'I', 'LEU' => 'L', 'LYS' => 'K',
               'MET' => 'M', 'PHE' => 'F', 'PRO' => 'P', 'SER' => 'S',
               'THR' => 'T', 'TRP' => 'W', 'TYR' => 'Y', 'VAL' => 'V',
               'ASX' => 'B', 'GLX' => 'Z', 'XXX' => 'A',
               'MSE' => 'M', 'FME' => 'M', 'PCA' => 'E', '5HP' => 'E',
               'SAC' => 'S', 'CCS' => 'C');

    return(defined($aa321{$aa}) ? $aa321{$aa} : 0);
}

sub aa123
{
    my $aa=shift;
    my %aa123=('A','ALA',
               'R','ARG',
               'N','ASN',
               'D','ASP',
               'C','CYS',
               'Q','GLN',
               'E','GLU',
               'G','GLY',
               'H','HIS',
               'I','ILE',
               'L','LEU',
               'K','LYS',
               'M','MET',
               'F','PHE',
               'P','PRO',
               'S','SER',
               'T','THR',
               'W','TRP',
               'Y','TYR',
               'V','VAL',
               'B','ASX',
               'Z','GLX');
    my $aa3=0;
    if(defined($aa123{$aa}))
    {
        $aa3=$aa123{$aa}
    }
    else
    {
        print STDERR "\"$aa\" is not 
defined!\n"; } return($aa3); } sub sparse_encode { my $seq=shift; #print $seq."\n"; my @seq=split(//,$seq); my $code=""; my $output=""; my %aaspe =('A', "1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ", 'R', "0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ", 'N', "0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ", 'D', "0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ", 'C', "0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ", 'Q', "0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ", 'E', "0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ", 'G', "0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 ", 'H', "0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 ", 'I', "0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 ", 'L', "0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 ", 'K', "0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 ", 'M', "0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 ", 'F', "0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 ", 'P', "0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 ", 'S', "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 ", 'T', "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 ", 'W', "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 ", 'Y', "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 ", 'V', "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 ", 'X', "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 ", ); my $i=0; for($i=0;$i