#!/usr/bin/perl -w # ThreadProtein # VERSION: Version 2.5 (17 Nov 2003) # PURPOSE: Threads the amino acid sequence of one protein # through the three-dimensional structure of another # # INPUT FILES: # Structure file: PDB format. Must have only one chain, corresponding # to known sequence. # # Alignment file: FastA format (output as PIR format by Clustal) # Leading and lagging blanks are OK # SEQUENCES MUST BE ALIGNED! (i.e. with gaps if necessary) # First sequence must be that also represented by structure file # = Known sequence # Second sequence must be that to be superimposed on known structure # = threaded sequence # # > [header line for sequence with known structure] # [optional blank line] # [amino acid sequence, with gaps (-) for alignment] # * [optional] # > [header line for sequence to be threaded] # [optional blank line] # [amino acid sequence, with gaps (-) for alignment] # * [optional] # # OUTPUT FILES: Description of format of output files # New structure file: PDB format # # NO CHANGE: Amino acids common between the two sequences go in chain A # REPLACEMENTS: Amino acids changed in threaded sequence go in chain B # Amino acid called new name in first atom of residue (others unchanged) # INSERTIONS: Amino acids in threaded sequence but not original go in chain C # No attempt made to find positions of the new atoms. # Only first atom given # Inserted atom given meaningless number # DELETIONS: Amino acids in original but not in threaded sequence go in chain D # Informational lines (e.g. Title, source, etc) copied unchanged # therefore information may no longer be appropriate # # DIFFERS FROM Version 1: # Uses AA_module and FastA_module ############## LIBRARIES AND PRAGMAS ################ use strict; use AA_module; # Interconversion of amino acid name formats use FastA_module; # Reads FastA-formatted files #################### CONSTANTS ###################### my $true = 1; my $false = 0; my $LF = "\n"; # Line feed my $tab = "\t"; my $gap = "-"; # Form of gap used in sequence alignment #################### VARIABLES ###################### my $known_seq; # Sequence with known structure my $known_seq_header; # Header of that sequence my $threaded_seq; # Sequence to be superimposed on known structure my $threaded_seq_header; # Header of that sequence my $comparison_summary; # Distillation of alignment of the two sequences: # M = match R = replacement I = insertion D = deletion # (e.g. D indicates an amino acid in known sequence but # not threaded sequence) my %atom_lines; # Holds atom coordinate lines of PDB format for all chains except chain A ###################### FILES ######################## my $alignment_file = "seq.pir"; my $structure_file = "1DLI.pdb"; my $output_file = "Mesorhizobium_UDPglcDH4.pdb"; ################### MAIN PROGRAM #################### # STRATEGY # Read both sequences # Compare sequences to determine which positions are same, which different, and where gaps # (information stored in $comparison_summary) # Copy PDB file, line for line... # ...except for ATOM lines (each contains coordinates of an atom) # Separate replacements, insertions, and deletions into different mythical chains # Amino acids that are replaced are torn out of chain A and their replacements put in chain B # Amino acids that are inserted are put in chain C # Amino acids that are deleted are put in chain D Read_FastA_sequences ($alignment_file); ($known_seq_header, $known_seq) = Get_sequence_info(1); ($threaded_seq_header, $threaded_seq) = Get_sequence_info(2); $comparison_summary = Analyze_alignment($known_seq, $threaded_seq); Create_new_structure_file ( ); print "Threading completed"; #################### SUBROUTINES #################### ##### ANALYZE_ALIGNMENT (known sequence, threaded sequence) # Analyze alignment at each position # Put I (insertion) where gap in sequence with known structure # Put D (delection) where gap in sequence to be threaded # Put M (match) where both sequences match # Put R (replacement) where sequences differ sub Analyze_alignment { my ($known_seq, $threaded_seq) = @_; my $comparison = ""; # Holds information for each position of the # sequence comparison: same? insertion? etc. my $pos; # Current position of sequences under consideration if (not length($known_seq) == length($threaded_seq)) { die "The two sequences must be the same length! $LF" . "(You may have forgotten to align them)"} if (not substr($known_seq, -1, 1) eq "*") {$known_seq .= "*"} if (not substr($threaded_seq, -1, 1) eq "*") {$threaded_seq .= "*"} foreach $pos (0 .. length($known_seq) - 1) { if ( substr($known_seq, $pos, 1) eq $gap ) {$comparison .= "I"} elsif ( substr($threaded_seq, $pos, 1 ) eq $gap) {$comparison .= "D"} elsif ( not substr($known_seq, $pos, 1) eq substr($threaded_seq, $pos, 1) ) {$comparison .= "R"} else {$comparison .= "M"} } return $comparison; } ##### CREATE_NEW_STRUCTURE_FILE # Copy pdb file and modify the ATOM records according to the # alignment summary made in Analyze_alignment sub Create_new_structure_file { my $line; # Current line from input PDB file my $newline; # Line under construction for chains C and D my $previous_line; # Needed for special case when handling TER line my $residue; # Number of current amino acid in known sequence my $last_residue = True_size($known_seq); # Number of last amino acid in known sequence my $new_residue; # Amino acid from threaded sequence my $residue_status; # Match, replacement, etc my $linetype; # Identifying tag at the beginning of each line of PDB file my $gaps_reached_in_known_seq = 0; # Needed to translate position in gapped alignment to # position in amino acid sequence my $terminus_found = $false; # True when TER line encountered my $chains_printed = $false; # True when all the chains have been printed open OLD_STRUCTURE, "<$structure_file" or die "Can't open $structure_file: $!\n"; open OUTPUT, ">$output_file" or die "Can't open $output_file: $!\n"; $line = ; while (defined $line) { chomp $line; $linetype = substr($line, 0, 4); $newline = undef; if ($linetype eq "ATOM" or $linetype eq "TER ") { # Change only ATOM lines $residue = substr($line, 22, 4) - 1 + $gaps_reached_in_known_seq; # Perl residue starts with 0 if ($linetype eq "TER ") { # PDB file numbers terminal residue $residue = $residue + 1; # same as last residue. Fix this. $terminus_found = $true} $residue_status = substr($comparison_summary, $residue, 1); if ($residue_status eq "M") { # Matched line is already OK print OUTPUT $line, $LF; # Output original ATOM line $previous_line = $line; # Save $line for insertions after end of seq $line = ; # Read next line } elsif ($residue_status eq "R") { # Replacements go in chain B $new_residue = substr($threaded_seq, $residue, 1); $line = Update_residue_name($line, $new_residue); $line = Update_chain_ID($line, "B"); push @{$atom_lines{"B"}}, $line; $previous_line = $line; # Save $line for insertions after end of seq $line = ; # Read next line } elsif (uc($residue_status) eq "D") { # Deletions go in chain D $newline = Update_chain_ID($line, "D"); push @{$atom_lines{"D"}}, $newline; $previous_line = $newline; # Save $line for insertions after end of seq $line = ; # Read next line } elsif ($residue_status eq "I") { # Insertions go in chain C $last_residue = $last_residue + 1; $new_residue = substr($threaded_seq, $residue, 1); if ($linetype eq "TER ") { # PDB file numbers terminal residue $newline = $previous_line} # Restore coords of previous residue else {$newline = $line} $newline = Update_residue_name($newline, $new_residue); $newline = Update_chain_ID($newline, "C"); $newline = Update_residue_number($newline, $last_residue); $gaps_reached_in_known_seq = $gaps_reached_in_known_seq + 1; push @{$atom_lines{"C"}}, $newline; } else { # This should never happen print "ERROR in:", $LF, $line, $LF, " Residue: $residue", $LF, " Residue status: $residue_status", $LF; $line = ; # Read next line } } else { # All Non-ATOM/TER lines if ($terminus_found and not $chains_printed) { Output_extra_chains ( ); # Output chains B-D after A is finished $chains_printed = $true} print OUTPUT $line, $LF; # Output original line $line = ; # Read next line } } close OLD_STRUCTURE; close OUTPUT; } ##### OUTPUT_EXTRA_CHAINS # Outputs extra chains (beyond A) sub Output_extra_chains { foreach my $chain (sort (keys %atom_lines)) { foreach my $atom ( 0 .. Size(@{$atom_lines{$chain}}) - 1 ) { print OUTPUT $atom_lines{$chain}[$atom], $LF; } } } ##### TRUE_SIZE (sequence with gaps) # Gives the length of a sequence, excluding gap characters sub True_size { my ($seq) = @_; my $number_of_gaps = ($seq =~ s/$gap/$gap/g); return length($seq) - $number_of_gaps; } ##### UPDATE_CHAIN_ID (line, chain ID) # Updates the chain ID for the given ATOM record. # ATOM record is fixed width column, chain ID is the 22nd character # Input: # $line: text of line to be changed # chainID$: new chainID to be written to the record (e.g. A, B, C, D) sub Update_chain_ID { my ($line, $chainID) = @_; if (not length($chainID) == 1) { # Chain ID should ALWAYS be length 1 die "ERROR: Chain ID ($chainID) not of length 1"; } substr($line, 21, 1) = $chainID; return $line; } ##### UPDATE_RESIDUE_NAME (line, residue name) # Updates the residue name for the given ATOM record. # ATOM record is fixed width column, chain ID is the 18th to 20th character # Input: # $line: text of line to be changed # $residue_name: new residue name to be written to the record (3-letter aa code) sub Update_residue_name { my ($line, $residue_name) = @_; if ($residue_name eq "INS" or $residue_name eq "DEL") { substr($line, 17, 3) = $residue_name; } else { substr($line, 17, 3) = uc( $aa_3_letter_code{$residue_name} ); } return $line; } ##### UPDATE_RESIDUE_NUMBER (line, residue number) # Updates the residue number for the given ATOM record. # ATOM record is fixed width column, chain ID is the 23rd to 26th character # Input: # $line: text of line to be changed # $residue_number: new number (greater than length of sequence) sub Update_residue_number { my ($line, $residue_number) = @_; substr($line, 22, 4) = sprintf ("%4d", $residue_number); return $line; } ##### SIZE (array) # Returns size of array # Synonym for scalar (array) sub Size {return scalar(@_)}