#!/usr/bin/perl -w # FindMatches # VERSION: 2 (18 September 2004) # PURPOSE: Find matches of pattern in genomic sequence # Reports exact matches and matches off by one # INPUT FILES: Sequence in FastA format # OUTPUT FILES: None. Output to monitor. # # CHANGES: Version 2 uses warnings ############## LIBRARIES AND PRAGMAS ################ use strict; use warnings; #################### CONSTANTS ###################### my $true = 1; my $false = 0; my $LF = "\n"; # Line feed my $pattern = "AAAAAAAA"; # Sequence to be found in input file #################### VARIABLES ###################### my $header; # Header of FastA-formatted input file my $sequence; # Sequence of input file my @exact_matches; # Stores start and stop coordinates of exact # matches of pattern in sequence my @inexact_matches; # Stores start and stop coordinates of matches # off by no more than one mismatch ###################### FILES ######################## my $sequence_file = 'AnabSeq.nt'; ################### MAIN PROGRAM #################### # Get header and sequence from FastA-formatted file ($header, $sequence) = get_genome_sequence($sequence_file); # Find and print out all the exact matches @exact_matches = match_positions($pattern, $sequence); if (@exact_matches) { print "Exact matches:", $LF; print_matches(@exact_matches); } # Now do the same, but allow one mismatch @inexact_matches = match_positions(One_mismatch($pattern), $sequence); if (@inexact_matches) { print "Matches with one possible mismatch", $LF; print_matches(@inexact_matches); } #################### SUBROUTINES #################### #### GET_GENOME_SEQUENCE (file_path) # Extracts header and sequence from given sequence file in FastA format # Header has format >header information sub get_genome_sequence { my ($file_path) = @_; my $header; # header of FastA-formatted file my $sequence; # DNA sequence of file my $line; # One line of input from file open GENOME, "<$file_path" or die "Can't open $file_path: $!\n"; $line = ; # Get first line (containing header) ($header) = ($line =~ /^>(.*)/); $line = ; while (defined $line) { chomp $line; # Remove line break $line = uc $line; # Make the line uppercase; $sequence = $sequence . $line; # Append the line to the sequence # last if length($sequence) > 60000; $line = ; # Read next line } return ($header, $sequence); } #### MATCH_POSITIONS (pattern, sequence) # Finds all matches of pattern within seqeunce # Searches direct strand and then other strand # Returns start and end points of each match in an array, followed by # "D" if found on direct strand or "R" if found on reverse strand # Results are sorted by start points # $& is a built-in Perl variable holding the string matched by the last # match operation # sprintf "%7d" formats the start and end points to pad the high order end # with blanks. This allows them to be sorted in proper numerical order # even though they're stored as character strings. # pos() is a built-in Perl function returning the position after the end of # the last match # Recall that string coordinates start with 0, not 1 sub match_positions { my $pattern; # Given pattern my $sequence; # Given sequence. Serves for both strands. my $match_start; # Starting coordinate of match my $match_end; # Ending coordinate of match my @results; # start coord..end coord for each match ($pattern, $sequence) = @_; # Extract parameters from list # Find matches in direct strand while ($sequence =~ /$pattern/g) { my $match_start = sprintf "%7d", pos($sequence) - length($&) + 1; my $match_end = sprintf "%7d", pos($sequence); push @results, "$match_start .. $match_end D"; } # Find matches in reverse strand $sequence = reverse(Complement($sequence)); while ($sequence =~ /$pattern/g) { my $match_start = sprintf "%7d", length($sequence) - (pos($sequence) - length($&) + 1) + 1; my $match_end = sprintf "%7d", length($sequence) - pos($sequence) + 1; push @results, "$match_start .. $match_end R"; } return sort(@results); } #### ONE_MISMATCH (pattern) # Makes a complex pattern in which the original pattern is replaced by one # that permits a mismatch at any one position # Strategy is to go through the original pattern one character at a time, # replacing each A, C, G, or T with "." to match anything # Each time a letter is found, the string (with the letter replaced by ".") # is joined to the growing new pattern # The new pattern produced will be: # (subpattern|subpattern|subpattern|...|subpattern) # A match is therefore produced if ANY of the subpatterns are matched. # This algorithm will fail for certain complicated patterns where A, C, G, or T # appear in strange circumstances sub One_mismatch { my ($original_pattern) = @_; # Extract original pattern from parameter list my $new_pattern = "("; # The new pattern begins "(" my $subpattern; # Holds a pattern with one letter replaced my $character; # One character from original pattern my $position; # Position within pattern from 0 to end of pattern for $position (0 .. length($original_pattern) - 1) { $character = substr($original_pattern, $position, 1); if ($character eq ".") {$character = '\.'}; # So Perl interprets . as period if ("ACGT" =~ /$character/) { # If character is a nucleotide... $subpattern = $original_pattern; substr($subpattern, $position, 1) = "."; # Replace character with "." if ($position > 0) {$new_pattern .= "|"}; # Put "|" (or) before every # subpattern except first $new_pattern .= $subpattern; # Add (.=) subpattern to growing # new pattern } } $new_pattern .= ")"; # Add final ")" to end return $new_pattern; } #### COMPLEMENT (sequence) # Returns complement of sequence (n.b. NOT the reverse complement) # Task of producing complementary strand in 5'->3' orientation completed # outside of this function by Perl's reverse sub Complement { my ($sequence) = @_; $sequence =~ tr/ACGTRYNX/TGCAYRNX/; return $sequence; } #### PRINT_MATCHES # Format array for printing # Puts each element on separate line sub print_matches { my (@matches) = @_; foreach my $match (@matches) { print $match, $LF; } }