#!/usr/bin/perl -w # Consensus: # VERSION: Version 2 (6 October 2003) # PURPOSE: Determine and print consensus sequence from an alignment # of several sequences # INPUT FILES: File composed of aligned sequences # Format: Optional header [tab] sequence # Each sequence must have the same number of nucleotides (or gaps) # OUTPUT: To screen # Prints each sequence from input file # Prints consensus sequence as bottom line # Consensus: # Capital letter if single nucleotide occurs at frequency > $highThreshold # Lower case letter if $lowThreshold < nucleotide freqy < $highThreshold # Otherwise a dot ############## LIBRARIES AND PRAGMAS ################ use strict; ################ GLOBAL VARIABLES ################### my @sequences; # Two dimensional array containing the set of sequences # input sequences. The sequences are split into characters # so $sequences[$a][$b] is the b'th character of the # a'th sequence my $sequence_length; # The length of each sequence. # All sequences must be of the same length. my @consensus; # One dimensional array containing the consensus sequence. # The sequence is split into characters my $highThreshold = 0.8; # Print consensus base in upper case if this fraction # of sequences agree. my $lowThreshold = 0.5; # Print consensus base in lower case if this fraction # of sequences agree. (Must be at least 0.5, and # less than $highThreshold.) my @bases = ("G", "A", "T", "C"); my $LF = "\n"; # Line feed ###################### FILES ######################## my $sequence_file = "NtcA-binding.txt"; ################### MAIN PROGRAM ##################### Get_sequences('NtcA-binding.txt'); Calculate_consensus_sequence(); Print_sequences(); Print_consensus(); #################### SUBROUTINES #################### #### Get_sequences (name of file to be read) # Opens and reads file # Fills the @sequences array # Each sequence is captured from the line, # rendered upper case, split into characters, # and stored as a row in @sequences. sub Get_sequences { my ($sequencePath) = @_; my $line; my $sequence; open INPUT, "< $sequencePath" or die "Can't open $sequencePath: $!"; $line = ; # Read first line while(defined $line) { # If still lines in file... next if ($line =~ /^#/); # Skip if a comment line next if ($line =~ /^\s*$/); # Skip if... (what's this?) chomp $line; # Remove end-of-line $line =~ /^.*\t(.*)$/ # Capture sequence or die "Don't understand line $. of $sequencePath"; ($sequence) = $1; # Line must be of this format push @sequences, [split("", uc($sequence))]; $sequence_length = length($sequence); $line = ; # Read next line } if (@sequences == 0) { # If size of @sequence is 0 (i.e. no sequences) die "No sequences found in $sequencePath\n"; } return @sequences; } #### Print_sequences # Print @sequences, one sequence per line # sub Print_sequences { my $row; my $column; for $row (0 .. Size(@sequences) - 1) { for $column (0 .. $sequence_length - 1) { print $sequences[$row][$column]; } print $LF; } } #### calculate_consensus_sequence # # For each column, set # # $consensus[$column] to "A" if at least $highThreshold fraction of bases # in the column are "A" # $consensus[$column] to "a" if the fraction of "A"'s in the column is # at least $lowThreshold and less than # $highThreshold. # ... and similarly for T, G, and C. # # Set $consensus[$column] to "." if none of the bases meet those conditions. sub Calculate_consensus_sequence { my $column; my $row; my @count; my $b; for $column (0 .. $sequence_length - 1) { @count = (0, 0, 0, 0); for $row (0 .. Size(@sequences) - 1) { for $b (0 .. Size(@bases) - 1) { if ($sequences[$row][$column] eq $bases[$b]) { $count[$b] = $count[$b] + 1; } } } $consensus[$column] = "."; for $b (0 .. Size(@bases) - 1) { if ($count[$b]/@sequences >= $highThreshold) { $consensus[$column] = $bases[$b] } elsif ($count[$b]/Size(@sequences) > $lowThreshold) { $consensus[$column] = lc($bases[$b]) } } } } #### Print_consensus # Prints consensus sequence column-by-column sub Print_consensus { my $column; for $column (0 .. $sequence_length - 1) { print $consensus[$column]; } print $LF; } #### Size (array) # Gives size of an array # Just a synonym for the built-in function scalar sub Size {return scalar(@_)}