#!/usr/bin/perl -w
# Consensus:
# VERSION: Version 2 (6 October 2003)
# PURPOSE: Determine and print consensus sequence from an alignment
# of several sequences
# INPUT FILES: File composed of aligned sequences
# Format: Optional header [tab] sequence
# Each sequence must have the same number of nucleotides (or gaps)
# OUTPUT: To screen
# Prints each sequence from input file
# Prints consensus sequence as bottom line
# Consensus:
# Capital letter if single nucleotide occurs at frequency > $highThreshold
# Lower case letter if $lowThreshold < nucleotide freqy < $highThreshold
# Otherwise a dot
############## LIBRARIES AND PRAGMAS ################
use strict;
################ GLOBAL VARIABLES ###################
my @sequences; # Two dimensional array containing the set of sequences
# input sequences. The sequences are split into characters
# so $sequences[$a][$b] is the b'th character of the
# a'th sequence
my $sequence_length; # The length of each sequence.
# All sequences must be of the same length.
my @consensus; # One dimensional array containing the consensus sequence.
# The sequence is split into characters
my $highThreshold = 0.8; # Print consensus base in upper case if this fraction
# of sequences agree.
my $lowThreshold = 0.5; # Print consensus base in lower case if this fraction
# of sequences agree. (Must be at least 0.5, and
# less than $highThreshold.)
my @bases = ("G", "A", "T", "C");
my $LF = "\n"; # Line feed
###################### FILES ########################
my $sequence_file = "NtcA-binding.txt";
################### MAIN PROGRAM #####################
Get_sequences('NtcA-binding.txt');
Calculate_consensus_sequence();
Print_sequences();
Print_consensus();
#################### SUBROUTINES ####################
#### Get_sequences (name of file to be read)
# Opens and reads file
# Fills the @sequences array
# Each sequence is captured from the line,
# rendered upper case, split into characters,
# and stored as a row in @sequences.
sub Get_sequences {
my ($sequencePath) = @_;
my $line;
my $sequence;
open INPUT, "< $sequencePath" or die "Can't open $sequencePath: $!";
$line = ; # Read first line
while(defined $line) { # If still lines in file...
next if ($line =~ /^#/); # Skip if a comment line
next if ($line =~ /^\s*$/); # Skip if... (what's this?)
chomp $line; # Remove end-of-line
$line =~ /^.*\t(.*)$/ # Capture sequence
or die "Don't understand line $. of $sequencePath";
($sequence) = $1; # Line must be of this format
push @sequences, [split("", uc($sequence))];
$sequence_length = length($sequence);
$line = ; # Read next line
}
if (@sequences == 0) { # If size of @sequence is 0 (i.e. no sequences)
die "No sequences found in $sequencePath\n";
}
return @sequences;
}
#### Print_sequences
# Print @sequences, one sequence per line
#
sub Print_sequences {
my $row;
my $column;
for $row (0 .. Size(@sequences) - 1) {
for $column (0 .. $sequence_length - 1) {
print $sequences[$row][$column];
}
print $LF;
}
}
#### calculate_consensus_sequence
#
# For each column, set
#
# $consensus[$column] to "A" if at least $highThreshold fraction of bases
# in the column are "A"
# $consensus[$column] to "a" if the fraction of "A"'s in the column is
# at least $lowThreshold and less than
# $highThreshold.
# ... and similarly for T, G, and C.
#
# Set $consensus[$column] to "." if none of the bases meet those conditions.
sub Calculate_consensus_sequence {
my $column;
my $row;
my @count;
my $b;
for $column (0 .. $sequence_length - 1) {
@count = (0, 0, 0, 0);
for $row (0 .. Size(@sequences) - 1) {
for $b (0 .. Size(@bases) - 1) {
if ($sequences[$row][$column] eq $bases[$b]) {
$count[$b] = $count[$b] + 1;
}
}
}
$consensus[$column] = ".";
for $b (0 .. Size(@bases) - 1) {
if ($count[$b]/@sequences >= $highThreshold) {
$consensus[$column] = $bases[$b]
} elsif ($count[$b]/Size(@sequences) > $lowThreshold) {
$consensus[$column] = lc($bases[$b])
}
}
}
}
#### Print_consensus
# Prints consensus sequence column-by-column
sub Print_consensus {
my $column;
for $column (0 .. $sequence_length - 1) {
print $consensus[$column];
}
print $LF;
}
#### Size (array)
# Gives size of an array
# Just a synonym for the built-in function scalar
sub Size {return scalar(@_)}