#!/usr/bin/perl -w use strict; ################# Parameters ######################## my $B = 0.1; # Weighting factor for pseudocounts my $threshold = -2**30; # Save motifs with scores greater than $threshold # Set initially to a very low number. # Changes as hits are found. my $info_threshold = 1.5; # Column must have at least this information content # to be used in comparisons my $number_to_save = 300; # number of top scores and coordinates to save my %bg_frequency = # nucleotide frequencies of overall sequences ("A" => 0.33264, "T" => 0.33264, "G" => 0.16736, "C" => 0.16736); ###################### FILES ######################## # motif_input: input file of aligned motifs, produced by Meme # sequence_file: input file containing sequences in FA format # motif_hits: output file of best motifs within sequence file # Each line is name of sequence, score, start_coord, sequence my $motif_input = '71NpNtsm.txt'; my $sequence_file = 'NostocChromosome.nt'; my $motif_hits = 'motif.hits'; ############### GLOBAL MOTIF VARIABLES ################ my $motif_length; # length of motifs my @informational; # holds those positions in the motif that contribute # to the motif -- i.e. those which are not # don't-care positions. my %n; # number of occurrences of nucleotides at each # position my %log_q; # $log_q{$nucleotide}[$position] is the log of the # frequency of the nucleotide at that position my %log_p; # $log_p{$nucleotide} is the background frequency # of the nucleotide my @motif_sequences; # holds sequences in training set my $N; # holds number of sequences in training set my $stars; # holds prior information on informational positions ################### MAIN PROGRAM #################### print "Information content threshold: $info_threshold\n"; print "Saving top $number_to_save hits\n"; # Set the global motif variables $motif_length, @informational, # %log_q, and %log_p get_motif(); calc_motif_frequency(); calc_background_frequency(); if ($stars) { record_information_content() } else { calc_information_content() } my $good_positions = @informational; print "$good_positions out of $motif_length positions are informational\n"; # Find and save hits open_sequence_file(); while (my($seq_name, $sequence) = get_new_sequence()) { analyze_sequence(shorten($seq_name), $sequence); } print_hits(); #################### SUBROUTINES #################### sub shorten { my ($long) = @_; # Return just the first word of $long my ($short) = $long =~ /(\w+)/; return $short; } ################# MOTIF SUBROUTINES ################# sub get_motif { # Return array of aligned sequences from output of Meme # Set the global variables $motif_length and @informational # Extract informational line and motif sequences # open MOTIF_INPUT, "<$motif_input" or die "Can't open $motif_input: $!\n"; while (<MOTIF_INPUT>) { if (/\b([ATCG]+)\b/i) { push @motif_sequences, uc $1 } elsif (/^\s+([\s\*\.]*\*)$/) { $stars = $1 } } close MOTIF_INPUT; # Complain and die unless we found some motifs and they are all the # same length # @motif_sequences > 0 or die "No motif sequences found in $motif_input\n"; $motif_length = length($motif_sequences[0]); foreach my $motif (@motif_sequences) { length($motif) == $motif_length or die "Length of motif $motif in $motif_input is not $motif_length\n"; } } sub calc_motif_frequency { # Accumulate nucleotide totals at each position # Calculate the adjusted frequency at each position. # # The actual frequency is # # n{nucleotide}[position] / N , # where: # n{nucleotide}[position] is the number of times the nucleotide appears # at the position # N is the number of motifs found in the training session, # # The adjusted formula is a little more complicated: # # n{nucleotide}[position] + B * bg_pct[nucleotide] # q{nucleotide}[position] = ------------------------------------------------ # N + B # where n and N are as before, and: # q{nucleotide}[position] is the adjusted frequency of the nucleotide # at the position # B is the weighting constant for pseudocounts # bg_pct[nucleotide] is the fraction that the nucleotide appears in total # DNA # # We actually store $log_q{$nucleotide}[position], the log of the adjusted # frequency, rather than the frequency itself. # Calculate %n # foreach my $nucleotide ("A", "C", "G", "T") { foreach my $position (0..$motif_length) { $n{$nucleotide}[$position] = 0; } } foreach my $motif (@motif_sequences) { foreach my $position (0..$motif_length-1) { my $nucleotide = substr($motif, $position, 1); $n{$nucleotide}[$position] = $n{$nucleotide}[$position] + 1; } } # Calculate %log_q # $N = @motif_sequences; foreach my $nucleotide ("A", "C", "G", "T") { foreach my $position (0..$motif_length) { $log_q{$nucleotide}[$position] = log( ($n{$nucleotide}[$position] + $B * $bg_frequency{$nucleotide}) / ($N + $B) ); } } } sub calc_background_frequency { # Calculate background percentage (for pseudocounts) foreach my $nucleotide ("A", "C", "G", "T") { $log_p{$nucleotide} = log($bg_frequency{$nucleotide}); } } ######## INFORMATION CONTENT ROUTINES ################# # # Here are two routines to initialize @informational, which tells # the rest of the program which positions (columns) are worth bothering # about. # sub record_information_content { # Informational positions were specified in input file by a line of # asterisks ($stars), so initialize @informational from that. foreach my $position (0..$motif_length-1) { push @informational, $position if substr($stars, $position, 1) eq "*"; } } sub calc_information_content { # Determine information content for each column and on that basis # initialize @informational foreach my $position (0..$motif_length-1) { my $info = 0; foreach my $nucleotide ("A", "C", "G", "T") { my $fraction = $n{$nucleotide}[$position]/$N; if ($fraction > 0) { my $log2fraction = - log($fraction) / log(2); $info = $info + $fraction * $log2fraction; } } push @informational, $position if $info < $info_threshold; } } ################# SCORING SUBROUTINES ################# sub analyze_sequence { my ($name, $sequence) = @_; my $region_start; my $subregion; foreach $region_start (0 .. length($sequence) - $motif_length) { if ($region_start % 1000 == 0) {print $region_start/1000,"\t"} $subregion = substr($sequence, $region_start, $motif_length); my $score = score_of_region($subregion); if ($score > $threshold) { store_hit_information($name, $score, $region_start, $subregion) } } } sub score_of_region { my ($region) = @_; # The score is Sum [log_q - log_p] at each position # This means that the closer the sequence is to the set that # produced q, the higher the score my $score = 0; foreach my $position (@informational) { my $nucleotide = substr($region, $position, 1); if ($nucleotide eq ".") {return $threshold - 1} $score += $log_q{$nucleotide}[$position] - $log_p{$nucleotide}; } return $score } my @hit_info; # Information about each high-scoring sequence sub store_hit_information { # Store the hit information. If we've accumulated too many hits, # discard all but the highest-scoring ones. my ($name, $score, $start, $subregion) = @_; push @hit_info, [$score, $name, $start, $subregion]; if (@hit_info > $number_to_save * 7) { sort_and_discard_excess_values(); my $best_score = $hit_info[0][0]; my $last_saved_score = $hit_info[$number_to_save - 1][0]; printf "\nRange: %1.1f to %1.1f\t\t", $last_saved_score, $best_score; } } sub sort_and_discard_excess_values { @hit_info = reverse sort { $$a[0] <=> $$b[0] } @hit_info; while (@hit_info > $number_to_save) { pop @hit_info } } sub print_hits { # Print the highest scoring hits. sort_and_discard_excess_values(); open HITS, ">$motif_hits" or die "Can't open $motif_hits: $!\n"; foreach my $info (@hit_info) { my ($score, $name, $start, $subregion) = @$info; printf HITS "%9s %6.1f %8d %s\n", $name, $score, $start, $subregion; } close HITS } ############# SEQUENCE INPUT SUBROUTINES ############# my $header_line; # The header line of the next sequence (or undefined if # there are no more sequences in $sequence_file sub open_sequence_file { # Open the sequence file, and initialize $header_line to (what should be) # the header line of the first sequence # open SEQUENCE_INPUT, "<$sequence_file" or die "Can't open $sequence_file: $!\n"; $header_line = <SEQUENCE_INPUT>; } sub get_new_sequence { # Return the next sequence in the input file, with its name. # The input is in FastA format, and in general will contain many # sequences. defined $header_line or return (); my ($name) = $header_line =~ /^>\s*(.*)/ or die "Expected a sequence name at line $. of the sequence file\n"; my @parts; while (<SEQUENCE_INPUT>) { last if /^>/; # Header line of next sequence, so stop reading s/\s//g; push @parts, uc $_; } $header_line = $_; # Save header line (or undef) for next time return ($name, join("", @parts)); }