#!/usr/bin/perl -w
use strict;

################################ Variables ###############################

my @orfs;     # location, name, and direction of ORFs
              # Each element is a triple [$left, $right, $info]
              # where $left and $right are the starting and ending points
              # of the ORF, and info gives the name, discription, and
              # direction of the ORF.

my @sites;    # location of putative binding sites
              # Each element is a pair [$left, $right] giving the starting
              # and ending points of the possible binding site.

############################# Main Program ###############################

read_orfs("7120DB.DAT");
#print_orfs();

read_sites("motif.hits");
@sites = sort { $$a[0] <=> $$b[0] } @sites;
#print_sites();

print_context("site.context");

############################# Input Subroutines ##########################


sub read_orfs {
   my ($orf_file) = @_;
   my $record_length = 230;
   open ORF_DATA, "<$orf_file" or die "Can't open $orf_file: $!\n"; 
   binmode ORF_DATA;  # Tell Perl this isn't a text file
   my $buffer;
   my $packing = "x4A11" # orf_name
              .  "x8"    # orf_contig
              .  "x1a8"  # orf_left
              .  "x1a8"  # orf_right
              .  "x4A1"  # orf_direction
              .  "x18"   # orf_accession
              .  "x9"    # orf_pct
              .  "x9"    # orf_eval
              .  "x4A50" # orf_descr
              ;

   read ORF_DATA, $buffer, 5;  # Skip first five bytes of the file.

   my $last_left = -1e100;
   while (read ORF_DATA, $buffer, $record_length) {
      length($buffer) == $record_length or die "Short record\n";
      my ($orf_name, $orf_left, $orf_right, $orf_direction, $orf_descr)
         = unpack($packing, $buffer);
      foreach my $coordinate ($orf_left, $orf_right) {
         $coordinate = unpack("d", reverse $coordinate);
      }
      last if $last_left > $orf_left;
      $last_left = $orf_left;
      if ($orf_direction eq "d") { $orf_direction = "-->" }
      elsif ($orf_direction eq "c") { $orf_direction = "<--" }
      else { $orf_direction = "???" }
      push @orfs,
         [$orf_left, $orf_right, "$orf_direction\t$orf_name\t$orf_descr"];
   }
   close ORF_DATA;
}

sub print_orfs {
   foreach my $orf (@orfs) {
      print "@$orf\n";
   }
}

sub read_sites {
   my ($site_file) = @_;
   open BIND, "<$site_file" or die "Can't open $site_file: $!\n"; 
   while (<BIND>) {
      my ($start, $sequence) = /^\s*\S+\s+\S+\s+(\d+)\s+(\S+)\s*$/
         or die "Can't decode line $. of $site_file\n";
      my $end = $start + length($sequence) - 1;
      push @sites, [$start, $end];
   }
   close BIND;
}

sub print_sites {
   foreach my $site (@sites) {
      print "@$site\n";
   }
}

########################### Printing Subroutines #########################

sub print_context {
   my ($context_file) = @_;
   open CONTEXT, ">$context_file" or die "Can't open $context_file:$!\n";
   foreach my $site (@sites) {
      my $o = 0;
      while ($o < @orfs and $orfs[$o][1] < $$site[0]) {
         # Site is to the right of $orfs[$o]
         $o = $o + 1;
      }
      if ($o == @orfs) { # site is to the right of every ORF
         die "Can't yet print context without right-hand ORF\n";
      }
      elsif ($$site[1] < $orfs[$o][0]) { # Site is to the left of $orfs[$o]
         if ($o == 0) { die "Can't yet print context without left-hand ORF\n" }
         else { print_pair($site, $orfs[$o-1], $orfs[$o]) }
      }
      else { # $site overlaps $orfs[$o]
         while ($o < @orfs and $$site[1] >= $orfs[$o][0]) {
            print_overlap($site, $orfs[$o]);
            $o = $o + 1;
         }
      }
   }
   close CONTEXT;
}

sub print_pair {
   my ($site, $left, $right) = @_;
   my ($left_info, $right_info) = ($$left[2], $$right[2]);
   print_line($left_info, $$site[0] - $$left[1],
              $site,
              $$right[0] - $$site[1], $right_info);
}

sub print_overlap {
   my ($site, $orf) = @_;
   my $info = $$orf[2];
   print_line($info, $$site[0] - $$orf[0],
              $site,
              $$orf[1] - $$site[1], "", "");
}

sub print_line {
   my ($a, $b, $site, $c, $d) = @_;
   my $id = "$$site[0]-$$site[1]";
   print CONTEXT "$a\t$b\t$id\t$c\t$d\n";
}