Field |
Bytes |
Length |
type |
OrfName |
4 + 11 |
15 |
Char |
OrfContig |
4 + 5 |
9 |
Char |
OrfLeft |
1 + 8 |
9 |
Num |
OrfRight |
1 + 8 |
9 |
Num |
OrfDirection |
4 + 1 |
5 |
Char |
OrfAccession |
4 + 14 |
18 |
Char |
OrfPct |
4 + 5 |
9 |
Char |
OrfEval |
1 + 8 |
9 |
Num |
OrfDescr |
4 + 50 |
54 |
Char |
TOTAL |
|
230 |
|
|
|
First Code |
|
Second Variable |
Second Code |
OrfName |
$name_length | V | $orf_name |
a11 |
|
OrfContig |
(skip) |
x4 | (skip) |
x5 |
|
OrfLeft |
(skip) | x1 |
$orf_left |
d |
|
OrfRight |
(skip) |
x1 | $orf_right |
d |
|
OrfDirection |
(skip) |
x4 | $orf_direction |
a1 |
|
OrfAccession |
(skip) |
x4 | (skip) |
x14 |
|
OrfPct |
(skip) |
x4 | (skip) |
x5 |
|
OrfEval |
(skip) |
x1 | (skip) |
x8 |
|
OrfDescr |
$descr_length |
V | $orf_descr |
a50 |
#!/usr/bin/perl -w
Two new things are the use of the binmode command, which lets Perl know that we intend to read binary data from the file, and the command read ORF_DATA, $buffer, $record_length. Binary files are usually organized into records, each record having exactly the same length. (By contrast, a text file may have lines of many different lengths.) The read command moves exactly $record_length bytes from the ORF_DATA file into $buffer (except at the very end of the file, if there are fewer than $record_length bytes left).
use strict;
dump_data("7120DB.DAT");
# Read a binary ORF data file one fixed-length record at a time.
#
# Parameters:
#   $orf_file - path of the data file to read.
#
# Dies with "Can't open <file>: <reason>" if the file cannot be opened.
sub dump_data {
    my ($orf_file) = @_;
    my $record_length = 230;
    # Three-argument open keeps the mode separate from the file name, so a
    # name with leading/trailing whitespace or mode characters ('<', '>',
    # '|') is opened literally instead of being parsed as part of the mode.
    open(ORF_DATA, '<', $orf_file) or die "Can't open $orf_file: $!\n";
    binmode ORF_DATA; # Tell Perl this isn't a text file
    my $buffer;
    # read returns the number of bytes actually read: 0 at end of file and
    # undef on error, either of which ends the loop. The final record may
    # hold fewer than $record_length bytes.
    while (read ORF_DATA, $buffer, $record_length) {
        # do something...
    }
    close ORF_DATA;
}
while (read ORF_DATA, $buffer, $record_length) {
Unfortunately, when we run this the result is pretty forbidding:
my ($name_length, $orf_name, $orf_left, $orf_right,
$orf_direction, $descr_length, $orf_descr)
= unpack("Va11x4x5x1dx1dx4a1x4x14x4x5x1x8Va50", $buffer);
print "$name_length, $orf_name, $orf_left, $orf_right,
$orf_direction, $descr_length, $orf_descr\n";
}
22740, æS^@^@^Kall000, 1.46429023063736e-306, 2.35649294533797e-305,
Some of the characters in the strings are unprintable. (You may see slightly different output, depending on how your computer tries to print the unprintable.) The integers used for string length seem too big; the floating point numbers are tiny fractions.
S, 4294967295, S^@^@2unknown protein
4294967295, ES^@^@^Kall000, 1.74842164944403e-305, 1.40140740775937e-304,
S, 4294967295, S^@^@2unknown protein
4294967295, ES^@^@^Kasl000, 1.65915802715193e-306, 3.0355575411186e-304,
S, 4294967295, S^@^@2unknown protein
4294967295, ES^@^@^Karl550, 5.92969249363755e-307, 6.82232954380406e-307,
S, 4294967295, S^@^@2ssrA: 10Sa RNA
4294967295, ES^@^@^Kall000, 2.94279990811978e-305, 1.90416083759353e-308,
S, 4294967295, S^@^@2AtpC: ATP synthase subunit gamma
...
3562536960, æS^@^@^Kall000, 1.46429023063736e-306, 2.35649294533797e-305,
We need to back off a little bit. Our format instructions don't seem to be working, so we need something that will give us a more neutral view of the file. Here's another snippet of code to replace # do something...:
S, 4294967295, S^@^@2unknown protein
4294967295, ES^@^@^Kall000, 1.74842164944403e-305, 1.40140740775937e-304,
S, 4294967295, S^@^@2unknown protein
4294967295, ES^@^@^Kasl000, 1.65915802715193e-306, 3.0355575411186e-304,
S, 4294967295, S^@^@2unknown protein
4294967295, ES^@^@^Karl550, 5.92969249363755e-307, 6.82232954380406e-307,
S, 4294967295, S^@^@2ssrA: 10Sa RNA
4294967295, ES^@^@^Kall000, 2.94279990811978e-305, 1.90416083759353e-308,
S, 4294967295, S^@^@2AtpC: ATP synthase subunit gamma
...
foreach my $i (0 .. length($buffer) - 1) {
This prints the most common printable characters (blanks, letters, numbers, and periods) as themselves. We print a pound sign (#) in place of the unprintable characters.
my $c = substr($buffer, $i ,1);
my $d;
if ($c =~ /[ a-zA-Z0-9.]/) { $d = $c }
else { $d = "#" }
substr($buffer, $i, 1) = $d;
}
print "$buffer\n";
#X###S###all0001 S###C N#sp#####N########S###cS###sp|Q06852|SLP1S### 50 N#
Still pretty ugly, eh? But there are some regularities. We definitely see some of the strings -- name and description stand out, among others.
#######S##2unknown protein S###PM# #S###NPun64
7.032N##3#####N##7 ####N1#####BCS### N########N########N####