#!/usr/bin/env perl use strict; use warnings; use File::Basename; # index is # of file processed # contents are 4 csv values from file for A C G T scores my @matA; my @matC; my @matG; my @matT; my $fileCount = 0; my $dirPath=`pwd`; chomp $dirPath; my $thisDir = basename($dirPath); open (FH, "ls *.txt|") or die "can not ls *.txt"; while (my $file = ) { chomp $file; my $lineCount = 0; open (TX, "egrep '^A |^C |^G |^T ' $file|") or die "can not read $file"; while (my $line = ) { chomp $line; my ($l, $A, $C, $G, $T) = split('\s+', $line); if ($l eq 'A') { $matA[$fileCount] = sprintf("%s,%s,%s,%s", $A, $C, $G, $T); } elsif ($l eq 'C') { $matC[$fileCount] = sprintf("%s,%s,%s,%s", $A, $C, $G, $T); } elsif ($l eq 'G') { $matG[$fileCount] = sprintf("%s,%s,%s,%s", $A, $C, $G, $T); } elsif ($l eq 'T') { $matT[$fileCount] = sprintf("%s,%s,%s,%s", $A, $C, $G, $T); } else { die "ERROR: what is this line from $file: '$line'"; } $lineCount += 1; } close (TX); next if ($lineCount == 0); die "did not scan four($lineCount) lines in $file" if (4 != $lineCount); $fileCount += 1; } close (FH); printf STDERR "# read %d .txt files\t%s\n", $fileCount, $thisDir; die "did not parse any files ?" if ($fileCount < 1); # index is 0 1 2 3 for the sum of the values for A C G T in that row my @aveA; my @aveC; my @aveG; my @aveT; my @minA; my @minC; my @minG; my @minT; my @maxA; my @maxC; my @maxG; my @maxT; for (my $i = 0; $i < 4; ++$i) { $minA[$i] = 1000000; $maxA[$i] = -1000000; $minC[$i] = 1000000; $maxC[$i] = -1000000; $minG[$i] = 1000000; $maxG[$i] = -1000000; $minT[$i] = 1000000; $maxT[$i] = -1000000; } for (my $i = 0; $i < $fileCount; ++$i) { my ($A, $C, $G, $T) = split(',', $matA[$i]); $aveA[0] += $A; $aveA[1] += $C; $aveA[2] += $G; $aveA[3] += $T; $minA[0] = $A if ($A < $minA[0]); $minA[1] = $C if ($C < $minA[1]); $minA[2] = $G if ($G < $minA[2]); $minA[3] = $T if ($T < $minA[3]); $maxA[0] = $A if ($A > $maxA[0]); $maxA[1] = $C if ($C > $maxA[1]); $maxA[2] = $G if ($G > $maxA[2]); $maxA[3] = $T if ($T > $maxA[3]); ($A, $C, $G, $T) = split(',', $matC[$i]); $aveC[0] += $A; $aveC[1] += $C; $aveC[2] += $G; $aveC[3] += $T; $minC[0] = $A if ($A < $minC[0]); $minC[1] = $C if ($C < $minC[1]); $minC[2] = $G if ($G < $minC[2]); $minC[3] = $T if ($T < $minC[3]); $maxC[0] = $A if ($A > $maxC[0]); $maxC[1] = $C if ($C > $maxC[1]); $maxC[2] = $G if ($G > $maxC[2]); $maxC[3] = $T if ($T > $maxC[3]); ($A, $C, $G, $T) = split(',', $matG[$i]); $aveG[0] += $A; $aveG[1] += $C; $aveG[2] += $G; $aveG[3] += $T; $minG[0] = $A if ($A < $minG[0]); $minG[1] = $C if ($C < $minG[1]); $minG[2] = $G if ($G < $minG[2]); $minG[3] = $T if ($T < $minG[3]); $maxG[0] = $A if ($A > $maxG[0]); $maxG[1] = $C if ($C > $maxG[1]); $maxG[2] = $G if ($G > $maxG[2]); $maxG[3] = $T if ($T > $maxG[3]); ($A, $C, $G, $T) = split(',', $matT[$i]); $aveT[0] += $A; $aveT[1] += $C; $aveT[2] += $G; $aveT[3] += $T; $minT[0] = $A if ($A < $minT[0]); $minT[1] = $C if ($C < $minT[1]); $minT[2] = $G if ($G < $minT[2]); $minT[3] = $T if ($T < $minT[3]); $maxT[0] = $A if ($A > $maxT[0]); $maxT[1] = $C if ($C > $maxT[1]); $maxT[2] = $G if ($G > $maxT[2]); $maxT[3] = $T if ($T > $maxT[3]); } for (my $i=0; $i < 4; ++$i) { $aveA[$i] = $aveA[$i]/$fileCount; if ($aveA[$i] < 0) {$aveA[$i] -= 0.5;} else {$aveA[$i] += 0.5;} $aveC[$i] = ($aveC[$i]/$fileCount); if ($aveC[$i] < 0) {$aveC[$i] -= 0.5;} else {$aveC[$i] += 0.5;} $aveG[$i] = ($aveG[$i]/$fileCount); if ($aveG[$i] < 0) {$aveG[$i] -= 0.5;} else {$aveG[$i] += 0.5;} $aveT[$i] = ($aveT[$i]/$fileCount); if ($aveT[$i] < 0) {$aveT[$i] -= 0.5;} else {$aveT[$i] += 0.5;} } printf "%7s%6s%6s%6s\taverages\t%d files\t%s\n", "A", "C", "G", "T", $fileCount, $thisDir; printf "A%6d%6d%6d%6d\n", $aveA[0], $aveA[1], $aveA[2], $aveA[3]; printf "C%6d%6d%6d%6d\n", $aveC[0], $aveC[1], $aveC[2], $aveC[3]; printf "G%6d%6d%6d%6d\n", $aveG[0], $aveG[1], $aveG[2], $aveG[3]; printf "T%6d%6d%6d%6d\n", $aveT[0], $aveT[1], $aveT[2], $aveT[3]; printf "%7s%6s%6s%6s\tranges\t%d files\t%s\n", "A", "C", "G", "T", $fileCount, $thisDir; printf "A"; for (my $i=0; $i < 4; ++$i) { printf "%6d", $maxA[$i] - $minA[$i]; } printf "\nC"; for (my $i=0; $i < 4; ++$i) { printf "%6d", $maxC[$i] - $minC[$i]; } printf "\nG"; for (my $i=0; $i < 4; ++$i) { printf "%6d", $maxG[$i] - $minG[$i]; } printf "\nT"; for (my $i=0; $i < 4; ++$i) { printf "%6d", $maxT[$i] - $minT[$i]; } printf "\n"; my $delta = 0; printf "%7s%6s%6s%6s\tranges percent\t%d files\t%s\n", "A", "C", "G", "T", $fileCount, $thisDir; printf "A"; for (my $i=0; $i < 4; ++$i) { if ($aveA[$i] < 0) { $delta = -100*($maxA[$i] - $minA[$i])/$aveA[$i]; } else { $delta = 100*($maxA[$i] - $minA[$i])/$aveA[$i]; } printf "%6.1f", $delta } printf "\nC"; for (my $i=0; $i < 4; ++$i) { if ($aveC[$i] < 0) { $delta = -100*($maxC[$i] - $minC[$i])/$aveC[$i]; } else { $delta = 100*($maxC[$i] - $minC[$i])/$aveC[$i]; } printf "%6.1f", $delta } printf "\nG"; for (my $i=0; $i < 4; ++$i) { if ($aveG[$i] < 0) { $delta = -100*($maxG[$i] - $minG[$i])/$aveG[$i]; } else { $delta = 100*($maxG[$i] - $minG[$i])/$aveC[$i]; } printf "%6.1f", $delta } printf "\nT"; for (my $i=0; $i < 4; ++$i) { if ($aveT[$i] < 0) { $delta = -100*($maxT[$i] - $minT[$i])/$aveT[$i]; } else { $delta = 100*($maxT[$i] - $minT[$i])/$aveT[$i]; } printf "%6.1f", $delta } printf "\n"; __END__ A C G T A 83 -217 -131 -233 C -217 100 -191 -131 G -131 -191 100 -217 T -233 -131 -217 83