BedBlastLift

From genomewiki
Revision as of 23:24, 9 September 2006 by Max (talk | contribs)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigationJump to search

The big guru has spoken some time ago and he said that tblastx is a very sensitive alignment tool, more than blastn or blat. You cannot run it on the whole genome, but your might give it a try on smaller sequences. However, blastToPsl cannot parse tblastx-output, as tblastx does not return real alignments (not sure here, but there seem to exist variable sized gaps "*" in the output), and without psl, not chain, no chain -> no liftover.

We can still simply take everything that overlaps an alignable region and map it to the WHOLE alignable region on the second sequence so as to give at least a hint where it should be roughly located. This is what the following scripts does. It needs the bed file from sequence1 and the tblastx output with option "-m 9" (tabular format) as input.

#!/usr/bin/python

from sys import *
from optparse import OptionParser

# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = OptionParser("%prog [options] bedfile blastfile: Use a tblastx-tabular-file to lift annotations between sequences (approximatively), very slow") 

#parser.add_option("-l", "--wordlength", dest="motiflength", action="store", help="length of word [default: %default, hexamers]", type="int", metavar="NUMBER", default="6") 
#parser.add_option("-m", "--maf", dest="maf", action="store_true", help="force maf format [default: %default]", default=True) 
#parser.add_option("-f", "--fasta", dest="maf", action="store_false", help="force fasta format (if not specified, we assume UCSC .maf file format") 
(options, args) = parser.parse_args()

# ==== FUNCTIONs =====
class hit:
    """ a hit from a blast tabular output file """
    def __repr__(self):
        return self.line

def parseBlast(file):
    hits = []
    for l in file:
        if l.startswith("#"):
            continue
        fs = l.split()
        if len(fs)!=12:
            stderr.write("error: blast output does not have 12 fields. Make sure that you ran blast with -m 9 or -D T (for bl2seq)\n")
            exit(1)
        h = hit()
        (h.qName, h.sName, h.percId, h.alnLen, h.mismatches, h.gapOpen, h.qStart, h.qEnd, h.sStart, h.sEnd, h.eVal, h.alnScore) = fs 
        h.sStart = int(h.sStart)
        h.sEnd = int(h.sEnd)
        h.qStart = int(h.qStart)
        h.qEnd = int(h.qEnd)
        if h.qEnd < h.qStart:
            (h.qStart, h.qEnd) = (h.qEnd, h.qStart)
        if h.sEnd < h.sStart:
            (h.sStart, h.sEnd) = (h.sEnd, h.sStart)
        h.line = l.strip()
        hits.append(h)
    return hits

class bed:
    """ a feature from a bed file """
    def overlaps(self, hit):
        """ returns number of overlapping bp if two Features overlap or 
            false if no overlap """
        if (( hit.sStart <= self.start and hit.sEnd > self.start) or \
            (hit.sStart < self.end and hit.sEnd >= self.end) or 
            (hit.sStart > self.start and hit.sStart < self.end)):
            return (min (hit.sEnd, self.end) - max (hit.sStart, self.start))
        else:
            return 0 
    def __repr__(self):
        return self.line

def parseBed(file):
    beds = []
    for l in file:
        l = l.strip()
        b = bed()
        if l.startswith("#"):
            continue
        fs = l.split()
        if not len(fs) >= 4:
            stderr.write("error: bed features need names.\n")
            exit(1)
        (b.chrom, b.start, b.end, b.name) = fs[0:4]
        b.start = int(b.start)
        b.end = int(b.end)
        b.line = l
        beds.append(b)
    return beds
        
# ----------- MAIN --------------

if args==[]: 
    parser.print_help()
    exit(1)

stderr.write("Reading blast output...\n")
f = open(args[0], "r")
hits = parseBlast(f)stderr.write("Searching for overlaps bed-blast...\n")
for b in beds:
    overlaps = []
    for h in hits:
        length = b.overlaps(h)
        if length > 0:
            h.overlapLen=length
            overlaps.append(h)
    # choose highest scoring overlap
    if len(overlaps)==0:
        continue
    overlaps.sort(key=lambda x: x.alnScore, reverse=True)
    #for o in overlaps:
        #print o
    best = overlaps[0]
    print best.qName, best.qStart, best.qEnd, b.name+"_ints"+str(best.overlapLen)


stderr.write("Reading bedfile ...\n")
f = open(args[1], "r")
beds = parseBed(f)


        h.qStart = int(h.qStart)
        h.qEnd = int(h.qEnd)
        if h.qEnd < h.qStart:
            (h.qStart, h.qEnd) = (h.qEnd, h.qStart)
        if h.sEnd < h.sStart:
            (h.sStart, h.sEnd) = (h.sEnd, h.sStart)
        h.line = l.strip()
        hits.append(h)
    return hits