BedBlastLift
The big guru has spoken some time ago and he said that tblastx is a very sensitive alignment tool, more than blastn or blat. You cannot run it on the whole genome, but you might give it a try on smaller sequences. However, blastToPsl cannot parse tblastx-output, as tblastx does not return real alignments (not sure here, but there seem to exist variable sized gaps "*" in the output), and without psl, not chain, no chain -> no liftover.
We can still simply take everything that overlaps an alignable region and map it to the WHOLE alignable region on the second sequence so as to give at least a hint where it should be roughly located. This is what the following scripts does. It needs the bed file from sequence1 and the tblastx output with option "-m 9" (tabular format), with the aligning of sequence1 on sequence2.
#!/usr/bin/python from sys import * from optparse import OptionParser # === COMMAND LINE INTERFACE, OPTIONS AND HELP === parser = OptionParser("%prog [options] bedfile blastfile: Use a tblastx-tabular-file to lift annotations between sequences (approximatively), very slow") #parser.add_option("-l", "--wordlength", dest="motiflength", action="store", help="length of word [default: %default, hexamers]", type="int", metavar="NUMBER", default="6") #parser.add_option("-m", "--maf", dest="maf", action="store_true", help="force maf format [default: %default]", default=True) #parser.add_option("-f", "--fasta", dest="maf", action="store_false", help="force fasta format (if not specified, we assume UCSC .maf file format") (options, args) = parser.parse_args() # ==== FUNCTIONs ===== class hit: """ a hit from a blast tabular output file """ def __repr__(self): return self.line def parseBlast(file): hits = [] for l in file: if l.startswith("#"): continue fs = l.split() if len(fs)!=12: stderr.write("error: blast output does not have 12 fields. Make sure that you ran blast with -m 9 or -D T (for bl2seq)\n") exit(1) h = hit() (h.qName, h.sName, h.percId, h.alnLen, h.mismatches, h.gapOpen, h.qStart, h.qEnd, h.sStart, h.sEnd, h.eVal, h.alnScore) = fs h.sStart = int(h.sStart) h.sEnd = int(h.sEnd) h.qStart = int(h.qStart) h.qEnd = int(h.qEnd) if h.qEnd < h.qStart: (h.qStart, h.qEnd) = (h.qEnd, h.qStart) if h.sEnd < h.sStart: (h.sStart, h.sEnd) = (h.sEnd, h.sStart) h.line = l.strip() hits.append(h) return hits class bed: """ a feature from a bed file """ def overlaps(self, hit): """ returns number of overlapping bp if two Features overlap or false if no overlap """ if (( hit.sStart <= self.start and hit.sEnd > self.start) or \ (hit.sStart < self.end and hit.sEnd >= self.end) or (hit.sStart > self.start and hit.sStart < self.end)): return (min (hit.sEnd, self.end) - max (hit.sStart, self.start)) else: return 0 def __repr__(self): return self.line def parseBed(file): beds = [] for l in file: l = l.strip() b = bed() if l.startswith("#"): continue fs = l.split() if not len(fs) >= 4: stderr.write("error: bed features need names.\n") exit(1) (b.chrom, b.start, b.end, b.name) = fs[0:4] b.start = int(b.start) b.end = int(b.end) b.line = l beds.append(b) return beds # ----------- MAIN -------------- if args==[]: parser.print_help() exit(1) stderr.write("Reading blast output...\n") f = open(args[0], "r") hits = parseBlast(f)stderr.write("Searching for overlaps bed-blast...\n") for b in beds: overlaps = [] for h in hits: length = b.overlaps(h) if length > 0: h.overlapLen=length overlaps.append(h) # choose highest scoring overlap if len(overlaps)==0: continue overlaps.sort(key=lambda x: x.alnScore, reverse=True) #for o in overlaps: #print o best = overlaps[0] print best.qName, best.qStart, best.qEnd, b.name+"_ints"+str(best.overlapLen) stderr.write("Reading bedfile ...\n") f = open(args[1], "r") beds = parseBed(f) h.qStart = int(h.qStart) h.qEnd = int(h.qEnd) if h.qEnd < h.qStart: (h.qStart, h.qEnd) = (h.qEnd, h.qStart) if h.sEnd < h.sStart: (h.sStart, h.sEnd) = (h.sEnd, h.sStart) h.line = l.strip() hits.append(h) return hits