BedBlastLift
The big guru has spoken some time ago and he said that tblastx is a very sensitive alignment tool, more than blastn or blat. You cannot run it on the whole genome, but you might give it a try on smaller sequences. However, blastToPsl cannot parse tblastx-output, as tblastx does not return real alignments (not sure here, but there seem to exist variable sized gaps "*" in the output), and without psl, not chain, no chain -> no liftover.
We can still simply take everything that overlaps an alignable region and map it to the WHOLE alignable region on the second sequence so as to give at least a hint where it should be roughly located. This is what the following scripts does. It needs the bed file from sequence1 and the tblastx output with option "-m 9" (tabular format), with the aligning of sequence1 on sequence2. It will then project the bed file from sequence1 to the BEST hit from sequence2 and annotate the length of the intersection between alignable region and original annotation on sequence1.
#!/usr/bin/python from sys import * from optparse import OptionParser # === COMMAND LINE INTERFACE, OPTIONS AND HELP === parser = OptionParser("%prog [options] bedfile blastfile: Use a tblastx-tabular-file to lift annotations between sequences (approximatively), very slow") #parser.add_option("-l", "--wordlength", dest="motiflength", action="store", help="length of word [default: %default, hexamers]", type="int", metavar="NUMBER", default="6") #parser.add_option("-m", "--maf", dest="maf", action="store_true", help="force maf format [default: %default]", default=True) #parser.add_option("-f", "--fasta", dest="maf", action="store_false", help="force fasta format (if not specified, we assume UCSC .maf file format") (options, args) = parser.parse_args() # ==== FUNCTIONs ===== class hit: """ a hit from a blast tabular output file """ def __repr__(self): return self.line def parseBlast(file): hits = [] for l in file: if l.startswith("#"): continue fs = l.split() if len(fs)!=12: stderr.write("error: blast output does not have 12 fields. Make sure that you ran blast with -m 9 or -D T (for bl2seq)\n") exit(1) h = hit() (h.qName, h.sName, h.percId, h.alnLen, h.mismatches, h.gapOpen, h.qStart, h.qEnd, h.sStart, h.sEnd, h.eVal, h.alnScore) = fs h.sStart = int(h.sStart) h.sEnd = int(h.sEnd) h.qStart = int(h.qStart) h.qEnd = int(h.qEnd) if h.qEnd < h.qStart: (h.qStart, h.qEnd) = (h.qEnd, h.qStart) if h.sEnd < h.sStart: (h.sStart, h.sEnd) = (h.sEnd, h.sStart) h.line = l.strip() hits.append(h) return hits class bed: """ a feature from a bed file """ def overlaps(self, hit): """ returns number of overlapping bp if two Features overlap or false if no overlap """ if (( hit.sStart <= self.start and hit.sEnd > self.start) or \ (hit.sStart < self.end and hit.sEnd >= self.end) or (hit.sStart > self.start and hit.sStart < self.end)): return (min (hit.sEnd, self.end) - max (hit.sStart, self.start)) else: return 0 def __repr__(self): return self.line def parseBed(file): beds = [] for l in file: l = l.strip() b = bed() if l.startswith("#"): continue fs = l.split() if not len(fs) >= 4: stderr.write("error: bed features need names.\n") exit(1) (b.chrom, b.start, b.end, b.name) = fs[0:4] b.start = int(b.start) b.end = int(b.end) b.line = l beds.append(b) return beds # ----------- MAIN -------------- if args==[]: parser.print_help() exit(1) stderr.write("Reading blast output...\n") f = open(args[0], "r") hits = parseBlast(f) stderr.write("Reading bedfile ...\n") f = open(args[1], "r") beds = parseBed(f) stderr.write("Searching for overlaps bed-blast...\n") for b in beds: overlaps = [] for h in hits: length = b.overlaps(h) if length > 0: h.overlapLen=length overlaps.append(h) # choose highest scoring overlap if len(overlaps)==0: continue overlaps.sort(key=lambda x: x.alnScore, reverse=True) #for o in overlaps: #print o best = overlaps[0] print best.qName, best.qStart, best.qEnd, b.name+"_ints"+str(best.overlapLen)