#!/usr/bin/python
import sys
from optparse import OptionParser
import glob, os.path
import namedtuple, tabfile, logging
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import DNAAlphabet

# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = OptionParser("usage: %prog [options] inDir outDir - parse genbank to tables: recordInfo, references, referenceLinks")
parser.add_option("-s", "--startAuthorId", dest="startAuthorId", action="store", type="int", help="first author id (for running on a cluster)", default=0)
parser.add_option("--bigTable", dest="bigTable", action="store_true", help="output only one big table")
parser.add_option("--onlySubmitter", dest="onlySubmitter", action="store_true", help="try to identify the submitter as the last reference (or the second to last, if the last title is 'Direct Submission' and the submitting author's family name is part of the second to last reference's author list)")
parser.add_option("--seqList", dest="seqList", action="store", help="FILTER: read accession ids from file and process only records with an id from this file; only the part of the accession up to the first dot is compared", metavar="FILENAME")
parser.add_option("--taxonList", dest="taxonList", action="store", help="FILTER: read list of organisms from file, process a record only if its organism is in the file", metavar="FILENAME")
parser.add_option("--seqMaxLen", dest="seqMaxLen", action="store", type="int", help="FILTER: maximum length of sequence in record", metavar="SIZE", default=100000000000000)
parser.add_option("-f", "--fasta", dest="fasta", action="store", help="write fasta sequences to this directory, filename: prefix.species.fa", metavar="DIRECTORY")
parser.add_option("--bySpecies", dest="bySpecies", action="store_true", help="split fasta sequences into several files, one file per species")
parser.add_option("-d", "--debug", dest="debug", action="store_true", help="activate debug output")
(options, args) = parser.parse_args()

# ==== FUNCTIONS =====
# RECORDS
RecordInfo = namedtuple.namedtuple("RecordInfo", "gi, accession, description, year, seqLen, organism, taxonId, seqVersion, clone, clone_lib, cell_line, cell_type, dev_stage, tissue_type, organelle, mol_type, isolate")
RefInfo = namedtuple.namedtuple("ReferenceInfo", "pmid, medlineId, authors, title, journal, comment")
FusedRecordInfo = namedtuple.namedtuple("fusedRecordInfo", RecordInfo._fields + RefInfo._fields)

def outputToFile(asBigTable, fileStem, records, refMap, giRefMap):
    if not asBigTable:
        tabfile.writeList(fileStem+".records", records)
        refMap.writeToFile(fileStem+".refs")
        giRefMap.writeToFile(fileStem+".recordToRefs")
    else:
        tabfile.writeList(fileStem+".genbankTable", records)

def fuseRecords(recInfo, refInfos):
    """ create new records, fusing recInfo and all refInfos into new FusedRecordInfos """
    newRecs = []
    for ref in refInfos:
        recFields = list(recInfo)
        recFields.extend(ref)
        fusedRec = FusedRecordInfo(*recFields)
        newRecs.append(fusedRec)
    return newRecs

def getRecordInfo(seq):
    # parse the record itself
    annot = seq.annotations
    #accs = annot["accessions"]
    id = seq.id
    gi = annot["gi"]
    seqLen = len(seq.seq)
    desc = seq.description
    #xrefs = seq.dbxrefs
    date = annot["date"]
    year = date.split("-")[2]   # GenBank dates look like "02-JAN-2001"

    # parse from the source feature (the first one)
    clone, clone_lib, cell_line, cell_type, taxonId, dev_stage, tissue_type = "", "", "", "", "", "", ""
    organelle, mol_type, isolate = "", "", ""
    if seq.features[0].type=="source":
        srcFt = seq.features[0]
        quals = srcFt.qualifiers
        xrefs = quals.get("db_xref", [])
        taxonRefs = [x for x in xrefs if x.startswith("taxon:")]
        if len(taxonRefs)==0:
            logging.debug("%s: no taxon id" % id)
            taxonId = "noTaxonId:"
        elif len(taxonRefs)==1:
            taxonId = taxonRefs[0]
        else:
            logging.info("%s: more than one taxonId" % id)
            taxonId = taxonRefs[0]
        taxonId = taxonId.split(":")[1]

        clone = quals.get("clone",[""])[0]
        clone_lib = quals.get("clone_lib",[""])[0]
        cell_line = quals.get("cell_line",[""])[0]
        cell_type = quals.get("cell_type",[""])[0]
        dev_stage = quals.get("dev_stage",[""])[0]
        tissue_type = quals.get("tissue_type",[""])[0]
        organelle = quals.get("organelle",[""])[0]
        mol_type = quals.get("mol_type",[""])[0]
        isolate = quals.get("isolate",[""])[0]

    data = RecordInfo(str(gi), id, desc, year, str(seqLen), annot["organism"], taxonId, str(annot["sequence_version"]), clone, clone_lib, cell_line, cell_type, dev_stage, tissue_type, organelle, mol_type, isolate)
    #print "\t".join(data)
    return data

class RefMap:
    """ a bag of reference strings, each with a unique integer id; ids start at startRefId """
    def __init__(self, startRefId):
        self.refIds = {}
        self.nextRefId = startRefId

    def getRefId(self, refInfo):
        refString = "\t".join(refInfo)
        if refString not in self.refIds:
            thisRefId = self.nextRefId
            self.refIds[refString] = thisRefId
            self.nextRefId += 1
        else:
            thisRefId = self.refIds[refString]
        return thisRefId

    def writeToFile(self, fname):
        sys.stderr.write("Writing references to %s\n" % fname)
        fh = open(fname, "w")
        for refString, refId in self.refIds.iteritems():
            fh.write(str(refId)+"\t"+str(refString)+"\n")
        fh.close()

class GiRefMap:
    """ links between gis, reference ids and the index of the reference within its record """
    def __init__(self):
        self.giRefMap = []

    def add(self, gi, refId, refIdx):
        self.giRefMap.append( (gi, refId, refIdx) )

    def writeToFile(self, fname):
        sys.stderr.write("Writing record <-> ref links to %s\n" % fname)
        fh = open(fname, "w")
        for gi, refId, index in self.giRefMap:
            fh.write(str(gi)+"\t"+str(refId)+"\t"+str(index)+"\n")
        fh.close()

def getRefInfo(ref):
    refData = RefInfo(ref.pubmed_id, ref.medline_id, ref.authors, ref.title, ref.journal, ref.comment)
    return refData

def findSubmitRef(refs):
    """ try to identify the submitting reference and return it as a tab-separated string:
    if there is more than one reference, check whether the last one is a direct submission.
    In that case, return the second to last reference (joined with the direct submission)
    ONLY if it is a) the only other reference or b) its author list contains the
    family name of the submitter.
    """
    if len(refs)==0:
        sys.stderr.write("NoReferenceFound\n")
        return None
    elif len(refs)==1:
        return "\t".join(refs[0])
    else:
        lastRef = refs[-1]
        if lastRef.title.lower()=="direct submission":
            secondLastRef = refs[-2]
            lastFamName = lastRef.authors.split(",")[0]
            if len(refs)==2 or lastFamName.lower() in secondLastRef.authors.lower():
                # join both references into one tab-separated line
                return "\t".join(secondLastRef)+"\t"+"\t".join(lastRef)
            else:
                return "submitterLastnameNotInSecondPublication"
        else:
            return "\t".join(lastRef)
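# Illustrative example for findSubmitRef (the values below are invented, not taken
# from any real GenBank record): given
#   refs = [RefInfo("123", "", "Smith,J. and Doe,A.", "Some study", "Some journal", ""),
#           RefInfo("", "", "Smith,J.", "Direct Submission", "Submitted (01-JAN-2000) ...", "")]
# the last reference is a direct submission and "Smith" appears in the author list
# of the preceding reference, so both references are returned joined into one
# tab-separated line; if the family name were missing from the preceding reference,
# the marker string "submitterLastnameNotInSecondPublication" would be returned instead.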
""" if len(refs)==0: sys.stderr.write("NoReferenceFound") return None elif len(refs)==1: return "\t".join(refs[0]) else: lastRef = refs[-1] if lastRef.title.lower()=="direct submission": secondLastRef = refs[-2] lastFamName = lastRef.authors.split(",")[0] if len(refs)==2 or lastFamName.lower() in secondLastRef.authors.lower(): return "\t".join(secondLastRef)+"\t".join(lastRef) else: return "submitterLastnameNotInSecondPublication" else: return "\t".join(lastRef) # ----------- MAIN -------------- if args==[]: parser.print_help() exit(1) inDir = args[0] outDir = args[1] startAuthorId = options.startAuthorId onlySubmitter = options.onlySubmitter seqListFilename = options.seqList fastaDir = options.fasta debug = options.debug seqMaxLen = options.seqMaxLen taxonFilename = options.taxonList bySpecies = options.bySpecies asBigTable = options.bigTable if debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) if seqListFilename: logging.info("Reading filter accession IDs") filterIds = set([x.strip() for x in open(seqListFilename).readlines()]) else: filterIds = None if taxonFilename: logging.info("Reading filter filter organism names") taxonNames = set([x.strip() for x in open(taxonFilename).readlines()]) else: taxonNames = None if os.path.isdir(inDir): filenames = glob.glob(os.path.join(inDir, "*.seq")) else: filenames = [inDir] for filename in filenames: # file opening logging.info("Parsing %s\n" % filename) basename = os.path.splitext(os.path.basename(filename))[0] input_handle = open(filename) try: sequences = SeqIO.parse(input_handle, "genbank") except: sys.stderr.write("Parsing error, skipping this file") continue # all our data is stored here refMap = RefMap(startAuthorId) giRefMap = GiRefMap() records = [] submitInfos = [] # only needed for onlySubmitter-mode seqDict = {} # sequences indexed by species for seq in sequences: id = seq.id.split(".")[0] # process filters if filterIds and id not in filterIds: logging.debug("skipping %s, not in id file" % id) continue if taxonNames and seq.annotations["organism"] not in taxonNames: logging.debug("skipping %s, not in taxon file" % id) continue if len(seq.seq) > seqMaxLen: logging.debug("skipping %s, too long" % id) continue # save record info recInfo = getRecordInfo(seq) records.append(recInfo) # add reference information to the two collection classes refIdx = 0 refs = [] for ref in seq.annotations["references"]: refIdx+=1 refInfo = getRefInfo(ref) if onlySubmitter: refs.append(refInfo) else: refId = refMap.getRefId(refInfo) giRefMap.add(recInfo.gi, refId, refIdx) if onlySubmitter: submitRef = findSubmitRef(refs) submitInfos.append( [recInfo.accession, submitRef] ) # HACK: for bigTableOutputFormat: remove record, fuse with refs, add back # to records if asBigTable: recInfo = records.pop() bigRecords = fuseRecords(recInfo, refs) records.extend(bigRecords) if fastaDir: onlySeq = SeqRecord(Seq(str(seq.seq), DNAAlphabet), id=seq.id, description="") seqDict.setdefault(recInfo.organism, []).append(onlySeq) # output to file fileStem = os.path.join(outDir, basename) if onlySubmitter: sys.stderr.write("Writing submitter info\n") tabfile.writeList(fileStem+".submitterInfo", submitInfos) else: if len(records)!=0: outputToFile(asBigTable, fileStem, records, refMap, giRefMap) if fastaDir: if bySpecies: for species, seqs in seqDict.iteritems(): species = species.replace(" ","_") species = species.replace("/","_") filename = os.path.join(fastaDir,basename+"."+species+".fa") print("Writing fasta %s" % filename) fastaFh = 
open(filename, "w") SeqIO.write(seqs, fastaFh, "fasta") else: seqs = [] for species, specSeqs in seqDict.iteritems(): seqs.extend(specSeqs) filename = os.path.join(fastaDir,basename+".fa") print("Writing fasta %s" % filename) fastaFh = open(filename, "w") SeqIO.write(seqs, fastaFh, "fasta")