#!/bin/bash # # Same Species Lift Over procedure, the first step: run.blat procedure # query and target sequences are chunked up into smaller pieces # each query chunk is run with blat against each target chunk # the result of this is a number of psl output files which are # chained together in the run.chain second procedure. # ASSUMPTION HERE ! - this is for simple small genome sequences, # small as in several 10's of million bases. Larger genome sequences # require a cluster computer to run 10's of thousands of blat jobs. # and some different parameters on the chunking operation, not # listed here. # Buyer Beware - this was written without being tested, it may have errors. # The general theme and procedure is correct, some specifics may be off. # exit on any error set -beEu -o pipefail # kent source programs required: # twoBitInfo twoBitToFa partitionSequence.pl gensub2 blat calc # use 2bit files for your target and query sequences: # e.g.: ${targetDb}.2bit and ${queryDb}.2bit # 2bit files are the most convenient format for this type of work. ####################################################################### # First part of the procedure is a blat run between target and query chunks # decide on a work-directory where this will take place: export workDirectory="/some/path/to/work/area" # two directories will be created here where work-steps take place: mkdir -p ${workDirectory}/run.blat ${workDirectory}/run.chain # the resulting liftOver chain file will end up in ${workDirectory}/ # location where your two .2bit files are located: export twoBitDirectory="/some/path/to/two/bit/files" ####################################################################### # targetDb is the sequence you have good coordinates you want to convert export targetDb="myTargetSequence" # typical chunk size for a target sequence is 10,000,000 bases export targetChunkSize="10000000" # no need to overlap chunks, either target or query: export chunkOverlapSize="0" # allow only a single sequence to exist in a single chunk group export chunkCountLimit="1" # queryDb is the sequence you want to convert coordinates from the target # to the query export queryDb="myQuerySequence" # typical chunk size for a query sequence is also 10,000,000 bases export queryChunkSize="10000000" ####################################################################### # Compute partitions (coordinate ranges) for cluster jobs. # working in the run.blat directory cd ${workDirectory}/run.blat # directory tParts will be created, make sure it is clean rm -fr tParts twoBitInfo ${twoBitDirectory}/${targetDb}.2bit stdout | sort -k2nr > ${targetDb}.chrom.sizes # check the usage message for this command in the kent source tree # to see what all the arguments mean: $HOME/kent/src/hg/utils/automation/partitionSequence.pl ${targetChunkSize} \ ${chunkOverlapSize} ${twoBitDirectory}/${targetDb}.2bit ${targetDb}.chrom.sizes \ ${chunkCountLimit} \ -lstDir=tParts > t.lst # output t.lst is the listing of the target chunks to work with # same procedure for the query bits: rm -fr qParts twoBitInfo ${twoBitDirectory}/${queryDb}.2bit stdout | sort -k2nr > ${queryDb}.chrom.sizes # check the usage message for this command in the kent source tree # to see what all the arguments mean: $HOME/kent/src/hg/utils/automation/partitionSequence.pl ${queryChunkSize} \ ${chunkOverlapSize} ${twoBitDirectory}/${queryDb}.2bit ${queryDb}.chrom.sizes \ ${chunkCountLimit} \ -lstDir=qParts > q.lst # output q.lst is the listing of the query chunks to work with ####################################################################### # Construct the 11.ooc file for the target sequence, assuming cd ${workDirectory}/run.blat # the printout of the faSize is a line something like: # 23332831 bases (10 N's 23332821 real 18400480 upper 4932341 lower) in 16 sequences in 1 files # use the fifth field there, the 23332821 "real" number of bases: export realSize=`twoBitToFa ${twoBitDirectory}/${targetDb}.2bit stdout | faSize stdin | grep real | awk '{print $5}'` # calculate repMatch based on a proportion of the hg19 genoe size export repMatch=`calc \( ${realSize} / 2897316137 \) \* 1024` # That gives a really small number for small genomes, ignore that result # and simply use a repMatch of 100: blat ${twoBitDirectory}/${targetDb}.2bit /dev/null \ /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=100 ####################################################################### # Construct the result output PSL directories cd ${workDirectory}/run.blat mkdir ${workDirectory}/run.blat/psl for F in `cat t.lst` do B=`basename ${F} mkdir ${workDirectory}/run.blat/psl/${B} done ####################################################################### # construct gensub2 template file # ( allow only ${workDirectory} to shell expand here, the other $ variables # are for gensub2 use, hence \$ to keep them literal ) # the "blatJob.csh" script can be found at: cat << EOF > template #LOOP ./blatJob.csh \$(path1) \$(path2) ${workDirectory}/run.blat/psl/\$(file1)/\$(file2).psl #ENDLOOP EOF ## construct jobList for each query chunk blat to each target chunk: gensub2 t.lst q.lst template jobList # the resulting jobList is a listing of commands to be run which will perform # the blat on each specified target/query chunk pair of sequences # With the blatJob.csh in place in this run.blat directory, you can simply # run each job in the jobList: # chmod +x ./jobList # ./jobList > jobs.log 2>&1 #######################################################################